### https://cran.r-project.org/web/packages/udpipe/vignettes/udpipe-usecase-postagging-lemmatisation.html
library(udpipe)
<- udpipe_download_model(language = "english") ud_model
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(knitr)
library(tm)
library(quanteda)
library(lattice)
library(latticeExtra)
library(plotly)
library(pdp)
library(patchwork)
### CODE DIRECTLY FROM: https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html#
<- function(dtm, groups, pair = NULL, weights = rep(1,nrow(dtm)), k.prior = .1) {
fwgroups
is.na(weights)] <- 0
weights[
<- weights/mean(weights)
weights
<- rowSums(dtm)==0 | weights==0
zero.doc <- colSums(dtm[!zero.doc,])==0
zero.term
<- apply(dtm[!zero.doc,!zero.term],2,"*", weights[!zero.doc])
dtm.nz
<- tcrossprod(rowSums(dtm.nz),colSums(dtm.nz))/sum(dtm.nz)
g.prior
#
<- as.matrix(dtm.nz + k.prior*g.prior)
g.posterior
<- groups[!zero.doc]
groups <- droplevels(groups)
groups
<- as.matrix(aggregate(x=g.posterior,by=list(groups=groups),FUN=sum)[,-1])
g.adtm rownames(g.adtm) <- levels(groups)
<- log(g.adtm)
g.ladtm
<- t(scale( t(scale(g.ladtm, center=T, scale=F)), center=T, scale=F))
g.delta
<- -sweep(g.adtm,1,rowSums(g.adtm)) # terms not w spoken by k
g.adtm_w <- -sweep(g.adtm,2,colSums(g.adtm)) # w spoken by groups other than k
g.adtm_k <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k
g.adtm_kw
<- sqrt(1/g.adtm + 1/g.adtm_w + 1/g.adtm_k + 1/g.adtm_kw)
g.se
<- g.delta/g.se
g.zeta
<- as.matrix(aggregate(x=dtm.nz, by = list(groups=groups), FUN=sum)[,-1])
g.counts
if (!is.null(pair)) {
<- t(scale( t(scale(g.ladtm[pair,], center = T, scale =F)), center=T, scale=F))
pr.delta <- -sweep(g.adtm[pair,],1,rowSums(g.adtm[pair,]))
pr.adtm_w <- -sweep(g.adtm[pair,],2,colSums(g.adtm[pair,])) # w spoken by groups other than k
pr.adtm_k <- sum(g.adtm[pair,]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
pr.adtm_kw <- sqrt(1/g.adtm[pair,] + 1/pr.adtm_w + 1/pr.adtm_k + 1/pr.adtm_kw)
pr.se <- pr.delta/pr.se
pr.zeta
return(list(zeta=pr.zeta[1,], delta=pr.delta[1,],se=pr.se[1,], counts = colSums(dtm.nz), acounts = colSums(g.adtm)))
else {
} return(list(zeta=g.zeta,delta=g.delta,se=g.se,counts=g.counts,acounts=g.adtm))
}
}
############## FIGHTIN' WORDS PLOTTING FUNCTION
# helper function
<-function(someColor, alpha=100)
makeTransparent
{<-col2rgb(someColor)
newColorapply(newColor, 2, function(curcoldata){rgb(red=curcoldata[1], green=curcoldata[2],
blue=curcoldata[3],alpha=alpha, maxColorValue=255)})
}
<- function(fw.ch, groups.use = as.factor(rownames(fw.ch$zeta)), max.words = 50, max.countrank = 400, colorpalette=rep("black",length(groups.use)), sizescale=2, title="Comparison of Terms by Groups", subtitle = "", caption = "Group-specific terms are ordered by Fightin' Words statistic (Monroe, et al. 2008)") {
fw.ggplot.groups if (is.null(dim(fw.ch$zeta))) {## two-group fw object consists of vectors, not matrices
<- cbind(rank(-fw.ch$zeta),rank(fw.ch$zeta))
zetarankmat colnames(zetarankmat) <- groups.use
<- rank(-(fw.ch$counts))
countrank else {
} <- apply(-fw.ch$zeta[groups.use,],1,rank)
zetarankmat <- rank(-colSums(fw.ch$counts))
countrank
}<- as_tibble(cbind(zetarankmat,countrank=countrank))
wideplotmat $term=names(countrank)
wideplotmat#rankplot <- gather(wideplotmat, party, zetarank, 1:ncol(zetarankmat))
<- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))
rankplot $plotsize <- sizescale*(50/(rankplot$zetarank))^(1/4)
rankplot<- rankplot[rankplot$zetarank < max.words + 1 & rankplot$countrank<max.countrank+1,]
rankplot $groups.use <- factor(rankplot$groups.use,levels=groups.use)
rankplot
<- ggplot(rankplot, aes((nrow(rankplot)-countrank)^1, -(zetarank^1), colour=groups.use)) +
p geom_point(show.legend=F,size=sizescale/2) +
theme_classic() +
theme(axis.ticks=element_blank(), axis.text=element_blank() ) +
ylim(-max.words,40) +
facet_grid(groups.use ~ .) +
geom_text_repel(aes(label = term), size = rankplot$plotsize, point.padding=.05,
box.padding = unit(0.20, "lines"), show.legend=F, max.overlaps = Inf) +
scale_colour_manual(values = alpha(colorpalette, .7)) +
# labs(x="Terms used more frequently overall →", y="Terms used more frequently by group →", title=title, subtitle=subtitle , caption = caption)
labs(x=paste("Terms used more frequently overall -->"), y=paste("Terms used more frequently by group -->"), title=title, subtitle=subtitle , caption = caption)
}
options(ggrepel.max.overlaps = Inf)
<- function(fw.ch,n.keys=10) {
fw.keys <- nrow(fw.ch$zeta)
n.groups <- matrix("",n.keys,n.groups)
keys colnames(keys) <- rownames(fw.ch$zeta)
for (g in 1:n.groups) {
<- names(sort(fw.ch$zeta[g,],dec=T)[1:n.keys])
keys[,g]
}
keys }
Query search: (activi* | ahbash | akromiya | anjem | ansharut | anticapital* | antidemocr* | antiestablish* | antifa | antigovern* | antimilitar* | antimonarch* | antipatri* | antireli* | antisem* | antisocia* | antisyst* | apost* | atharis | athei* | atheists | außerparlamentari* | authoritar* | bagau* | bigots | bplf | bukhari* | capitulatio* | conspirato* | counterj* | cybercalip* | damigo | dawro* | demon* | deradicaliza* | deviatio* | diqqi | dissid* | djamaat | dotbus* | ecofas* | espou* | ethnonationa* | extrem* | facists | fadaia* | fanat* | fasci* | fetö | fightdem* | freedo* | fundamental* | fuqra | gafatar | gamerga* | gemidzii | ghuluww | globali* | gramsc* | gülen* | hacktiv* | haquna | hardline | harkatul | hatemon* | heimwe* | hezbol* | hinduph* | hindutva | hizbut | hojja* | ideolog* | incitem* | inciters | insurr* | intacti* | islam4uk | islam* | jaljalat | jbakc | jihadi* | jmjb | jrtn | judai* | juhayman | jundu* | kadiza* | kahanism | kahanist | karram* | kaysa* | khalis* | khatmia | khawarij | khomein* | koutla | leftist | leftists | leftwing | liberatio* | madkha* | madkhal* | maimonid* | manosp* | mauras* | mcln | militan* | millatu | monarc* | mudja* | muhaji* | mujahid* | murab* | muttahi* | najjadah | nationali* | neofas* | neona* | opantish | oppositi* | paleolibertar* | paramili* | parliamenta* | pegida | populist | principa* | profe* | prowar | putinist | qadari | quranism | quranist | qutbism | qutbist | qutbists | racis* | radica* | reactioni* | reformis* | reichsbürgerbewe* | rightist | rightw* | rofiq | russoph* | sabireen | sadda* | salaf* | sayaff | scriptura* | secula* | separationi* | sharia4hol* | sikrikim | split* | squadism | strasse* | subver* | suidlan* | sukarn* | suprema* | supremac* | sympathi* | table* | tabliq | takfir | takfir* | takfi* |terror* | theoc* | titoite | triba* | trots* | trotsk* | ukrainoph* | ultraconserva* | ultralib* | ultranationa* | ultrar* | uscmo | wahabbi | wahab* | wahha* | xenoph* | yulde* | zinovie*)
Load and clean the data
<-function(corpus){
text_cleaner<-Corpus(VectorSource(corpus))
tempcorpus<-tm_map(tempcorpus,
tempcorpus
removePunctuation)<-tm_map(tempcorpus,
tempcorpus
stripWhitespace)<-tm_map(tempcorpus,
tempcorpus
removeNumbers)<-tm_map(tempcorpus,
tempcorpusstopwords("english"))
removeWords, <-tm_map(tempcorpus,
tempcorpus
stemDocument)return(tempcorpus)
}
Calculate FW.
<- dfm(extremecorpus$content)
e message(dim(e))
head(e)
Document-feature matrix of: 6 documents, 15,213 features (100.0% sparse).
features
docs us struggl uptick american plot attack past month last week
text1 1 1 1 1 1 0 0 0 0 0
text2 0 0 0 0 0 1 1 1 1 1
text3 0 0 0 0 0 0 0 0 0 0
text4 0 0 0 0 0 0 0 0 0 0
text5 0 0 0 0 0 0 0 0 0 0
text6 0 0 0 0 0 0 0 0 0 0
[ reached max_nfeat ... 15,203 more features ]
#############################################
<- dfm_select(e, pattern = stopwords("english"), selection = "remove")
e <- dfm_select(e, min_nchar = 2)
e <- dfm_trim(e, min_termfreq = 4, min_docfreq = .05, verbose=TRUE)
e
#dim(e)
# sparsity(e)
#############################################
<- convert(e, to='data.frame')
extrem_dtm <- extrem_dtm[-c(1)]
extrem_dtm <- which( sapply(extrem_dtm, class ) == 'character' )
w
#############################################
<- fwgroups(extrem_dtm, groups=extrem_NYT.dfm.long$Context)
fw.extrem
rm(extrem_dtm)
Get and show the top words per group by zeta.
<- fw.keys(fw.extrem, n.keys=20)
fwkeys.extrem <- rev(colnames(fwkeys.extrem))
cols <- fwkeys.extrem[,cols]
fwkeys.extrem kable(fwkeys.extrem)
Context.before | Context.after |
---|---|
islamic | group |
palestinian | hussein |
war | attack |
iraqi | parti |
suspect | elect |
crack | leader |
jemaah | jihad |
crackdown | milit |
main | movement |
alleg | said |
support | guerrilla |
evid | network |
albanian | regim |
prodemocraci | armi |
sept | organ |
megawati | politician |
muslim | outsid |
prevent | univers |
kosovo | bomb |
battl | rocket |
NA
Plot: Before in Blue, After in Red
<- fw.ggplot.groups(fw.extrem,sizescale=4,max.words=200,
p.fw.extrem max.countrank=400,colorpalette=c("red","blue"),
title = 'Comparison of Terms Before and After Query Word')
p.fw.extrem
$Query.item <- as.factor(extrem_NYT.dfm.long$Query.item)
extrem_NYT.dfm.long
<-as.data.frame(sort(table(extrem_NYT.dfm.long$Query.item), decreasing = TRUE)[1:5])
top_n message(dim(top_n))
colnames(top_n) <- c('term', 'Freq')
message(top_n)
<- convert(e, to='data.frame')
extrem_dtm_topn $Number.of.hit <- extrem_NYT.dfm.long$Number.of.hit
extrem_dtm_topn
<- extrem_NYT.dfm.long %>%
topn_terms filter(Query.item %in% top_n$term)
<- extrem_dtm_topn %>%
extrem_dtm_topn_keep filter(Number.of.hit %in% topn_terms$Number.of.hit)
<- sum(length(extrem_dtm_topn_keep))
r
<- extrem_dtm_topn_keep[-c(1, r)]
extrem_dtm_topn_keep
<- fwgroups(extrem_dtm_topn_keep,groups = topn_terms$Query.item)
fw.query_item <- fw.keys(fw.query_item, n.keys=15)
fwkeys.query_item kable(fwkeys.query_item)
Islamic | militants | opposition | Saddam | terrorist |
---|---|---|---|---|
milit | islamic | parti | hussein | attack |
hama | palestinian | democrat | iraqi | organ |
jihad | kill | parliament | iraq | sept |
group | israeli | leader | presid | unite |
movement | suspect | elect | regim | state |
extremist | gaza | vote | saddam | activ |
front | israel | opposit | baghdad | alqaida |
republ | attack | lawmak | un | act |
insurg | kashmir | polit | kuwait | network |
radic | taliban | main | war | group |
law | arafat | candid | captur | suspect |
fundamentalist | forc | coalit | oust | laden |
revolut | pakistani | conserv | weapon | bin |
somalia | fire | allianc | bush | connect |
hardlin | pakistan | protest | us | financ |
rm(r)
rm(extrem_dtm_topn)
########################################################
<- fw.ggplot.groups(fw.query_item,sizescale=3.2,max.words=150,max.countrank=400,
p.fw.query_item colorpalette=c("darkgreen","darkgreen","darkgreen","darkgreen","darkgreen"),
title = 'Comparison of Terms by Overall Top Terms')
p.fw.query_item
<- convert(e, to='data.frame')
extrem_dtm_topn $Number.of.hit <- extrem_NYT.dfm.long$Number.of.hit
extrem_dtm_topn$Context <- extrem_NYT.dfm.long$Context
extrem_dtm_topn
<- extrem_NYT.dfm.long %>%
topn_terms filter(Query.item %in% top_n$term)
<- extrem_dtm_topn %>%
extrem_dtm_topn_keep filter(Number.of.hit %in% topn_terms$Number.of.hit)
<- extrem_dtm_topn_keep[grep("before",extrem_dtm_topn_keep$Context),]
extrem_dtm_topn_keep_before <- extrem_dtm_topn_keep[grep("after", extrem_dtm_topn_keep$Context),]
extrem_dtm_topn_keep_after
<-dim(topn_terms)[1]
rr <- sum(length(extrem_dtm_topn_keep_before))
r <- topn_terms[seq(1,rr,2),]
topn_terms_before <- topn_terms[seq(2,rr,2),]
topn_terms_after
<- extrem_dtm_topn_keep_before[-c(1, r-1, r)]
extrem_dtm_topn_keep_before <- extrem_dtm_topn_keep_after[-c(1, r-1, r)]
extrem_dtm_topn_keep_after
rm(rr)
rm(r)
rm(extrem_dtm_topn)
#############################################
<- fwgroups(extrem_dtm_topn_keep_before,groups = topn_terms_before$Query.item)
fw.query_item_before <- fw.keys(fw.query_item_before, n.keys=15)
fwkeys.query_item_before kable(fwkeys.query_item_before, caption = "Top 15 Words for Query Term: BEFORE")
Islamic | militants | opposition | Saddam | terrorist |
---|---|---|---|---|
milit | islamic | main | iraqi | sept |
strict | palestinian | elect | presid | alqaida |
radic | kill | polit | iraq | unite |
somalia | suspect | parliament | war | bin |
hama | taliban | opposit | saddam | laden |
organiz | attack | parti | oust | involv |
secular | israeli | minist | baghdad | suspect |
hardlin | muslim | vote | fall | link |
iran | gaza | democrat | bush | state |
turkey | armi | prime | hussein | consid |
extremist | troop | strong | captur | connect |
malaysia | israel | percent | regim | appar |
suprem | pakistan | despit | usled | osama |
gaza | forc | poll | former | charg |
fundamentalist | arafat | voic | death | list |
<- fw.ggplot.groups(fw.query_item_before,sizescale=2,max.words=150,max.countrank=400,
p.fw.query_item_before colorpalette = c('blue','blue','blue', 'blue','blue'),
title = 'Comparison of Terms by Overall Top Terms: BEFORE')
p.fw.query_item_before
#############################################
<- fwgroups(extrem_dtm_topn_keep_after,groups = topn_terms_after$Query.item)
fw.query_item_after <- fw.keys(fw.query_item_after, n.keys=15)
fwkeys.query_item_after kable(fwkeys.query_item_after, caption = "Top 15 Words for Query Term: AFTER")
Islamic | militants | opposition | Saddam | terrorist |
---|---|---|---|---|
milit | israeli | parti | hussein | attack |
group | israel | leader | regim | organ |
jihad | kashmir | democrat | iraq | unite |
hama | gaza | vote | iraqi | state |
movement | palestinian | parliament | un | activ |
law | kill | lawmak | weapon | group |
insurg | attack | protest | saddam | act |
extremist | region | candid | son | network |
front | southern | politician | captur | bomb |
revolut | fire | coalit | kuwait | suspect |
republ | rocket | elect | mass | threat |
islamic | pakistani | social | baghdad | financ |
court | fight | conserv | us | sept |
fundamentalist | border | win | resolut | link |
radic | area | opposit | palac | cell |
<- fw.ggplot.groups(fw.query_item_after,sizescale=2,max.words=150,max.countrank=400,
p.fw.query_item_after colorpalette = c('red', 'red','red','red','red'),
title = 'Comparison of Terms by Overall Top Terms: AFTER')
p.fw.query_item_after
NA
NA
Calculate FW and keys
<- udpipe_load_model(ud_model$file_model)
ud_model
<-as.character(extrem_NYT.dfm.long$context.text)
txt
<- udpipe_annotate(ud_model, x = txt, doc_id = seq_along(txt))
x_udp <- as.data.frame(x_udp)
x
$doc_id <-as.integer(x$doc_id)
x
<- x[x$doc_id %% 2 == 1,]
x_odd.before <-x[x$doc_id %% 2 == 0, ] x_even.after
A few barchart functions
## UNIVERSAL PoS
<- function(df1, df2){
UPOS_barchart <- txt_freq(df1$upos)
stats1 $key <- factor(stats1$key, levels = rev(stats1$key))
stats1
<- txt_freq(df2$upos)
stats2 $key <- factor(stats2$key, levels = rev(stats2$key))
stats2
c(barchart(key ~ freq, data = stats1, col = "cadetblue",
main = "UPOS (Universal Parts of Speech)\n frequency of occurrence: BEFORE vs AFTER",
xlab = "Freq"),
barchart(key ~ freq, data = stats2, col = 'skyblue',
xlab = "Freq"))
}
## NOUNS
<- function(df1, df2){
NOUNS_barchart
<- subset(df1, upos %in% c("NOUN"))
stats1 <- txt_freq(stats1$token)
stats1 $key <- factor(stats1$key, levels = rev(stats1$key))
stats1
<- subset(df2, upos %in% c("NOUN"))
stats2 <- txt_freq(stats2$token)
stats2 $key <- factor(stats2$key, levels = rev(stats2$key))
stats2
c(barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
main = "Most occurring nouns: BEFORE vs AFTER", xlab = "Freq"),
barchart(key ~ freq, data = head(stats2, 20), col = "skyblue",
xlab = "Freq"))
}
## ADJECTIVES
<- function(df1, df2){
ADJ_barchart
<- subset(df1, upos %in% c("ADJ"))
stats1 <- txt_freq(stats1$token)
stats1 $key <- factor(stats1$key, levels = rev(stats1$key))
stats1
<- subset(df2, upos %in% c("ADJ"))
stats2 <- txt_freq(stats2$token)
stats2 $key <- factor(stats2$key, levels = rev(stats2$key))
stats2
c(barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
main = "Most occurring adjectives: BEFORE vs AFTER", xlab = "Freq"),
barchart(key ~ freq, data = head(stats2, 20), col = "skyblue",
xlab = "Freq"))
}
## Using RAKE to find keywords
<- function(df1,df2){
RAKE_KW_barchart
<- keywords_rake(x = df1, term = "lemma", group = "doc_id",
stats1 relevant = df1$upos %in% c("NOUN", "ADJ"))
$key <- factor(stats1$keyword, levels = rev(stats1$keyword))
stats1
<- keywords_rake(x = df2, term = "lemma", group = "doc_id",
stats2 relevant = df2$upos %in% c("NOUN", "ADJ"))
$key <- factor(stats2$keyword, levels = rev(stats2$keyword))
stats2
c(barchart(key ~ rake, data = head(subset(stats1, freq > 3), 20), col = "cadetblue",
main = "Keywords identified by RAKE: BEFORE vs AFTER",
xlab = "Rake"),
barchart(key ~ rake, data = head(subset(stats2, freq > 3), 20), col = "skyblue",
xlab = "Rake"))
}
## Using Pointwise Mutual Information Collocations
<- function(df1, df2){
PWI_barchart
$word <- tolower(df1$token)
df1<- keywords_collocation(x = df1, term = "word", group = "doc_id")
stats1 $key <- factor(stats1$keyword, levels = rev(stats1$keyword))
stats1
$word <- tolower(df2$token)
df2<- keywords_collocation(x = df2, term = "word", group = "doc_id")
stats2 $key <- factor(stats2$keyword, levels = rev(stats2$keyword))
stats2
c(barchart(key ~ pmi, data = head(subset(stats1, freq > 3), 20), col = "cadetblue",
main = "Keywords identified by PMI Collocation: BEFORE vs AFTER",
xlab = "PMI (Pointwise Mutual Information)"),
barchart(key ~ pmi, data = head(subset(stats2, freq > 3), 20), col = "skyblue",
xlab = "PMI (Pointwise Mutual Information)"))
}
## Using a sequence of POS tags (noun phrases / verb phrases)
<- function(df1, df2){
POS_barchart
$phrase_tag <- as_phrasemachine(df1$upos, type = "upos")
df1<- keywords_phrases(x = df1$phrase_tag, term = tolower(df1$token),
stats1 pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
<- subset(stats1, ngram > 1 & freq > 3)
stats1 $key <- factor(stats1$keyword, levels = rev(stats1$keyword))
stats1
$phrase_tag <- as_phrasemachine(df2$upos, type = "upos")
df2<- keywords_phrases(x = df2$phrase_tag, term = tolower(df2$token),
stats2 pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
<- subset(stats2, ngram > 1 & freq > 3)
stats2 $key <- factor(stats2$keyword, levels = rev(stats2$keyword))
stats2
c(barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
main = "Keywords - simple noun phrases: BEFORE vs AFTER", xlab = "Frequency"),
barchart(key ~ freq, data = head(stats2, 20), col = "skyblue",
xlab = "Frequency"))
}
UPOS_barchart(x_odd.before, x_even.after)
NOUNS_barchart(x_odd.before, x_even.after)
ADJ_barchart(x_odd.before, x_even.after)
RAKE_KW_barchart(x_odd.before, x_even.after)
PWI_barchart(x_odd.before, x_even.after)
POS_barchart(x_odd.before, x_even.after)
<- function(df){
Corrs $id <- unique_identifier(df, fields = c("sentence_id", "doc_id"))
df<- subset(df, upos %in% c("NOUN", "ADJ"))
dtm <- document_term_frequencies(dtm, document = "id", term = "lemma")
dtm <- document_term_matrix(dtm)
dtm <- dtm_remove_lowfreq(dtm, minfreq = 5)
dtm <- dtm_cor(dtm)
termcorrelations <- as_cooccurrence(termcorrelations)
y <- subset(y, term1 < term2 & abs(cooc) > 0.2)
y <- y[order(abs(y$cooc), decreasing = TRUE), ]
y print(y[1:25,])
}
Corrs(x_odd.before)
Corrs(x_even.after)
NA
rm(list=ls())