### https://cran.r-project.org/web/packages/udpipe/vignettes/udpipe-usecase-postagging-lemmatisation.html
library(udpipe)
ud_model <- udpipe_download_model(language = "english")library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(knitr)
library(tm)
library(quanteda)
library(lattice)
library(latticeExtra)
library(plotly)
library(pdp)
library(patchwork)
### CODE DIRECTLY FROM: https://burtmonroe.github.io/TextAsDataCourse/Tutorials/TADA-FightinWords.nb.html#
fwgroups <- function(dtm, groups, pair = NULL, weights = rep(1,nrow(dtm)), k.prior = .1) {
weights[is.na(weights)] <- 0
weights <- weights/mean(weights)
zero.doc <- rowSums(dtm)==0 | weights==0
zero.term <- colSums(dtm[!zero.doc,])==0
dtm.nz <- apply(dtm[!zero.doc,!zero.term],2,"*", weights[!zero.doc])
g.prior <- tcrossprod(rowSums(dtm.nz),colSums(dtm.nz))/sum(dtm.nz)
#
g.posterior <- as.matrix(dtm.nz + k.prior*g.prior)
groups <- groups[!zero.doc]
groups <- droplevels(groups)
g.adtm <- as.matrix(aggregate(x=g.posterior,by=list(groups=groups),FUN=sum)[,-1])
rownames(g.adtm) <- levels(groups)
g.ladtm <- log(g.adtm)
g.delta <- t(scale( t(scale(g.ladtm, center=T, scale=F)), center=T, scale=F))
g.adtm_w <- -sweep(g.adtm,1,rowSums(g.adtm)) # terms not w spoken by k
g.adtm_k <- -sweep(g.adtm,2,colSums(g.adtm)) # w spoken by groups other than k
g.adtm_kw <- sum(g.adtm) - g.adtm_w - g.adtm_k - g.adtm # total terms not w or k
g.se <- sqrt(1/g.adtm + 1/g.adtm_w + 1/g.adtm_k + 1/g.adtm_kw)
g.zeta <- g.delta/g.se
g.counts <- as.matrix(aggregate(x=dtm.nz, by = list(groups=groups), FUN=sum)[,-1])
if (!is.null(pair)) {
pr.delta <- t(scale( t(scale(g.ladtm[pair,], center = T, scale =F)), center=T, scale=F))
pr.adtm_w <- -sweep(g.adtm[pair,],1,rowSums(g.adtm[pair,]))
pr.adtm_k <- -sweep(g.adtm[pair,],2,colSums(g.adtm[pair,])) # w spoken by groups other than k
pr.adtm_kw <- sum(g.adtm[pair,]) - pr.adtm_w - pr.adtm_k - g.adtm[pair,] # total terms not w or k
pr.se <- sqrt(1/g.adtm[pair,] + 1/pr.adtm_w + 1/pr.adtm_k + 1/pr.adtm_kw)
pr.zeta <- pr.delta/pr.se
return(list(zeta=pr.zeta[1,], delta=pr.delta[1,],se=pr.se[1,], counts = colSums(dtm.nz), acounts = colSums(g.adtm)))
} else {
return(list(zeta=g.zeta,delta=g.delta,se=g.se,counts=g.counts,acounts=g.adtm))
}
}
############## FIGHTIN' WORDS PLOTTING FUNCTION
# helper function
makeTransparent<-function(someColor, alpha=100)
{
newColor<-col2rgb(someColor)
apply(newColor, 2, function(curcoldata){rgb(red=curcoldata[1], green=curcoldata[2],
blue=curcoldata[3],alpha=alpha, maxColorValue=255)})
}
fw.ggplot.groups <- function(fw.ch, groups.use = as.factor(rownames(fw.ch$zeta)), max.words = 50, max.countrank = 400, colorpalette=rep("black",length(groups.use)), sizescale=2, title="Comparison of Terms by Groups", subtitle = "", caption = "Group-specific terms are ordered by Fightin' Words statistic (Monroe, et al. 2008)") {
if (is.null(dim(fw.ch$zeta))) {## two-group fw object consists of vectors, not matrices
zetarankmat <- cbind(rank(-fw.ch$zeta),rank(fw.ch$zeta))
colnames(zetarankmat) <- groups.use
countrank <- rank(-(fw.ch$counts))
} else {
zetarankmat <- apply(-fw.ch$zeta[groups.use,],1,rank)
countrank <- rank(-colSums(fw.ch$counts))
}
wideplotmat <- as_tibble(cbind(zetarankmat,countrank=countrank))
wideplotmat$term=names(countrank)
#rankplot <- gather(wideplotmat, party, zetarank, 1:ncol(zetarankmat))
rankplot <- gather(wideplotmat, groups.use, zetarank, 1:ncol(zetarankmat))
rankplot$plotsize <- sizescale*(50/(rankplot$zetarank))^(1/4)
rankplot <- rankplot[rankplot$zetarank < max.words + 1 & rankplot$countrank<max.countrank+1,]
rankplot$groups.use <- factor(rankplot$groups.use,levels=groups.use)
p <- ggplot(rankplot, aes((nrow(rankplot)-countrank)^1, -(zetarank^1), colour=groups.use)) +
geom_point(show.legend=F,size=sizescale/2) +
theme_classic() +
theme(axis.ticks=element_blank(), axis.text=element_blank() ) +
ylim(-max.words,40) +
facet_grid(groups.use ~ .) +
geom_text_repel(aes(label = term), size = rankplot$plotsize, point.padding=.05,
box.padding = unit(0.20, "lines"), show.legend=F, max.overlaps = Inf) +
scale_colour_manual(values = alpha(colorpalette, .7)) +
# labs(x="Terms used more frequently overall →", y="Terms used more frequently by group →", title=title, subtitle=subtitle , caption = caption)
labs(x=paste("Terms used more frequently overall -->"), y=paste("Terms used more frequently by group -->"), title=title, subtitle=subtitle , caption = caption)
}
options(ggrepel.max.overlaps = Inf)
fw.keys <- function(fw.ch,n.keys=10) {
n.groups <- nrow(fw.ch$zeta)
keys <- matrix("",n.keys,n.groups)
colnames(keys) <- rownames(fw.ch$zeta)
for (g in 1:n.groups) {
keys[,g] <- names(sort(fw.ch$zeta[g,],dec=T)[1:n.keys])
}
keys
}Query search: (activi* | ahbash | akromiya | anjem | ansharut | anticapital* | antidemocr* | antiestablish* | antifa | antigovern* | antimilitar* | antimonarch* | antipatri* | antireli* | antisem* | antisocia* | antisyst* | apost* | atharis | athei* | atheists | außerparlamentari* | authoritar* | bagau* | bigots | bplf | bukhari* | capitulatio* | conspirato* | counterj* | cybercalip* | damigo | dawro* | demon* | deradicaliza* | deviatio* | diqqi | dissid* | djamaat | dotbus* | ecofas* | espou* | ethnonationa* | extrem* | facists | fadaia* | fanat* | fasci* | fetö | fightdem* | freedo* | fundamental* | fuqra | gafatar | gamerga* | gemidzii | ghuluww | globali* | gramsc* | gülen* | hacktiv* | haquna | hardline | harkatul | hatemon* | heimwe* | hezbol* | hinduph* | hindutva | hizbut | hojja* | ideolog* | incitem* | inciters | insurr* | intacti* | islam4uk | islam* | jaljalat | jbakc | jihadi* | jmjb | jrtn | judai* | juhayman | jundu* | kadiza* | kahanism | kahanist | karram* | kaysa* | khalis* | khatmia | khawarij | khomein* | koutla | leftist | leftists | leftwing | liberatio* | madkha* | madkhal* | maimonid* | manosp* | mauras* | mcln | militan* | millatu | monarc* | mudja* | muhaji* | mujahid* | murab* | muttahi* | najjadah | nationali* | neofas* | neona* | opantish | oppositi* | paleolibertar* | paramili* | parliamenta* | pegida | populist | principa* | profe* | prowar | putinist | qadari | quranism | quranist | qutbism | qutbist | qutbists | racis* | radica* | reactioni* | reformis* | reichsbürgerbewe* | rightist | rightw* | rofiq | russoph* | sabireen | sadda* | salaf* | sayaff | scriptura* | secula* | separationi* | sharia4hol* | sikrikim | split* | squadism | strasse* | subver* | suidlan* | sukarn* | suprema* | supremac* | sympathi* | table* | tabliq | takfir | takfir* | takfi* |terror* | theoc* | titoite | triba* | trots* | trotsk* | ukrainoph* | ultraconserva* | ultralib* | ultranationa* | ultrar* | uscmo | wahabbi | wahab* | wahha* | xenoph* | yulde* | zinovie*)
Load and clean the data
text_cleaner<-function(corpus){
tempcorpus<-Corpus(VectorSource(corpus))
tempcorpus<-tm_map(tempcorpus,
removePunctuation)
tempcorpus<-tm_map(tempcorpus,
stripWhitespace)
tempcorpus<-tm_map(tempcorpus,
removeNumbers)
tempcorpus<-tm_map(tempcorpus,
removeWords, stopwords("english"))
tempcorpus<-tm_map(tempcorpus,
stemDocument)
return(tempcorpus)
}Calculate FW.
e <- dfm(extremecorpus$content)
message(dim(e))
head(e)Document-feature matrix of: 6 documents, 15,213 features (100.0% sparse).
features
docs us struggl uptick american plot attack past month last week
text1 1 1 1 1 1 0 0 0 0 0
text2 0 0 0 0 0 1 1 1 1 1
text3 0 0 0 0 0 0 0 0 0 0
text4 0 0 0 0 0 0 0 0 0 0
text5 0 0 0 0 0 0 0 0 0 0
text6 0 0 0 0 0 0 0 0 0 0
[ reached max_nfeat ... 15,203 more features ]
#############################################
e <- dfm_select(e, pattern = stopwords("english"), selection = "remove")
e <- dfm_select(e, min_nchar = 2)
e <- dfm_trim(e, min_termfreq = 4, min_docfreq = .05, verbose=TRUE)
#dim(e)
# sparsity(e)
#############################################
extrem_dtm <- convert(e, to='data.frame')
extrem_dtm <- extrem_dtm[-c(1)]
w <- which( sapply(extrem_dtm, class ) == 'character' )
#############################################
fw.extrem <- fwgroups(extrem_dtm, groups=extrem_NYT.dfm.long$Context)
rm(extrem_dtm)Get and show the top words per group by zeta.
fwkeys.extrem <- fw.keys(fw.extrem, n.keys=20)
cols <- rev(colnames(fwkeys.extrem))
fwkeys.extrem <- fwkeys.extrem[,cols]
kable(fwkeys.extrem)| Context.before | Context.after |
|---|---|
| islamic | group |
| palestinian | hussein |
| war | attack |
| iraqi | parti |
| suspect | elect |
| crack | leader |
| jemaah | jihad |
| crackdown | milit |
| main | movement |
| alleg | said |
| support | guerrilla |
| evid | network |
| albanian | regim |
| prodemocraci | armi |
| sept | organ |
| megawati | politician |
| muslim | outsid |
| prevent | univers |
| kosovo | bomb |
| battl | rocket |
NAPlot: Before in Blue, After in Red
p.fw.extrem <- fw.ggplot.groups(fw.extrem,sizescale=4,max.words=200,
max.countrank=400,colorpalette=c("red","blue"),
title = 'Comparison of Terms Before and After Query Word')
p.fw.extrem
extrem_NYT.dfm.long$Query.item <- as.factor(extrem_NYT.dfm.long$Query.item)
top_n <-as.data.frame(sort(table(extrem_NYT.dfm.long$Query.item), decreasing = TRUE)[1:5])
message(dim(top_n))
colnames(top_n) <- c('term', 'Freq')
message(top_n)
extrem_dtm_topn <- convert(e, to='data.frame')
extrem_dtm_topn$Number.of.hit <- extrem_NYT.dfm.long$Number.of.hit
topn_terms <- extrem_NYT.dfm.long %>%
filter(Query.item %in% top_n$term)
extrem_dtm_topn_keep <- extrem_dtm_topn %>%
filter(Number.of.hit %in% topn_terms$Number.of.hit)
r <- sum(length(extrem_dtm_topn_keep))
extrem_dtm_topn_keep <- extrem_dtm_topn_keep[-c(1, r)]
fw.query_item <- fwgroups(extrem_dtm_topn_keep,groups = topn_terms$Query.item)
fwkeys.query_item <- fw.keys(fw.query_item, n.keys=15)
kable(fwkeys.query_item)| Islamic | militants | opposition | Saddam | terrorist |
|---|---|---|---|---|
| milit | islamic | parti | hussein | attack |
| hama | palestinian | democrat | iraqi | organ |
| jihad | kill | parliament | iraq | sept |
| group | israeli | leader | presid | unite |
| movement | suspect | elect | regim | state |
| extremist | gaza | vote | saddam | activ |
| front | israel | opposit | baghdad | alqaida |
| republ | attack | lawmak | un | act |
| insurg | kashmir | polit | kuwait | network |
| radic | taliban | main | war | group |
| law | arafat | candid | captur | suspect |
| fundamentalist | forc | coalit | oust | laden |
| revolut | pakistani | conserv | weapon | bin |
| somalia | fire | allianc | bush | connect |
| hardlin | pakistan | protest | us | financ |
rm(r)
rm(extrem_dtm_topn)
########################################################
p.fw.query_item <- fw.ggplot.groups(fw.query_item,sizescale=3.2,max.words=150,max.countrank=400,
colorpalette=c("darkgreen","darkgreen","darkgreen","darkgreen","darkgreen"),
title = 'Comparison of Terms by Overall Top Terms')
p.fw.query_item
extrem_dtm_topn <- convert(e, to='data.frame')
extrem_dtm_topn$Number.of.hit <- extrem_NYT.dfm.long$Number.of.hit
extrem_dtm_topn$Context <- extrem_NYT.dfm.long$Context
topn_terms <- extrem_NYT.dfm.long %>%
filter(Query.item %in% top_n$term)
extrem_dtm_topn_keep <- extrem_dtm_topn %>%
filter(Number.of.hit %in% topn_terms$Number.of.hit)
extrem_dtm_topn_keep_before <- extrem_dtm_topn_keep[grep("before",extrem_dtm_topn_keep$Context),]
extrem_dtm_topn_keep_after <- extrem_dtm_topn_keep[grep("after", extrem_dtm_topn_keep$Context),]
rr <-dim(topn_terms)[1]
r <- sum(length(extrem_dtm_topn_keep_before))
topn_terms_before <- topn_terms[seq(1,rr,2),]
topn_terms_after <- topn_terms[seq(2,rr,2),]
extrem_dtm_topn_keep_before <- extrem_dtm_topn_keep_before[-c(1, r-1, r)]
extrem_dtm_topn_keep_after <- extrem_dtm_topn_keep_after[-c(1, r-1, r)]
rm(rr)
rm(r)
rm(extrem_dtm_topn)
#############################################
fw.query_item_before <- fwgroups(extrem_dtm_topn_keep_before,groups = topn_terms_before$Query.item)
fwkeys.query_item_before <- fw.keys(fw.query_item_before, n.keys=15)
kable(fwkeys.query_item_before, caption = "Top 15 Words for Query Term: BEFORE")| Islamic | militants | opposition | Saddam | terrorist |
|---|---|---|---|---|
| milit | islamic | main | iraqi | sept |
| strict | palestinian | elect | presid | alqaida |
| radic | kill | polit | iraq | unite |
| somalia | suspect | parliament | war | bin |
| hama | taliban | opposit | saddam | laden |
| organiz | attack | parti | oust | involv |
| secular | israeli | minist | baghdad | suspect |
| hardlin | muslim | vote | fall | link |
| iran | gaza | democrat | bush | state |
| turkey | armi | prime | hussein | consid |
| extremist | troop | strong | captur | connect |
| malaysia | israel | percent | regim | appar |
| suprem | pakistan | despit | usled | osama |
| gaza | forc | poll | former | charg |
| fundamentalist | arafat | voic | death | list |
p.fw.query_item_before <- fw.ggplot.groups(fw.query_item_before,sizescale=2,max.words=150,max.countrank=400,
colorpalette = c('blue','blue','blue', 'blue','blue'),
title = 'Comparison of Terms by Overall Top Terms: BEFORE')
p.fw.query_item_before
#############################################
fw.query_item_after <- fwgroups(extrem_dtm_topn_keep_after,groups = topn_terms_after$Query.item)
fwkeys.query_item_after <- fw.keys(fw.query_item_after, n.keys=15)
kable(fwkeys.query_item_after, caption = "Top 15 Words for Query Term: AFTER")| Islamic | militants | opposition | Saddam | terrorist |
|---|---|---|---|---|
| milit | israeli | parti | hussein | attack |
| group | israel | leader | regim | organ |
| jihad | kashmir | democrat | iraq | unite |
| hama | gaza | vote | iraqi | state |
| movement | palestinian | parliament | un | activ |
| law | kill | lawmak | weapon | group |
| insurg | attack | protest | saddam | act |
| extremist | region | candid | son | network |
| front | southern | politician | captur | bomb |
| revolut | fire | coalit | kuwait | suspect |
| republ | rocket | elect | mass | threat |
| islamic | pakistani | social | baghdad | financ |
| court | fight | conserv | us | sept |
| fundamentalist | border | win | resolut | link |
| radic | area | opposit | palac | cell |
p.fw.query_item_after <- fw.ggplot.groups(fw.query_item_after,sizescale=2,max.words=150,max.countrank=400,
colorpalette = c('red', 'red','red','red','red'),
title = 'Comparison of Terms by Overall Top Terms: AFTER')
p.fw.query_item_afterNA
NACalculate FW and keys
ud_model <- udpipe_load_model(ud_model$file_model)
txt <-as.character(extrem_NYT.dfm.long$context.text)
x_udp <- udpipe_annotate(ud_model, x = txt, doc_id = seq_along(txt))
x <- as.data.frame(x_udp)
x$doc_id <-as.integer(x$doc_id)
x_odd.before <- x[x$doc_id %% 2 == 1,]
x_even.after <-x[x$doc_id %% 2 == 0, ]A few barchart functions
## UNIVERSAL PoS
UPOS_barchart <- function(df1, df2){
stats1 <- txt_freq(df1$upos)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
stats2 <- txt_freq(df2$upos)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
c(barchart(key ~ freq, data = stats1, col = "cadetblue",
main = "UPOS (Universal Parts of Speech)\n frequency of occurrence: BEFORE vs AFTER",
xlab = "Freq"),
barchart(key ~ freq, data = stats2, col = 'skyblue',
xlab = "Freq"))
}
## NOUNS
NOUNS_barchart <- function(df1, df2){
stats1 <- subset(df1, upos %in% c("NOUN"))
stats1 <- txt_freq(stats1$token)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
stats2 <- subset(df2, upos %in% c("NOUN"))
stats2 <- txt_freq(stats2$token)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
c(barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
main = "Most occurring nouns: BEFORE vs AFTER", xlab = "Freq"),
barchart(key ~ freq, data = head(stats2, 20), col = "skyblue",
xlab = "Freq"))
}
## ADJECTIVES
ADJ_barchart <- function(df1, df2){
stats1 <- subset(df1, upos %in% c("ADJ"))
stats1 <- txt_freq(stats1$token)
stats1$key <- factor(stats1$key, levels = rev(stats1$key))
stats2 <- subset(df2, upos %in% c("ADJ"))
stats2 <- txt_freq(stats2$token)
stats2$key <- factor(stats2$key, levels = rev(stats2$key))
c(barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
main = "Most occurring adjectives: BEFORE vs AFTER", xlab = "Freq"),
barchart(key ~ freq, data = head(stats2, 20), col = "skyblue",
xlab = "Freq"))
}
## Using RAKE to find keywords
RAKE_KW_barchart <- function(df1,df2){
stats1 <- keywords_rake(x = df1, term = "lemma", group = "doc_id",
relevant = df1$upos %in% c("NOUN", "ADJ"))
stats1$key <- factor(stats1$keyword, levels = rev(stats1$keyword))
stats2 <- keywords_rake(x = df2, term = "lemma", group = "doc_id",
relevant = df2$upos %in% c("NOUN", "ADJ"))
stats2$key <- factor(stats2$keyword, levels = rev(stats2$keyword))
c(barchart(key ~ rake, data = head(subset(stats1, freq > 3), 20), col = "cadetblue",
main = "Keywords identified by RAKE: BEFORE vs AFTER",
xlab = "Rake"),
barchart(key ~ rake, data = head(subset(stats2, freq > 3), 20), col = "skyblue",
xlab = "Rake"))
}
## Using Pointwise Mutual Information Collocations
PWI_barchart <- function(df1, df2){
df1$word <- tolower(df1$token)
stats1 <- keywords_collocation(x = df1, term = "word", group = "doc_id")
stats1$key <- factor(stats1$keyword, levels = rev(stats1$keyword))
df2$word <- tolower(df2$token)
stats2 <- keywords_collocation(x = df2, term = "word", group = "doc_id")
stats2$key <- factor(stats2$keyword, levels = rev(stats2$keyword))
c(barchart(key ~ pmi, data = head(subset(stats1, freq > 3), 20), col = "cadetblue",
main = "Keywords identified by PMI Collocation: BEFORE vs AFTER",
xlab = "PMI (Pointwise Mutual Information)"),
barchart(key ~ pmi, data = head(subset(stats2, freq > 3), 20), col = "skyblue",
xlab = "PMI (Pointwise Mutual Information)"))
}
## Using a sequence of POS tags (noun phrases / verb phrases)
POS_barchart <- function(df1, df2){
df1$phrase_tag <- as_phrasemachine(df1$upos, type = "upos")
stats1 <- keywords_phrases(x = df1$phrase_tag, term = tolower(df1$token),
pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
stats1 <- subset(stats1, ngram > 1 & freq > 3)
stats1$key <- factor(stats1$keyword, levels = rev(stats1$keyword))
df2$phrase_tag <- as_phrasemachine(df2$upos, type = "upos")
stats2 <- keywords_phrases(x = df2$phrase_tag, term = tolower(df2$token),
pattern = "(A|N)*N(P+D*(A|N)*N)*",
is_regex = TRUE, detailed = FALSE)
stats2 <- subset(stats2, ngram > 1 & freq > 3)
stats2$key <- factor(stats2$keyword, levels = rev(stats2$keyword))
c(barchart(key ~ freq, data = head(stats1, 20), col = "cadetblue",
main = "Keywords - simple noun phrases: BEFORE vs AFTER", xlab = "Frequency"),
barchart(key ~ freq, data = head(stats2, 20), col = "skyblue",
xlab = "Frequency"))
}
UPOS_barchart(x_odd.before, x_even.after)NOUNS_barchart(x_odd.before, x_even.after)ADJ_barchart(x_odd.before, x_even.after)RAKE_KW_barchart(x_odd.before, x_even.after)PWI_barchart(x_odd.before, x_even.after)POS_barchart(x_odd.before, x_even.after)
Corrs <- function(df){
df$id <- unique_identifier(df, fields = c("sentence_id", "doc_id"))
dtm <- subset(df, upos %in% c("NOUN", "ADJ"))
dtm <- document_term_frequencies(dtm, document = "id", term = "lemma")
dtm <- document_term_matrix(dtm)
dtm <- dtm_remove_lowfreq(dtm, minfreq = 5)
termcorrelations <- dtm_cor(dtm)
y <- as_cooccurrence(termcorrelations)
y <- subset(y, term1 < term2 & abs(cooc) > 0.2)
y <- y[order(abs(y$cooc), decreasing = TRUE), ]
print(y[1:25,])
}
Corrs(x_odd.before)
Corrs(x_even.after)NA
rm(list=ls())