############################################################### ########### SENTIMENT ANALYSIS: TERRITORIAL PAPERS ############ ############################################################### rm(list = ls()) library(stm) library(tm) library(slam) ############################################################### ######################### TERRITORIAL PAPERS ################## ################## Reading from file datacomb.csv ############# ############ authors and recipients already cleaned up ######## ############################################################### data <- read.csv("C:\\Johannes Ledolter\\2020March01Book\\Chapter6WEB\\datacomb.csv",stringsAsFactors=F) dim(data) data[1:10,1:5] ids1=data$indc ids2=data$autr ids3=data$recr ids4=data$yearc ids1 length(ids1) ids2 length(ids2) ids3 length(ids3) ids4 length(ids4) corpus <- Corpus(VectorSource(data$text),readerControl = list(reader = readPlain)) ## this is how to create corpus corpus1 <- tm_map(corpus, stripWhitespace) corpus2 <- tm_map(corpus1, content_transformer(tolower)) corpus3 <- tm_map(corpus2, removePunctuation) corpus4 <- tm_map(corpus3, removeNumbers) corpus5 <- tm_map(corpus4, removeWords, stopwords("english")) corpus.dtm <- DocumentTermMatrix(corpus5,control=list(stemming=FALSE)) ## no stemming as default dim(corpus.dtm) #################################################################### ####################### SENTIMENT ANALYSIS ######################### #################################################################### ## loading the words with positive and negative sentiment load("C:\\Johannes Ledolter\\2020March01Book\\Chapter6WEB\\SentimentWords.RData") pos ## words with positive sentiment neg ## words with negative sentiment corpuscomb=corpus5 ## represents the cleaned-up corpus (removed stopwords, numbers/punctuations, short words of length two) ## create the folder "cleancorpus" ## write out corpus as txt in a temporary folder "cleancorpus" ntotal=dim(corpus.dtm)[1] ## combined length of letters for (d in 1:ntotal) { writeCorpus(corpuscomb[d],path="C:/Johannes Ledolter/2020March01Book/Chapter6WEB/cleancorpus",paste(d,".txt", sep = "")) } ## reading the clean corpus from the temporary folder "cleancorpus" ## matching the sentiment and creating an output file that contains for each document (in row) ## creating the file out.file that contains the number and proportion of positive (negative) matches ## the number of positive and negative matches in out.file[,1] and out.file[,2] ## the proportion of postive matches among all sentiment-scored words (either positive or negative matches) in out.file[,5] ## the proportion of negative matches among all sentiment-scored words (either positive or negative matches) in outfile[,6] out.file = data.frame(pos = NA, neg = NA, pospercent = NA, negpercent = NA, posmatchpercent = NA, negmatchpercent = NA)[numeric(0),] for (i in 1:ntotal){ text_read = readLines(paste("C:/Johannes Ledolter/2020March01Book/Chapter6WEB/cleancorpus",paste("/", i, ".txt",sep=""),sep="")) text = as.character(text_read) text = unlist(strsplit(text," ")) text=tolower(text) posmatch=match(text,pos,nomatch=-999) nupos=sum(posmatch>0) negmatch=match(text,neg,nomatch=-999) nuneg=sum(negmatch>0) newrow = c(nupos,nuneg, 100*(nupos/length(text)), 100*(nuneg/length(text)), 100*(nupos/(nupos + nuneg)), 100*(nuneg/(nupos+nuneg))) out.file = rbind(out.file, newrow) } colnames(out.file) = c("positive matches", "negative matches", "pos% (length base)", "neg% (length base)", "pos% (match base)", "neg% (match base)") out.file[1:10,1:6] dim(out.file) out.file[1:10,1:6] dim(out.file) ## sentiment information is now attached to each document ## assessing word characteristics of the documents - whether or not they include certain key words ## counting up the number of occurrences for indian h1=labels(corpus.dtm)$Terms ind=h1=="indian" nuterms=row_sums(as.matrix(corpus.dtm[,ind])) indicator<-function(condition) ifelse(condition,1,0) ## indicator = 1 if indian is a word in the document occIndian=indicator(nuterms) table(occIndian) ## counting up the number of occurrences for land h1=labels(corpus.dtm)$Terms ind=h1=="land" nuterms=row_sums(as.matrix(corpus.dtm[,ind])) indicator<-function(condition) ifelse(condition,1,0) occLand=indicator(nuterms) table(occLand) ## counting up the number of occurrences for war h1=labels(corpus.dtm)$Terms ind=h1=="war" nuterms=row_sums(as.matrix(corpus.dtm[,ind])) indicator<-function(condition) ifelse(condition,1,0) occWar=indicator(nuterms) table(occWar) ## counting up the number of occurrences for treaty h1=labels(corpus.dtm)$Terms ind=h1=="treaty" nuterms=row_sums(as.matrix(corpus.dtm[,ind])) indicator<-function(condition) ifelse(condition,1,0) occTreaty=indicator(nuterms) table(occTreaty) ## counting up the number of occurrences for government h1=labels(corpus.dtm)$Terms ind=h1=="government" nuterms=row_sums(as.matrix(corpus.dtm[,ind])) indicator<-function(condition) ifelse(condition,1,0) occGovernment=indicator(nuterms) table(occGovernment) occAutGov=data$autr=="GOVERNOR" ## indicator if letter authored by governor table(occAutGov) occRecGov=data$recr=="GOVERNOR" ## indicator if letter received by governor table(occRecGov) occIndian occLand occWar occTreaty occGovernment occAutGov occRecGov ############################################################################################################################## ## METHOD 1 ################################################################################################################## ## defining a document as positive if its proportion of positive sentiment words is above the corpus average of proportions ## ## defining a document as negative if its proportion of positive sentiment words is below the corpus average of proportions ## ## investigate whether sentiment is changing over time (year) ################################################################ ## investigate whether sentiment is related to the occurrence of certain words ############################################### ############################################################################################################################## nletters=ntotal id=dim(nletters) for (i in 1:nletters) { id[i]=i } id percpos=out.file$"pos% (match base)" np=out.file$"positive matches" nn=out.file$"negative matches" nt=np+nn par(mfrow=c(1,1)) hist(percpos) ave=sum(np)/(sum(np)+sum(nn)) ave ## perhaps this is a better way to get the average ## 70 percent positive, on average idstime=as.numeric(ids4) idsdoc=as.numeric(ids1) b=data.frame(id,percpos,nt,idstime,idsdoc,occIndian,occLand,occWar,occTreaty,occGovernment,occAutGov,occRecGov) b[1:10,] index=nt>1 ## to get rid of na. Could use letters with nt > cutoff ## decided to use all letters with nt>1; proportions from groups with small nt won't become significant anyway b=b[index,] dim(b) b$indexpos=b$percpos-ave*100>0 b$indexneg=b$percpos-ave*100<0 b1=b b1[1:10,] table(b1$idstime,b1$indexpos) t50=tapply(b1$indexpos,b1$idstime,FUN=mean,na.rm=TRUE) t50 ## barplot(t50) t51=t50 t51[colSums(table(b1$indexpos,b1$idstime))<=9]=NA t51 par(mfrow=c(1,2)) barplot(t51,main="POSITIVE LETTERS",ylim=c(0,1),xlab="Year",ylab="Percentage of positive letters") ## probably best xx=as.numeric(levels(as.factor(b1$idstime))) yy=t51 pp=predict(loess(yy~xx),xx) plot(xx,yy,main="POSITIVE LETTERS",ylim=c(0,1),xlab="Year",ylab="Percentage of positive letters") lines(xx,pp) par(mfrow=c(1,1)) table(b1$idsdoc,b1$indexpos) t50=tapply(b1$indexpos,b1$idsdoc,FUN=mean,na.rm=TRUE) t50 ## barplot(t50) t51=t50 t51[colSums(table(b1$indexpos,b1$idsdoc))<=9]=NA t51 barplot(t51,main="POSITIVE LETTERS",ylim=c(0,1),xlab="Corpus",ylab="Percentage of positive letters") ## probably best ## relationship to content: indian, war, land, government, treaty ### relating the presence of term "indian" to positive sentiment tbl=table(b1$occIndian,b1$indexpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "land" to positive sentiment tbl=table(b1$occLand,b1$indexpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "war" to positive sentiment tbl=table(b1$occWar,b1$indexpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "treaty" to positive sentiment tbl=table(b1$occTreaty,b1$indexpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "government" to positive sentiment tbl=table(b1$occGovernment,b1$indexpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ## relationship whether written or received from Governor ### relating the letters authored/received by "Governor" to positive sentiment tbl=table(b1$occAutGov,b1$indexpos) tbl tbl[2,2]/(tbl[2,1]+tbl[2,2]) tbl=table(b1$occRecGov,b1$indexpos) tbl tbl[2,2]/(tbl[2,1]+tbl[2,2]) ############################################################################################################################## ## METHOD 2: FACTORING IN THE RELIABILITY OF THE ESTIMATE FOR POSITIVITY ##################################################### ## defining a document as positive if its proportion of positive sentiment is larger than the upper reliability bound ######## ## defining a document as negative if its proportion of positive sentiment is smaller than the lower reliability bound ####### ## this leaves a third group: neutral documents ############################################################################## ## investigate whether sentiment is changing over time (year) ################################################################ ## investigate whether sentiment is related to the occurrence of certain words ############################################### ############################################################################################################################## temp=1.28*sqrt(ave*(1-ave)/b$nt) ## 1.28 for 90 percent coverage b$indexSpos=(b$percpos-100*temp)-ave*100>0 b$indexSneg=(b$percpos+100*temp)-ave*100<0 b$indexSneutral=b$indexSpos==b$indexSneg b[1:10,] b1=b[order(b[,2],decreasing=TRUE),] ## letters with positive sentiment, significantly larger than average b1[b1$indexSpos,] dim(b1[b1$indexSpos,])[1] ## letters with negative sentiment, significantly smaller than average b1[b1$indexSneg,] dim(b1[b1$indexSneg,])[1] ## letters with neutral sentiment, not significantly different than average b1[b1$indexSneutral,] dim(b1[b1$indexSneutral,])[1] ## graphic display: start b1[,1]=b[,1] plot(b1[b1$indexSpos,1],b1[b1$indexSpos,2],type="p",col="red",xlim=c(1,nletters),ylim=c(0,100),ylab="Percent Positive",xlab="Letter ID", main="More positive than average (RED), Less positive than average (GREEN), Not different than average (Black)", sub="Blue Line: Percent Positive across all letters") ## color red for letters with sentiment significantly larger than average points(b1[b1$indexSneg,1],b1[b1$indexSneg,2],type="p",col="green") ## color green for letters with sentiment significantly smaller than average points(b1[b1$indexSneutral,1],b1[b1$indexSneutral,2],type="p",pch="*",col="black") ## color black for letters with sentiment not significantly different than average abline(ave*100,0,col="blue") ## add horizontal line at ave*100 ## graphic display: end table(b1$idstime,b1$indexSpos) t50=tapply(b1$indexSpos,b1$idstime,FUN=mean,na.rm=TRUE) t50 ## barplot(t50) t51=t50 t51[colSums(table(b1$indexSpos,b1$idstime))<=9]=NA t51 par(mfrow=c(1,2)) barplot(t51,main="SIG POSITIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig positive letters") ## probably best xx=as.numeric(levels(as.factor(b1$idstime))) yy=t51 pp=predict(loess(yy~xx),xx) plot(xx,yy,main="SIG POSITIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig positive letters") lines(xx,pp) table(b1$idstime,b1$indexSneg) t50=tapply(b1$indexSneg,b1$idstime,FUN=mean,na.rm=TRUE) t50 ## barplot(t50) t51=t50 t51[colSums(table(b1$indexSneg,b1$idstime))<=9]=NA t51 par(mfrow=c(1,2)) barplot(t51,main="SIG NEGATIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig negative letters") ## probably best xx=as.numeric(levels(as.factor(b1$idstime))) yy=t51 pp=predict(loess(yy~xx),xx) plot(xx,yy,main="SIG NEGATIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig negative letters") lines(xx,pp) table(b1$idsdoc,b1$indexSpos) t50=tapply(b1$indexSpos,b1$idsdoc,FUN=mean,na.rm=TRUE) t50 ## barplot(t50) t51=t50 t51[colSums(table(b1$indexSpos,b1$idsdoc))<=9]=NA t51 barplot(t51,main="SIG POSITIVE LETTERS",ylim=c(0,0.35),xlab="Corpus",ylab="Percentage of sig positive letters") ## probably best table(b1$idsdoc,b1$indexSneg) t50=tapply(b1$indexSneg,b1$idsdoc,FUN=mean,na.rm=TRUE) t50 ## barplot(t50) t51=t50 t51[colSums(table(b1$indexSneg,b1$idsdoc))<=9]=NA t51 barplot(t51,main="SIG NEGATIVE LETTERS",ylim=c(0,0.35),xlab="Corpus",ylab="Percentage of sig negative letters") ## probably best ## relationship to content: indian, war, land, government, treaty ### relating the presence of term "indian" to positive/negative sentiment tbl=table(b1$occIndian,b1$indexSpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) tbl=table(b1$occIndian,b1$indexSneg) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "land" to positive/negative sentiment tbl=table(b1$occLand,b1$indexSpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) tbl=table(b1$occLand,b1$indexSneg) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "war" to positive/negative sentiment tbl=table(b1$occWar,b1$indexSpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) tbl=table(b1$occWar,b1$indexSneg) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "treaty" to positive/negative sentiment tbl=table(b1$occTreaty,b1$indexSpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) tbl=table(b1$occTreaty,b1$indexSneg) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ### relating the presence of term "government" to positive/negative sentiment tbl=table(b1$occGovernment,b1$indexSpos) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) tbl=table(b1$occGovernment,b1$indexSneg) tbl tbl[1,2]/(tbl[1,1]+tbl[1,2]) tbl[2,2]/(tbl[2,1]+tbl[2,2]) chisq.test(tbl) ## relationship whether written or received from Governor ### relating the letters authored/received by "Governor" to positive sentiment tbl=table(b1$occAutGov,b1$indexSpos) tbl tbl[2,2]/(tbl[2,1]+tbl[2,2]) tbl=table(b1$occRecGov,b1$indexSpos) tbl tbl[2,2]/(tbl[2,1]+tbl[2,2]) ### relating the letters authored/received by "Governor" to negative sentiment tbl=table(b1$occAutGov,b1$indexSneg) tbl tbl[2,2]/(tbl[2,1]+tbl[2,2]) tbl=table(b1$occRecGov,b1$indexSneg) tbl tbl[2,2]/(tbl[2,1]+tbl[2,2]) ############################################################################################################################## ## METHOD 3: FACTORING IN THE RELIABILITY OF THE ESTIMATE FOR POSITIVITY ##################################################### ## VISUALIZING ALL THREE GROUPS: SigPos, SigNeg, NotSig ###################################################################### ############################################################################################################################## library(Rcmdr) comb=dim(length(b1$indexSpos)) for (i in 1:length(b1$indexSpos)) { if(b1$indexSpos[i]==TRUE) comb[i]=1 if(b1$indexSneutral[i]==TRUE) comb[i]=2 if(b1$indexSneg[i]==TRUE) comb[i]=3 } comb b1$indexcomb=factor(comb) ## relating to time par(mfrow=c(2,2)) t40=table(b1$idstime,b1$indexcomb) t40[colSums(table(b1$indexcomb,b1$idstime))<=9]=NA t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Year",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Year",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Year",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Year",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ## relating to volume par(mfrow=c(2,2)) t40=table(b1$idsdoc,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Corpus",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Corpus",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Corpus",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Corpus",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ## relating the presence of term "indian" to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occIndian,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Indian",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Indian",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Indian",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Indian",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ### relating the presence of term "land" to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occLand,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Land",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Land",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Land",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Land",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ### relating the presence of term "war" to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occWar,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="War",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="War",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="War",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="War",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ### relating the presence of term "treaty" to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occTreaty,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Treaty",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Treaty",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Treaty",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Treaty",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ### relating the presence of term "government" to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occGovernment,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Government",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Government",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Government",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Government",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ## relationship authorship Governor to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occAutGov,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Author=Governor",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Author=Governor",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Author=Governor",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Author=Governor",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue") ## relationship recipient Governor to positive/negative sentiment par(mfrow=c(2,2)) t40=table(b1$occRecGov,b1$indexcomb) t41=rowPercents(t40) barplot(t41[,1],ylim=c(0,25),xlab="Recipient=Governor",ylab="Percentage of reliably positive letters") barplot(t41[,3],ylim=c(0,25),xlab="Recipient=Governor",ylab="Percentage of reliably negative letters") barplot(t41[,2],ylim=c(0,100),xlab="Recipient=Governor",ylab="Percentage of neutral letters") barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Recipient=Governor",ylab="Ratio of reliable positive / reliably negative") abline(1,0,col="blue")