###############################################################
########### SENTIMENT ANALYSIS: TERRITORIAL PAPERS ############
###############################################################

rm(list = ls())

library(stm)
library(tm)
library(slam)

###############################################################
######################### TERRITORIAL PAPERS ##################
################## Reading from file datacomb.csv #############
############ authors and recipients already cleaned up ########
############################################################### 

data <- read.csv("C:\\Johannes Ledolter\\2020March01Book\\Chapter6WEB\\datacomb.csv",stringsAsFactors=F)
dim(data)
data[1:10,1:5]
ids1=data$indc
ids2=data$autr
ids3=data$recr
ids4=data$yearc
ids1
length(ids1)
ids2
length(ids2)
ids3
length(ids3)
ids4
length(ids4)

corpus <- Corpus(VectorSource(data$text),readerControl = list(reader = readPlain)) ## this is how to create corpus 
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))	
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))

corpus.dtm <- DocumentTermMatrix(corpus5,control=list(stemming=FALSE))	## no stemming as default

dim(corpus.dtm)


####################################################################
####################### SENTIMENT ANALYSIS #########################
####################################################################

## loading the words with positive and negative sentiment

load("C:\\Johannes Ledolter\\2020March01Book\\Chapter6WEB\\SentimentWords.RData")
pos	## words with positive sentiment 
neg	## words with negative sentiment

corpuscomb=corpus5	## represents the cleaned-up corpus (removed stopwords, numbers/punctuations, short words of length two)

## create the folder "cleancorpus"
## write out corpus as txt in a temporary folder "cleancorpus"

ntotal=dim(corpus.dtm)[1]	## combined length of letters 
for (d in 1:ntotal) {
  writeCorpus(corpuscomb[d],path="C:/Johannes Ledolter/2020March01Book/Chapter6WEB/cleancorpus",paste(d,".txt", sep = ""))
}

## reading the clean corpus from the temporary folder "cleancorpus"
## matching the sentiment and creating an output file that contains for each document (in row) 
## creating the file out.file that contains the number and proportion of positive (negative) matches
## the number of positive and negative matches in out.file[,1] and out.file[,2]
## the proportion of postive matches among all sentiment-scored words (either positive or negative matches) in out.file[,5]
## the proportion of negative matches among all sentiment-scored words (either positive or negative matches) in outfile[,6]

out.file = data.frame(pos = NA, neg = NA, pospercent = NA, negpercent = NA, posmatchpercent = NA, negmatchpercent = NA)[numeric(0),]

for (i in 1:ntotal){
  text_read = readLines(paste("C:/Johannes Ledolter/2020March01Book/Chapter6WEB/cleancorpus",paste("/", i, ".txt",sep=""),sep=""))
  text = as.character(text_read)
  text = unlist(strsplit(text," "))
  text=tolower(text)
  
  posmatch=match(text,pos,nomatch=-999)
  nupos=sum(posmatch>0)
  
  negmatch=match(text,neg,nomatch=-999)
  nuneg=sum(negmatch>0)
  
  newrow = c(nupos,nuneg, 100*(nupos/length(text)), 100*(nuneg/length(text)), 100*(nupos/(nupos + nuneg)), 100*(nuneg/(nupos+nuneg)))
  
  out.file = rbind(out.file, newrow)
  
}

colnames(out.file) = c("positive matches", "negative matches", "pos% (length base)", "neg% (length base)", 
"pos% (match base)", "neg% (match base)")
out.file[1:10,1:6]
dim(out.file)

out.file[1:10,1:6]
dim(out.file)

## sentiment information is now attached to each document


## assessing word characteristics of the documents - whether or not they include certain key words

## counting up the number of occurrences for indian 
h1=labels(corpus.dtm)$Terms
ind=h1=="indian"
nuterms=row_sums(as.matrix(corpus.dtm[,ind]))
indicator<-function(condition) ifelse(condition,1,0)	## indicator = 1 if indian is a word in the document
occIndian=indicator(nuterms)
table(occIndian)

## counting up the number of occurrences for land 
h1=labels(corpus.dtm)$Terms
ind=h1=="land"
nuterms=row_sums(as.matrix(corpus.dtm[,ind]))
indicator<-function(condition) ifelse(condition,1,0)
occLand=indicator(nuterms)
table(occLand)

## counting up the number of occurrences for war 
h1=labels(corpus.dtm)$Terms
ind=h1=="war"
nuterms=row_sums(as.matrix(corpus.dtm[,ind]))
indicator<-function(condition) ifelse(condition,1,0)
occWar=indicator(nuterms)
table(occWar)

## counting up the number of occurrences for treaty 
h1=labels(corpus.dtm)$Terms
ind=h1=="treaty"
nuterms=row_sums(as.matrix(corpus.dtm[,ind]))
indicator<-function(condition) ifelse(condition,1,0)
occTreaty=indicator(nuterms)
table(occTreaty)

## counting up the number of occurrences for government 
h1=labels(corpus.dtm)$Terms
ind=h1=="government"
nuterms=row_sums(as.matrix(corpus.dtm[,ind]))
indicator<-function(condition) ifelse(condition,1,0)
occGovernment=indicator(nuterms)
table(occGovernment)

occAutGov=data$autr=="GOVERNOR"	## indicator if letter authored by governor
table(occAutGov)
occRecGov=data$recr=="GOVERNOR"	## indicator if letter received by governor
table(occRecGov)

occIndian
occLand
occWar
occTreaty
occGovernment
occAutGov
occRecGov


##############################################################################################################################
## METHOD 1 ##################################################################################################################
## defining a document as positive if its proportion of positive sentiment words is above the corpus average of proportions ##
## defining a document as negative if its proportion of positive sentiment words is below the corpus average of proportions ##
## investigate whether sentiment is changing over time (year) ################################################################
## investigate whether sentiment is related to the occurrence of certain words ###############################################
##############################################################################################################################


nletters=ntotal
id=dim(nletters)
for (i in 1:nletters) {
id[i]=i
}
id

percpos=out.file$"pos% (match base)"
np=out.file$"positive matches"
nn=out.file$"negative matches"
nt=np+nn
par(mfrow=c(1,1))
hist(percpos)
ave=sum(np)/(sum(np)+sum(nn))
ave	## perhaps this is a better way to get the average
## 70 percent positive, on average

idstime=as.numeric(ids4)
idsdoc=as.numeric(ids1)

b=data.frame(id,percpos,nt,idstime,idsdoc,occIndian,occLand,occWar,occTreaty,occGovernment,occAutGov,occRecGov)
b[1:10,]

index=nt>1	## to get rid of na. Could use letters with nt > cutoff
## decided to use all letters with nt>1; proportions from groups with small nt won't become significant anyway
b=b[index,]
dim(b)

b$indexpos=b$percpos-ave*100>0
b$indexneg=b$percpos-ave*100<0

b1=b
b1[1:10,]

table(b1$idstime,b1$indexpos)
t50=tapply(b1$indexpos,b1$idstime,FUN=mean,na.rm=TRUE)
t50
## barplot(t50)
t51=t50
t51[colSums(table(b1$indexpos,b1$idstime))<=9]=NA
t51
par(mfrow=c(1,2))
barplot(t51,main="POSITIVE LETTERS",ylim=c(0,1),xlab="Year",ylab="Percentage of positive letters")	## probably best

xx=as.numeric(levels(as.factor(b1$idstime)))
yy=t51
pp=predict(loess(yy~xx),xx)
plot(xx,yy,main="POSITIVE LETTERS",ylim=c(0,1),xlab="Year",ylab="Percentage of positive letters")
lines(xx,pp)

par(mfrow=c(1,1))
table(b1$idsdoc,b1$indexpos)
t50=tapply(b1$indexpos,b1$idsdoc,FUN=mean,na.rm=TRUE)
t50
## barplot(t50)
t51=t50
t51[colSums(table(b1$indexpos,b1$idsdoc))<=9]=NA
t51
barplot(t51,main="POSITIVE LETTERS",ylim=c(0,1),xlab="Corpus",ylab="Percentage of positive letters")	## probably best


## relationship to content: indian, war, land, government, treaty

### relating the presence of term "indian" to positive sentiment
tbl=table(b1$occIndian,b1$indexpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

### relating the presence of term "land" to positive sentiment
tbl=table(b1$occLand,b1$indexpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

### relating the presence of term "war" to positive sentiment
tbl=table(b1$occWar,b1$indexpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

### relating the presence of term "treaty" to positive sentiment
tbl=table(b1$occTreaty,b1$indexpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

### relating the presence of term "government" to positive sentiment
tbl=table(b1$occGovernment,b1$indexpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

## relationship whether written or received from Governor

### relating the letters authored/received by "Governor" to positive sentiment
tbl=table(b1$occAutGov,b1$indexpos)
tbl
tbl[2,2]/(tbl[2,1]+tbl[2,2])
tbl=table(b1$occRecGov,b1$indexpos)
tbl
tbl[2,2]/(tbl[2,1]+tbl[2,2])



##############################################################################################################################
## METHOD 2: FACTORING IN THE RELIABILITY OF THE ESTIMATE FOR POSITIVITY #####################################################
## defining a document as positive if its proportion of positive sentiment is larger than the upper reliability bound ########
## defining a document as negative if its proportion of positive sentiment is smaller than the lower reliability bound #######
## this leaves a third group: neutral documents ############################################################################## 
## investigate whether sentiment is changing over time (year) ################################################################
## investigate whether sentiment is related to the occurrence of certain words ###############################################
##############################################################################################################################


temp=1.28*sqrt(ave*(1-ave)/b$nt)	## 1.28 for 90 percent coverage
b$indexSpos=(b$percpos-100*temp)-ave*100>0
b$indexSneg=(b$percpos+100*temp)-ave*100<0
b$indexSneutral=b$indexSpos==b$indexSneg
b[1:10,]

b1=b[order(b[,2],decreasing=TRUE),]

## letters with positive sentiment, significantly larger than average
b1[b1$indexSpos,]
dim(b1[b1$indexSpos,])[1]
## letters with negative sentiment, significantly smaller than average
b1[b1$indexSneg,]
dim(b1[b1$indexSneg,])[1]
## letters with neutral sentiment, not significantly different than average
b1[b1$indexSneutral,]
dim(b1[b1$indexSneutral,])[1]

## graphic display: start
b1[,1]=b[,1]
plot(b1[b1$indexSpos,1],b1[b1$indexSpos,2],type="p",col="red",xlim=c(1,nletters),ylim=c(0,100),ylab="Percent Positive",xlab="Letter ID",
main="More positive than average (RED), Less positive than average (GREEN), Not different than average (Black)",
sub="Blue Line: Percent Positive across all letters")						
## color red for letters with sentiment significantly larger than average
points(b1[b1$indexSneg,1],b1[b1$indexSneg,2],type="p",col="green")   	
## color green for letters with sentiment significantly smaller than average
points(b1[b1$indexSneutral,1],b1[b1$indexSneutral,2],type="p",pch="*",col="black") 									
## color black for letters with sentiment not significantly different than average
abline(ave*100,0,col="blue")   						
## add horizontal line at ave*100 
## graphic display: end


table(b1$idstime,b1$indexSpos)
t50=tapply(b1$indexSpos,b1$idstime,FUN=mean,na.rm=TRUE)
t50
## barplot(t50)
t51=t50
t51[colSums(table(b1$indexSpos,b1$idstime))<=9]=NA
t51
par(mfrow=c(1,2))
barplot(t51,main="SIG POSITIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig positive letters")		## probably best

xx=as.numeric(levels(as.factor(b1$idstime)))
yy=t51
pp=predict(loess(yy~xx),xx)
plot(xx,yy,main="SIG POSITIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig positive letters")
lines(xx,pp)

table(b1$idstime,b1$indexSneg)
t50=tapply(b1$indexSneg,b1$idstime,FUN=mean,na.rm=TRUE)
t50
## barplot(t50)
t51=t50
t51[colSums(table(b1$indexSneg,b1$idstime))<=9]=NA
t51
par(mfrow=c(1,2))
barplot(t51,main="SIG NEGATIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig negative letters")	## probably best

xx=as.numeric(levels(as.factor(b1$idstime)))
yy=t51
pp=predict(loess(yy~xx),xx)
plot(xx,yy,main="SIG NEGATIVE LETTERS",ylim=c(0,0.35),xlab="Year",ylab="Percentage of sig negative letters")
lines(xx,pp)

table(b1$idsdoc,b1$indexSpos)
t50=tapply(b1$indexSpos,b1$idsdoc,FUN=mean,na.rm=TRUE)
t50
## barplot(t50)
t51=t50
t51[colSums(table(b1$indexSpos,b1$idsdoc))<=9]=NA
t51
barplot(t51,main="SIG POSITIVE LETTERS",ylim=c(0,0.35),xlab="Corpus",ylab="Percentage of sig positive letters")	## probably best

table(b1$idsdoc,b1$indexSneg)
t50=tapply(b1$indexSneg,b1$idsdoc,FUN=mean,na.rm=TRUE)
t50
## barplot(t50)
t51=t50
t51[colSums(table(b1$indexSneg,b1$idsdoc))<=9]=NA
t51
barplot(t51,main="SIG NEGATIVE LETTERS",ylim=c(0,0.35),xlab="Corpus",ylab="Percentage of sig negative letters")	## probably best


## relationship to content: indian, war, land, government, treaty

### relating the presence of term "indian" to positive/negative sentiment
tbl=table(b1$occIndian,b1$indexSpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

tbl=table(b1$occIndian,b1$indexSneg)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl) 


### relating the presence of term "land" to positive/negative sentiment
tbl=table(b1$occLand,b1$indexSpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

tbl=table(b1$occLand,b1$indexSneg)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl) 

### relating the presence of term "war" to positive/negative sentiment
tbl=table(b1$occWar,b1$indexSpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

tbl=table(b1$occWar,b1$indexSneg)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl) 

### relating the presence of term "treaty" to positive/negative sentiment
tbl=table(b1$occTreaty,b1$indexSpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

tbl=table(b1$occTreaty,b1$indexSneg)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl) 

### relating the presence of term "government" to positive/negative sentiment
tbl=table(b1$occGovernment,b1$indexSpos)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl)

tbl=table(b1$occGovernment,b1$indexSneg)
tbl
tbl[1,2]/(tbl[1,1]+tbl[1,2])
tbl[2,2]/(tbl[2,1]+tbl[2,2])
chisq.test(tbl) 

## relationship whether written or received from Governor

### relating the letters authored/received by "Governor" to positive sentiment
tbl=table(b1$occAutGov,b1$indexSpos)
tbl
tbl[2,2]/(tbl[2,1]+tbl[2,2])
tbl=table(b1$occRecGov,b1$indexSpos)
tbl
tbl[2,2]/(tbl[2,1]+tbl[2,2])

### relating the letters authored/received by "Governor" to negative sentiment
tbl=table(b1$occAutGov,b1$indexSneg)
tbl
tbl[2,2]/(tbl[2,1]+tbl[2,2])
tbl=table(b1$occRecGov,b1$indexSneg)
tbl
tbl[2,2]/(tbl[2,1]+tbl[2,2])



##############################################################################################################################
## METHOD 3: FACTORING IN THE RELIABILITY OF THE ESTIMATE FOR POSITIVITY #####################################################
## VISUALIZING ALL THREE GROUPS: SigPos, SigNeg, NotSig ######################################################################
##############################################################################################################################

library(Rcmdr)

comb=dim(length(b1$indexSpos))
for (i in 1:length(b1$indexSpos)) {
if(b1$indexSpos[i]==TRUE) comb[i]=1
if(b1$indexSneutral[i]==TRUE) comb[i]=2
if(b1$indexSneg[i]==TRUE) comb[i]=3
}
comb
b1$indexcomb=factor(comb)

## relating to time

par(mfrow=c(2,2))
t40=table(b1$idstime,b1$indexcomb)
t40[colSums(table(b1$indexcomb,b1$idstime))<=9]=NA
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Year",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Year",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Year",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Year",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   						

## relating to volume

par(mfrow=c(2,2))
t40=table(b1$idsdoc,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Corpus",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Corpus",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Corpus",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Corpus",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   						

## relating the presence of term "indian" to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occIndian,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Indian",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Indian",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Indian",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Indian",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   						

### relating the presence of term "land" to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occLand,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Land",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Land",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Land",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Land",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   						

### relating the presence of term "war" to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occWar,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="War",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="War",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="War",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="War",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   						

### relating the presence of term "treaty" to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occTreaty,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Treaty",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Treaty",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Treaty",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Treaty",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   						

### relating the presence of term "government" to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occGovernment,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Government",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Government",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Government",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Government",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   
		
## relationship authorship Governor to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occAutGov,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Author=Governor",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Author=Governor",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Author=Governor",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Author=Governor",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")   
			
## relationship recipient Governor to positive/negative sentiment

par(mfrow=c(2,2))
t40=table(b1$occRecGov,b1$indexcomb)
t41=rowPercents(t40)
barplot(t41[,1],ylim=c(0,25),xlab="Recipient=Governor",ylab="Percentage of reliably positive letters")	
barplot(t41[,3],ylim=c(0,25),xlab="Recipient=Governor",ylab="Percentage of reliably negative letters")
barplot(t41[,2],ylim=c(0,100),xlab="Recipient=Governor",ylab="Percentage of neutral letters")	
barplot(t41[,1]/t41[,3],ylim=c(0,5),xlab="Recipient=Governor",ylab="Ratio of reliable positive / reliably negative")	
abline(1,0,col="blue")