############################################################################
########################## TOPIC MODELS STM: START #########################
####  CORRESPONDENCE BETWEEN GOVERNORS AND CENTER OF POWER: 958 LETTERS ####
############################################################################

## NOTE: the former `rm(list = ls())` call was removed on purpose. Every object
## used below is (re)assigned by this script, so clearing the workspace is not
## needed, and doing so destroys unrelated objects when the file is source()d.

library(tm)
library(stm)
library(slam)

## Location of the combined letter data set.
data_path <- "C:\\Johannes Ledolter\\2020March01Book\\Chapter9WEB\\datacomb.csv"
data <- read.csv(data_path, stringsAsFactors = FALSE)

## Fail early, with a clear message, if a column used below is missing.
stopifnot(all(c("text", "autr", "recr") %in% names(data)))

## data$text for text; data$yearc for year; data$indc for subcorp; data$autr for author; data$recr for recipient

## Build a tm corpus with one document per element of data$text.
corpus <- VCorpus(
  VectorSource(data$text),
  readerControl = list(reader = readPlain)
)

## Text clean-up, applied in this order: collapse whitespace, lower-case,
## strip punctuation, strip digits, drop English stop words.
## NOTE(review): punctuation is stripped before the stop-word pass, so stop
## words that contain an apostrophe presumably no longer match -- confirm
## whether that matters for this corpus before changing the order.
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))

## Document-term matrix on the cleaned corpus; stemming is switched off.
corp.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))

## Maximal allowed term sparsity; reused later for the selected sub-corpus.
stripcoef <- 0.99
corpStripped.dtm <- removeSparseTerms(corp.dtm, sparse = stripcoef)
dim(corpStripped.dtm)

############################################################################
######################### SELECTION OF LETTERS  ############################
############################################################################

## select letters where governor is the author and president and secretaries are the recipients
## select letters where governor is the recipient and president and secretaries are the authors

data$autr
data$recr
length(data$autr)
length(data$recr)

## Role indicators for author (inda*) and recipient (indr*).
## %in% is used instead of == so that a missing author/recipient yields FALSE
## rather than NA; an NA here would propagate into the sums below and into the
## row index used to subset the document-term matrix.
inda1 <- data$autr %in% "GOVERNOR"
inda2 <- data$autr %in% "SECRETARY OF WAR"
inda3 <- data$autr %in% "SECRETARY OF TREASURY"
inda4 <- data$autr %in% "SECRETARY OF STATE"
inda5 <- data$autr %in% "PRESIDENT"
indr1 <- data$recr %in% "GOVERNOR"
indr2 <- data$recr %in% "SECRETARY OF WAR"
indr3 <- data$recr %in% "SECRETARY OF TREASURY"
indr4 <- data$recr %in% "SECRETARY OF STATE"
indr5 <- data$recr %in% "PRESIDENT"

## A letter has a single author and a single recipient value, so a sum of 2
## means both conditions hold:
##   indpart1 == 2: governor wrote to president/secretary
##   indpart2 == 2: president/secretary wrote to governor
indpart1 <- inda1 + indr2 + indr3 + indr4 + indr5
indpart2 <- inda2 + inda3 + inda4 + inda5 + indr1
table(indpart1)
table(indpart2)

## indcomb == 1 marks every letter exchanged in either direction.
indcomb <- (indpart1 == 2) + (indpart2 == 2)
table(indcomb)

############################################################################
## STM WITH META VARIABLES #################################################
## FROM governor TO president/secretaries of state, treasury and war #######
## FROM president/secretaries of state, treasury and war TO governor #######
############################################################################

## Restrict the document-term matrix to the selected letters and drop sparse
## terms again, using the same sparsity threshold as for the full corpus.
corpStripped3.dtm <- removeSparseTerms(corp.dtm[indcomb == 1, ], stripcoef)
dim(corpStripped3.dtm)

## Direction of each selected letter:
## group (0) is FROM governor TO president and secretaries
## group (1) is FROM president and secretaries TO governor
group <- as.numeric(indpart2 == 2)
group <- group[indcomb == 1]
sum(group)
length(group)

## Label the levels explicitly by value. Renaming levels by position would
## mislabel the data if one of the two directions were absent.
group <- factor(
  group,
  levels = c(0, 1),
  labels = c("Governor to Center", "Center to Governor")
)
levels(group)

## Convert the document-term matrix to stm's input format.
## Documents left without any term after sparse-term removal are dropped here
## explicitly, so the covariate can be subset with exactly the same mask.
nonempty <- row_sums(corpStripped3.dtm) > 0
processed <- readCorpus(corpStripped3.dtm[nonempty, ], type = "slam")
meta <- data.frame(group = group[nonempty])
stopifnot(length(processed$documents) == nrow(meta))

## prepDocuments() may remove further documents (those that lose all their
## words when infrequent terms are pruned). Passing `meta` lets it drop the
## matching covariate rows as well.
out <- prepDocuments(processed$documents, processed$vocab, meta = meta)
docs <- out$documents
vocab <- out$vocab

## Keep the global `group` in sync with the documents actually modelled, since
## the formulas used later (~group) look it up in the workspace.
group <- out$meta$group
stopifnot(length(group) == length(out$documents))

## Structural topic model with 5 topics; topic prevalence is allowed to vary
## with the direction of the correspondence (`group`). Fixed seed for
## reproducibility.
results <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 5,
  prevalence = ~group,
  seed = 1234
)

## Top 10 words for each of the 5 topics, then the topic-proportion summary plot.
labelTopics(results, topics = 1:5, n = 10)
plot(results, main = "Topic Proportions", xlab = "   ")

## Inspect the mu and beta components of the fitted model.
results$mu
results$beta

## Regress topic prevalence on the meta variable `group` (50 simulations);
## the summary reports the significance of the meta variable per topic.
est <- estimateEffect(formula = ~group, stmobj = results, nsims = 50)
summary(est)

## Single plotting panel for the two effect plots below.
par(mfrow = c(1, 1))

## Estimated topic prevalence by group.
plot(
  est,
  covariate = "group",
  main = "Effects of Meta Variable on Topic Prevalence",
  xlim = c(-0.05, 0.5)
)

## Difference in topic prevalence between the two directions of correspondence.
plot(
  est,
  covariate = "group",
  model = results,
  method = "difference",
  cov.value1 = "Governor to Center",
  cov.value2 = "Center to Governor",
  topics = 1:5,
  xlim = c(-0.5, 0.3),
  main = "Differences in Topic Prevalence for Reversed Authorship"
)

############################################################################
########################### TOPIC MODELS STM: END ##########################
####  CORRESPONDENCE BETWEEN GOVERNORS AND CENTER OF POWER: 958 LETTERS ####
############################################################################