############################################################################
########################## TOPIC MODELS STM: START #########################
#### CORRESPONDENCE BETWEEN GOVERNORS AND CENTER OF POWER: 958 LETTERS ####
############################################################################
## Fits a 5-topic structural topic model (stm) to the letters exchanged
## between governors and the center of power (president and the secretaries
## of war, treasury and state), with the direction of the correspondence as
## a prevalence covariate.
##
## NOTE: the script no longer calls rm(list = ls()); run it in a fresh R
## session instead of wiping the caller's workspace.

library(tm)
library(stm)
library(slam)

## Location of the combined letter data; adjust to the local copy.
data_path <- "C:\\Johannes Ledolter\\2020March01Book\\Chapter9WEB\\datacomb.csv"
data <- read.csv(data_path, stringsAsFactors = FALSE)
## data$text for text; data$yearc for year; data$indc for subcorp;
## data$autr for author; data$recr for recipient

## Build the corpus and apply the standard cleaning steps.
corpus <- VCorpus(
  VectorSource(data$text),
  readerControl = list(reader = readPlain)
)
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))

## Document-term matrix without stemming.
corp.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))

## Sparsity threshold handed to removeSparseTerms().
stripcoef <- 0.99
corpStripped.dtm <- removeSparseTerms(corp.dtm, stripcoef)
dim(corpStripped.dtm)

############################################################################
######################### SELECTION OF LETTERS ############################
############################################################################
## select letters where governor is the author and president and
## secretaries are the recipients
## select letters where governor is the recipient and president and
## secretaries are the authors

data$autr
data$recr
length(data$autr)
length(data$recr)

## Offices that make up the "center of power".
center_offices <- c(
  "SECRETARY OF WAR",
  "SECRETARY OF TREASURY",
  "SECRETARY OF STATE",
  "PRESIDENT"
)

## %in% (rather than ==) so that a missing author/recipient counts as
## "no match" instead of propagating NA into the row selection below.
author_is_governor <- data$autr %in% "GOVERNOR"
author_is_center <- data$autr %in% center_offices
recipient_is_governor <- data$recr %in% "GOVERNOR"
recipient_is_center <- data$recr %in% center_offices

## A value of 2 means both conditions hold for that letter:
## indpart1 == 2: governor -> center; indpart2 == 2: center -> governor.
indpart1 <- author_is_governor + recipient_is_center
indpart2 <- author_is_center + recipient_is_governor
table(indpart1)
table(indpart2)

## 1 for letters in either direction, 0 otherwise.
indcomb <- (indpart1 == 2) + (indpart2 == 2)
table(indcomb)

############################################################################
## STM WITH META VARIABLES #################################################
## FROM governor TO president/secretaries of state, treasury and war #######
## FROM president/secretaries of state, treasury and war TO governor #######
############################################################################

corpStripped3.dtm <- removeSparseTerms(corp.dtm[indcomb == 1, ], stripcoef)
dim(corpStripped3.dtm)

## group (0) is FROM governor TO president and secretaries
## group (1) is FROM president and secretaries TO governor
group <- as.numeric(indpart2 == 2)[indcomb == 1]
sum(group)
length(group)
## Explicit levels/labels keep the labelling correct even if one direction
## happens to be absent from the selection.
group <- factor(
  group,
  levels = c(0, 1),
  labels = c("Governor to Center", "Center to Governor")
)
levels(group)

## Letters left without any term after sparse-term removal are dropped
## here, together with their covariate value, so documents and metadata
## stay aligned no matter how readCorpus() treats empty rows.
has_terms <- slam::row_sums(corpStripped3.dtm) > 0
meta <- data.frame(group = group[has_terms])

processed <- readCorpus(corpStripped3.dtm[has_terms, ], type = "slam")
stopifnot(length(processed$documents) == nrow(meta))

## prepDocuments() may remove further documents; passing `meta` lets it
## remove the matching metadata rows (returned as out$meta).
out <- prepDocuments(processed$documents, processed$vocab, meta = meta)
docs <- out$documents
vocab <- out$vocab

## with meta variables
results <- stm(
  docs,
  vocab,
  K = 5,
  prevalence = ~group,
  data = out$meta,
  seed = 1234
)
labelTopics(results, c(1, 2, 3, 4, 5), n = 10)
plot(results, main = "Topic Proportions", xlab = " ")

## Assessing significance of meta variables
results$mu
results$beta
est <- estimateEffect(~group, results, metadata = out$meta, nsims = 50)
summary(est) ## assesses significance of meta variables

par(mfrow = c(1, 1))
plot(
  est,
  "group",
  main = "Effects of Meta Variable on Topic Prevalence",
  xlim = c(-0.05, 0.5)
)
plot(
  est,
  covariate = "group",
  cov.value1 = "Governor to Center",
  cov.value2 = "Center to Governor",
  topics = c(1, 2, 3, 4, 5),
  model = results,
  method = "difference",
  xlim = c(-0.5, 0.3),
  main = "Differences in Topic Prevalence for Reversed Authorship"
)

############################################################################
########################### TOPIC MODELS STM: END ##########################
#### CORRESPONDENCE BETWEEN GOVERNORS AND CENTER OF POWER: 958 LETTERS ####
############################################################################