############################################################## #################### TOPIC MODELS STM: START ################# ##### ANALYSIS OF THE COMPLETE CORPUS OF 8423 DOCUMENTS ###### ############################################################## rm(list = ls()) library(tm) library(stm) library(slam) ## Need to set directory ## setwd('~/downloads/topicModel') #########build corpus##### data <- read.csv("C:\\Johannes Ledolter\\2020March01Book\\Chapter9WEB\\datacomb.csv", stringsAsFactors = F) ## data$text for text; data$yearc for year; data$indc for subcorp; data$autr for author; data$recr for recipient corpus <- VCorpus(VectorSource(data$text),readerControl = list(reader = readPlain)) ## this is how to create corpus corpus1 <- tm_map(corpus, stripWhitespace) corpus2 <- tm_map(corpus1, content_transformer(tolower)) corpus3 <- tm_map(corpus2, removePunctuation) corpus4 <- tm_map(corpus3, removeNumbers) corpus5 <- tm_map(corpus4, removeWords, stopwords("english")) corp.dtm <- DocumentTermMatrix(corpus5,control=list(stemming=FALSE)) ## no stemming as default corpStripped.dtm=removeSparseTerms(corp.dtm,0.99) corpStripped.dtm <- as.matrix(corpStripped.dtm) dim(corpStripped.dtm) ################transfer to STM processed <- readCorpus(corpStripped.dtm, type = 'dtm') reg <- model.matrix(~0 + as.factor(indc), data) colnames(reg) <- c('Northwest', 'Indiana', 'Orleans', 'LousianaMissouri', 'Illinois', 'Michigan', 'Wisconsin') year <- data$yearc metadata <- as.data.frame(cbind(reg, year)) out <- prepDocuments(processed$documents, processed$vocab, meta = metadata) docs <- out$documents vocab <- out$vocab ######build STM######## ## without meta variables results=stm(out$documents,out$vocab,K=5,seed = 1234) labelTopics(results,c(1,2,3,4,5), n=20) ########################################################################### ## further analysis of topic model results without meta: start ############ ########################################################################### results$theta ## the matrix of probabilitiespar(mfrow=c(5,1)) par(mfrow=c(1,3)) yy=results$theta[,1] xx=as.numeric(data$yearc) plot(yy~xx,main="Topic1",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic1",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic1",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,2)) yy=results$theta[,1] xx=as.numeric(data$yearc) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic1",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic1",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,3)) yy=results$theta[,2] xx=as.numeric(data$yearc) plot(yy~xx,main="Topic2",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic2",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic2",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,2)) yy=results$theta[,2] xx=as.numeric(data$yearc) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic2",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic2",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,3)) yy=results$theta[,3] xx=as.numeric(data$yearc) plot(yy~xx,main="Topic3",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic3",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic3",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,2)) yy=results$theta[,3] pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic3",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic3",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,3)) yy=results$theta[,4] xx=as.numeric(data$yearc) plot(yy~xx,main="Topic4",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic4",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic4",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,2)) yy=results$theta[,4] xx=as.numeric(data$yearc) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic4",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic4",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) par(mfrow=c(1,2)) yy=results$theta[,5] xx=as.numeric(data$yearc) pp=predict(loess(yy~xx),xx) plot(xx,pp,main="Topic5",xlab="Year",ylab="Topic Proportion",ylim=c(0,1)) barplot(tapply(yy,data$indc,FUN=mean,na.rm=TRUE),main="Topic5",xlab="Corpus",ylab="Topic Proportion",ylim=c(0,1)) ############################################################## #################### TOPIC MODELS STM: END ################### ##### ANALYSIS OF THE COMPLETE CORPUS OF 8423 DOCUMENTS ###### ##############################################################