### Trump's Tweets ###
### files are read from/written to the directory "C:\\Johannes Ledolter\\2020March01Book\\Chapter11"
rm(list = ls())
trumptweet <- readLines("C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\TrumpTweets.txt", encoding = "UTF-8")
## the input file is a text file encoded in UTF-8
length(trumptweet)
features <- strsplit(trumptweet, split = ",")
features[[1]] <- NULL                      # drop the first line of the file
source <- sapply(features, function(x) x[1])
text <- sapply(features, function(x) x[2])
time <- sapply(features, function(x) x[3])
retweet_count <- sapply(features, function(x) x[4])
favorite_count <- sapply(features, function(x) x[5])
is_retweet <- sapply(features, function(x) x[6])
id_str <- sapply(features, function(x) x[7])
tweet_df <- data.frame(source, text, time, retweet_count, favorite_count,
                       is_retweet, id_str, stringsAsFactors = FALSE)

### part 1: perform checks
# check the source
table(source)
class(tweet_df$source)
tweet_df$source <- factor(tweet_df$source)
levels(tweet_df$source)
# no problem found

# check the text
table(text)
which(nchar(text) == 0)
which(is.na(text))
class(tweet_df$text)
# no problem found

# check the time
nchar("01-08-2015 07:02:44")
which(nchar(time) != 19)
class(tweet_df$time)

# check retweet_count
which(nchar(tweet_df$retweet_count) <= 0)
tweet_df$retweet_count <- as.numeric(tweet_df$retweet_count)
# all values convert to numeric, no problem found

# check favorite_count
which(nchar(tweet_df$favorite_count) <= 0)
tweet_df$favorite_count <- as.numeric(tweet_df$favorite_count)
# all values convert to numeric, no problem found

# check is_retweet
table(is_retweet)
which(!is_retweet %in% c("false", "true"))
tweet_df[which(!is_retweet %in% c("false", "true")), ]
# 58 tweets carry no indicator of whether or not they are retweets
# check whether any of them are retweets
grep("RT", tweet_df$text[which(!is_retweet %in% c("false", "true"))])
# none of them are retweets, so replace the empty strings with "false"
tweet_df$is_retweet[which(!is_retweet %in% c("false", "true"))] <- "false"
tweet_df$is_retweet <- factor(tweet_df$is_retweet)
levels(tweet_df$is_retweet)
table(tweet_df$is_retweet)

# check id_str
nchar(id_str)
which(nchar(id_str) != 19)
length(which(nchar(id_str) != 19))
which(nchar(id_str) == 19)
length(which(nchar(id_str) == 19))
# no problem found

### part 2: cleaning special symbols
tweet_df$text[1:200]
tweet_df$text <- gsub("“", "", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("”", "", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("’", "'", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("—", " - ", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("&", " ", tweet_df$text, fixed = TRUE)
tweet_df$text[1:200]

### further cleaning: use the textclean package to replace emojis with text
### note that not all emojis may be covered
### the textclean package contains many other useful cleaning tools. experiment!!
library(textclean)
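
### a short, self-contained illustration (my addition, not part of the original workflow)
### of a few other textclean helpers; the demo string below is made up
demo <- "don't miss https://example.com &amp; more"
replace_contraction(demo)   # expands "don't" to "do not"
replace_url(demo)           # removes the http/https link
replace_html(demo)          # converts html markup such as &amp;
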
tweet_df$text <- replace_emoji(tweet_df$text)
tweet_df$text[1:200]
### the approach above may not cover all emojis
### you can supplement it by downloading a more recent dictionary of emojis,
### such as the one in the enclosed file emojis.csv, and then adding the following commands:
### library(DataCombine)
### emoji_dic <- read.csv("C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\emojis.csv", stringsAsFactors = FALSE, sep = ";")
### tweet_df <- FindReplace(data = tweet_df, Var = "text", replaceData = emoji_dic, from = "utf8", to = "EN", exact = FALSE, vector = FALSE)
### tweet_df$text[1:200]

### saving the seven cleaned vectors
source <- tweet_df$source
text <- tweet_df$text
time <- tweet_df$time
retweet_count <- tweet_df$retweet_count
favorite_count <- tweet_df$favorite_count
is_retweet <- tweet_df$is_retweet
id_str <- tweet_df$id_str
save(source, text, time, retweet_count, favorite_count, is_retweet, id_str,
     file = "C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\TrumpTweets.RData")
rm(list = ls())
load("C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\TrumpTweets.RData")

### part 3: analyses
library(tm)
## START: creating the corpus
corpus <- VCorpus(VectorSource(text), readerControl = list(reader = readPlain))   ## this is how to create a corpus
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))
corp.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))   ## no stemming (the default)
dim(corp.dtm)
findFreqTerms(corp.dtm, 100)
findFreqTerms(corp.dtm, 2000)

### if you want to omit all of TRUMP's retweets
table(is_retweet)
TRUMPtext <- text[is_retweet == "false"]
length(TRUMPtext)
corpus <- VCorpus(VectorSource(TRUMPtext), readerControl = list(reader = readPlain))   ## this is how to create a corpus
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))
corpTRUMP.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))   ## no stemming (the default)
dim(corpTRUMP.dtm)
findFreqTerms(corpTRUMP.dtm, 100)
findFreqTerms(corpTRUMP.dtm, 2000)

### if you want to bring in the variable time
### and stratify the analysis by year (or month / day); a short sketch follows below
date <- matrix(unlist(strsplit(time, split = " ", fixed = TRUE)), ncol = 2, byrow = TRUE)[, 1]
date
mm <- matrix(unlist(strsplit(date, split = "-", fixed = TRUE)), ncol = 3, byrow = TRUE)
month <- mm[, 1]
day <- mm[, 2]
year <- mm[, 3]
month
day
year
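
### a minimal sketch of one possible stratified analysis (my addition, not part of the
### original script); it reuses the vectors text and year created above and repeats the
### corpus / DTM steps for the tweets of a single year; the object names yr, corpusY,
### and corpY.dtm are mine
table(year)                               # number of tweets per year
yr <- names(which.max(table(year)))       # for illustration, take the year with the most tweets
corpusY <- VCorpus(VectorSource(text[year == yr]), readerControl = list(reader = readPlain))
corpusY <- tm_map(corpusY, stripWhitespace)
corpusY <- tm_map(corpusY, content_transformer(tolower))
corpusY <- tm_map(corpusY, removePunctuation)
corpusY <- tm_map(corpusY, removeNumbers)
corpusY <- tm_map(corpusY, removeWords, stopwords("english"))
corpY.dtm <- DocumentTermMatrix(corpusY, control = list(stemming = FALSE))
dim(corpY.dtm)
findFreqTerms(corpY.dtm, 100)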