### Trump's Tweets ###
### files are read from/written to the directory "C:\\Johannes Ledolter\\2020March01Book\\Chapter11"
rm(list = ls())
trumptweet <- readLines("C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\TrumpTweets.txt", encoding = "UTF-8")
## the input file is a text file encoded in UTF-8
length(trumptweet)
features <- strsplit(trumptweet, split = ",")
features[[1]] <- NULL                      # drop the first line of the file
source <- sapply(features, function(x) x[1])
text <- sapply(features, function(x) x[2])
time <- sapply(features, function(x) x[3])
retweet_count <- sapply(features, function(x) x[4])
favorite_count <- sapply(features, function(x) x[5])
is_retweet <- sapply(features, function(x) x[6])
id_str <- sapply(features, function(x) x[7])
tweet_df <- data.frame(source, text, time, retweet_count, favorite_count,
                       is_retweet, id_str, stringsAsFactors = FALSE)

### part 1: perform checks
# check the source
table(source)
class(tweet_df$source)
tweet_df$source <- factor(tweet_df$source)
levels(tweet_df$source)
# no problem found

# check the text
table(text)
which(nchar(text) == 0)
which(is.na(text))
class(tweet_df$text)
# no problem found

# check the time
nchar("01-08-2015 07:02:44")
which(nchar(time) != 19)
class(tweet_df$time)

# check retweet_count
which(nchar(tweet_df$retweet_count) <= 0)
tweet_df$retweet_count <- as.numeric(tweet_df$retweet_count)
# all values convert to numeric, no problem found

# check favorite_count
which(nchar(tweet_df$favorite_count) <= 0)
tweet_df$favorite_count <- as.numeric(tweet_df$favorite_count)
# all values convert to numeric, no problem found

# check is_retweet
table(is_retweet)
which(!is_retweet %in% c("false", "true"))
tweet_df[which(!is_retweet %in% c("false", "true")), ]
# 58 tweets carry no indicator of whether or not they are retweets
# check whether any of them are retweets
grep("RT", tweet_df$text[which(!is_retweet %in% c("false", "true"))])
# none of them are retweets, so replace the empty strings with "false"
tweet_df$is_retweet[which(!is_retweet %in% c("false", "true"))] <- "false"
tweet_df$is_retweet <- factor(tweet_df$is_retweet)
levels(tweet_df$is_retweet)
table(tweet_df$is_retweet)

# check id_str
nchar(id_str)
which(nchar(id_str) != 19)
length(which(nchar(id_str) != 19))
which(nchar(id_str) == 19)
length(which(nchar(id_str) == 19))
# no problem found

### part 2: cleaning special symbols
tweet_df$text[1:200]
tweet_df$text <- gsub("“", "", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("”", "", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("’", "'", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("—", " - ", tweet_df$text, fixed = TRUE)
tweet_df$text <- gsub("&", " ", tweet_df$text, fixed = TRUE)
tweet_df$text[1:200]

### further cleaning: use the textclean package to replace emojis with text
### note that not all emojis may be covered
### the textclean package contains many other useful cleaning tools. experiment!!
library(textclean)
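
### a short, self-contained illustration (my addition, not part of the original workflow)
### of a few other textclean helpers; the demo string below is made up
demo <- "don't miss https://example.com &amp; more"
replace_contraction(demo)   # expands "don't" to "do not"
replace_url(demo)           # removes the http/https link
replace_html(demo)          # converts html markup such as &amp;
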
tweet_df$text <- replace_emoji(tweet_df$text)
tweet_df$text[1:200]
### the approach above may not cover all emojis
### you can supplement it by downloading a more recent dictionary of emojis,
### such as the one in the enclosed file emojis.csv, and then adding the following commands:
### library(DataCombine)
### emoji_dic <- read.csv("C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\emojis.csv", stringsAsFactors = FALSE, sep = ";")
### tweet_df <- FindReplace(data = tweet_df, Var = "text", replaceData = emoji_dic, from = "utf8", to = "EN", exact = FALSE, vector = FALSE)
### tweet_df$text[1:200]

### saving the seven cleaned vectors
source <- tweet_df$source
text <- tweet_df$text
time <- tweet_df$time
retweet_count <- tweet_df$retweet_count
favorite_count <- tweet_df$favorite_count
is_retweet <- tweet_df$is_retweet
id_str <- tweet_df$id_str
save(source, text, time, retweet_count, favorite_count, is_retweet, id_str,
     file = "C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\TrumpTweets.RData")
rm(list = ls())
load("C:\\Johannes Ledolter\\2020March01Book\\Chapter11\\TrumpTweets.RData")

### part 3: analyses
library(tm)
## START: creating the corpus
corpus <- VCorpus(VectorSource(text), readerControl = list(reader = readPlain))   ## this is how to create a corpus
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))
corp.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))   ## no stemming (the default)
dim(corp.dtm)
findFreqTerms(corp.dtm, 100)
findFreqTerms(corp.dtm, 2000)

### if you want to omit all of TRUMP's retweets
table(is_retweet)
TRUMPtext <- text[is_retweet == "false"]
length(TRUMPtext)
corpus <- VCorpus(VectorSource(TRUMPtext), readerControl = list(reader = readPlain))   ## this is how to create a corpus
corpus1 <- tm_map(corpus, stripWhitespace)
corpus2 <- tm_map(corpus1, content_transformer(tolower))
corpus3 <- tm_map(corpus2, removePunctuation)
corpus4 <- tm_map(corpus3, removeNumbers)
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))
corpTRUMP.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))   ## no stemming (the default)
dim(corpTRUMP.dtm)
findFreqTerms(corpTRUMP.dtm, 100)
findFreqTerms(corpTRUMP.dtm, 2000)

### if you want to bring in the variable time
### and stratify the analysis by year (or month / day); a short sketch follows below
date <- matrix(unlist(strsplit(time, split = " ", fixed = TRUE)), ncol = 2, byrow = TRUE)[, 1]
date
mm <- matrix(unlist(strsplit(date, split = "-", fixed = TRUE)), ncol = 3, byrow = TRUE)
month <- mm[, 1]
day <- mm[, 2]
year <- mm[, 3]
month
day
year
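
### a minimal sketch of one possible stratified analysis (my addition, not part of the
### original script); it reuses the vectors text and year created above and repeats the
### corpus / DTM steps for the tweets of a single year; the object names yr, corpusY,
### and corpY.dtm are mine
table(year)                               # number of tweets per year
yr <- names(which.max(table(year)))       # for illustration, take the year with the most tweets
corpusY <- VCorpus(VectorSource(text[year == yr]), readerControl = list(reader = readPlain))
corpusY <- tm_map(corpusY, stripWhitespace)
corpusY <- tm_map(corpusY, content_transformer(tolower))
corpusY <- tm_map(corpusY, removePunctuation)
corpusY <- tm_map(corpusY, removeNumbers)
corpusY <- tm_map(corpusY, removeWords, stopwords("english"))
corpY.dtm <- DocumentTermMatrix(corpusY, control = list(stemming = FALSE))
dim(corpY.dtm)
findFreqTerms(corpY.dtm, 100)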