Chapter 1 R CODE: ProgramTest.docx

1.3 Introductory example and its analysis using the R statistical software

# Start from a clean slate: delete every object that ls() reports in the
# current environment. Loaded packages and options() are NOT reset by this.
rm(list = ls()) # clean the global environment

Reading in the data

We read the data from the directory C:\Johannes Ledolter\2020March01Book\Chapter1WEB

# Read the speeches into the data frame 'data'.
#   header = FALSE            the first line of the file is data, not column names
#   stringsAsFactors = FALSE  keep the text as character strings; written out in
#                             full because `F` is an ordinary variable that can
#                             be reassigned, whereas FALSE is a reserved word
data <- read.csv(
  'C:\\Johannes Ledolter\\2020March01Book\\Chapter1WEB\\test.csv',
  header = FALSE,
  stringsAsFactors = FALSE
)

Cleaning the data

dim(data) # dimension of the data set

## [1] 4 1

data[1:4,1] # view all speeches

## [1] "Mr. STEVENS. The gentleman cannot have anything to explain. As he has not spoken there is nothing to explain, nothing to patch up."                                                                                                                                                                                                                                                                                                                                
## [2] "Mr. BROOKS. I do not see the relevancy of that remark. I desire to know, Mr. Clerk, first, whether I can yield the floor temporarily to the gentleman from Pennsylvania for the purpose of explanation; or second, whether I can yield the remaining portion of my time to him."                                                                                                                                                                                   
## [3] "The CLERK. Under the rules of the House the gentleman from New York cannot yield the remaining portion of his time to any member if objection is made to his doing so; nor can he yield to any other member, except for purposes of personal explanation in relation to the pending proposition. The gentleman from Pennsylvania has stated that he did not rise for the purpose of explanation. Hence by his own statement he is precluded from taking the floor."
## [4] "Mr. JOHNSON. I do not desire the floor for the purpose of personal explanation, but I desire to as an explanation of the gentleman from New York."

dim(data)[1]    # the first value of dim(data), which is the number of speeches, the data frame has 4 rows

## [1] 4

# Clean all speeches in one vectorised pass (tolower() and gsub() operate on
# whole character vectors, so no row-by-row loop is needed). This also behaves
# correctly for a data frame with zero rows, where `1:dim(data)[1]` would have
# iterated over c(1, 0).
speeches <- tolower(data[, 1])                     # transform all text to lower case
speeches <- gsub("[.,;]", "", speeches)            # delete '.', ',' and ';'
speeches <- gsub("new york", "new-york", speeches) # keep the state name as one token
data[, 1] <- speeches                              # write the cleaned text back into 'data'

data[1:4,1]  # check new data frame 'data'

## [1] "mr stevens the gentleman cannot have anything to explain as he has not spoken there is nothing to explain nothing to patch up"                                                                                                                                                                                                                                                                                                                               
## [2] "mr brooks i do not see the relevancy of that remark i desire to know mr clerk first whether i can yield the floor temporarily to the gentleman from pennsylvania for the purpose of explanation or second whether i can yield the remaining portion of my time to him"                                                                                                                                                                                       
## [3] "the clerk under the rules of the house the gentleman from new-york cannot yield the remaining portion of his time to any member if objection is made to his doing so nor can he yield to any other member except for purposes of personal explanation in relation to the pending proposition the gentleman from pennsylvania has stated that he did not rise for the purpose of explanation hence by his own statement he is precluded from taking the floor"
## [4] "mr johnson i do not desire the floor for the purpose of personal explanation but i desire to as an explanation of the gentleman from new-york"

omit meta variables from the text

The speaker is stored in meta2, and the length of each speech (excluding the two-word speaker label) is stored in len. This works only if there are no missing values in meta2.

# Split every speech into words, peel off the two-word speaker label, and
# record the label and the remaining word count.
#
# Results (one element per speech):
#   len    number of words left after removing the speaker label (never < 0)
#   meta1  first word of the speech  (mr/the)
#   meta2  second word of the speech (stevens/brooks/clerk/johnson)
#   data   column 1 is overwritten with the speech minus its first two words
#
# A speech with fewer than two words yields NA in meta1 and/or meta2 and an
# empty string in 'data', rather than the literal token "NA".
tokens <- strsplit(data[, 1], " ")                    # split each speech on single spaces

len   <- pmax(lengths(tokens) - 2, 0)                 # word count excluding the speaker label
meta1 <- vapply(tokens, function(w) w[1], character(1))
meta2 <- vapply(tokens, function(w) w[2], character(1))

# Negative indexing drops the first two words; paste() joins the remainder
# with single spaces, so no toString()/comma-removal round trip is required.
data[, 1] <- vapply(
  tokens,
  function(w) paste(w[-(1:2)], collapse = " "),
  character(1)
)

data[1:4,1] # check speeches after excluding speaker names

## [1] "the gentleman cannot have anything to explain as he has not spoken there is nothing to explain nothing to patch up"                                                                                                                                                                                                                                                                                                                                
## [2] "i do not see the relevancy of that remark i desire to know mr clerk first whether i can yield the floor temporarily to the gentleman from pennsylvania for the purpose of explanation or second whether i can yield the remaining portion of my time to him"                                                                                                                                                                                       
## [3] "under the rules of the house the gentleman from new-york cannot yield the remaining portion of his time to any member if objection is made to his doing so nor can he yield to any other member except for purposes of personal explanation in relation to the pending proposition the gentleman from pennsylvania has stated that he did not rise for the purpose of explanation hence by his own statement he is precluded from taking the floor"
## [4] "i do not desire the floor for the purpose of personal explanation but i desire to as an explanation of the gentleman from new-york"

len             # variable 'len' is the measure of how many words are in each speech after excluding speaker names

## [1] 21 47 77 24

hist(len)       # histogram of variable 'len'

boxplot(len)    # boxplot of variable 'len'

quantile(len)   # quantile of variable 'len'

##    0%   25%   50%   75%  100% 
## 21.00 23.25 35.50 54.50 77.00

meta2           # values in variable 'meta2'

## [1] "stevens" "brooks"  "clerk"   "johnson"

creating corpus

use the ‘tm’ package to create the corpus

#install.packages("tm")
library(tm)

## Loading required package: NLP

# Build the corpus: every element of the speech column becomes one plain-text
# document.
corpus <- VCorpus(
  VectorSource(data[, 1]),
  readerControl = list(reader = readPlain)
)

# Clean-up chain. Each step is stored under its own name so that any
# intermediate corpus can be inspected afterwards.
corpus1 <- tm_map(corpus, stripWhitespace)                  # remove extra whitespace
corpus2 <- tm_map(corpus1, content_transformer(tolower))    # transform to lower case
corpus3 <- tm_map(corpus2, removePunctuation)               # remove punctuation
corpus4 <- tm_map(corpus3, removeNumbers)                   # remove numbers
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))  # remove English stopwords

Document-term matrix without stemming

# Documents in rows, terms in columns; stemming is switched off explicitly
# (no stemming is also the default).
corp.dtm <- DocumentTermMatrix(
  corpus5,
  control = list(stemming = FALSE)
)
corp.dtm

## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

Term-document matrix without stemming

# Same counts as the document-term matrix, with terms in rows and documents
# in columns; no stemming.
corp.tdm <- TermDocumentMatrix(
  corpus5,
  control = list(stemming = FALSE)
)
corp.tdm

## <<TermDocumentMatrix (terms: 43, documents: 4)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

Document-term matrix with stemming

# Document-term matrix built from stemmed terms, so that inflected forms of
# a word are counted together.
corps.dtm <- DocumentTermMatrix(
  corpus5,
  control = list(stemming = TRUE)
)
corps.dtm

## <<DocumentTermMatrix (documents: 4, terms: 42)>>
## Non-/sparse entries: 60/108
## Sparsity           : 64%
## Maximal term length: 12
## Weighting          : term frequency (tf)

findFreqTerms(corp.dtm,1)

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "purposes"     "relation"     "relevancy"    "remaining"   
## [31] "remark"       "rise"         "rules"        "second"       "see"         
## [36] "spoken"       "stated"       "statement"    "taking"       "temporarily" 
## [41] "time"         "whether"      "yield"

The matrix ‘corps.dtm’ was built with stemming; here, for example, "purpose" and "purposes" are combined into the single stem "purpos".

findFreqTerms(corps.dtm,1)

##  [1] "anyth"        "can"          "clerk"        "desir"        "except"      
##  [6] "explain"      "explan"       "first"        "floor"        "gentleman"   
## [11] "henc"         "hous"         "know"         "made"         "member"      
## [16] "newyork"      "noth"         "object"       "patch"        "pend"        
## [21] "pennsylvania" "person"       "portion"      "preclud"      "proposit"    
## [26] "purpos"       "relat"        "relev"        "remain"       "remark"      
## [31] "rise"         "rule"         "second"       "see"          "spoken"      
## [36] "state"        "statement"    "take"         "temporarili"  "time"        
## [41] "whether"      "yield"

stopwords

stopwords("english")

##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"

adding your own stopwords

# Extend the built-in English stopword list with the extra word "occasionally".
stopwordsnew1 <- c(
  stopwords("english"),
  "occasionally"
)
stopwordsnew1

##   [1] "i"            "me"           "my"           "myself"       "we"          
##   [6] "our"          "ours"         "ourselves"    "you"          "your"        
##  [11] "yours"        "yourself"     "yourselves"   "he"           "him"         
##  [16] "his"          "himself"      "she"          "her"          "hers"        
##  [21] "herself"      "it"           "its"          "itself"       "they"        
##  [26] "them"         "their"        "theirs"       "themselves"   "what"        
##  [31] "which"        "who"          "whom"         "this"         "that"        
##  [36] "these"        "those"        "am"           "is"           "are"         
##  [41] "was"          "were"         "be"           "been"         "being"       
##  [46] "have"         "has"          "had"          "having"       "do"          
##  [51] "does"         "did"          "doing"        "would"        "should"      
##  [56] "could"        "ought"        "i'm"          "you're"       "he's"        
##  [61] "she's"        "it's"         "we're"        "they're"      "i've"        
##  [66] "you've"       "we've"        "they've"      "i'd"          "you'd"       
##  [71] "he'd"         "she'd"        "we'd"         "they'd"       "i'll"        
##  [76] "you'll"       "he'll"        "she'll"       "we'll"        "they'll"     
##  [81] "isn't"        "aren't"       "wasn't"       "weren't"      "hasn't"      
##  [86] "haven't"      "hadn't"       "doesn't"      "don't"        "didn't"      
##  [91] "won't"        "wouldn't"     "shan't"       "shouldn't"    "can't"       
##  [96] "cannot"       "couldn't"     "mustn't"      "let's"        "that's"      
## [101] "who's"        "what's"       "here's"       "there's"      "when's"      
## [106] "where's"      "why's"        "how's"        "a"            "an"          
## [111] "the"          "and"          "but"          "if"           "or"          
## [116] "because"      "as"           "until"        "while"        "of"          
## [121] "at"           "by"           "for"          "with"         "about"       
## [126] "against"      "between"      "into"         "through"      "during"      
## [131] "before"       "after"        "above"        "below"        "to"          
## [136] "from"         "up"           "down"         "in"           "out"         
## [141] "on"           "off"          "over"         "under"        "again"       
## [146] "further"      "then"         "once"         "here"         "there"       
## [151] "when"         "where"        "why"          "how"          "all"         
## [156] "any"          "both"         "each"         "few"          "more"        
## [161] "most"         "other"        "some"         "such"         "no"          
## [166] "nor"          "not"          "only"         "own"          "same"        
## [171] "so"           "than"         "too"          "very"         "occasionally"

# A stopword list made up of only two user-chosen words.
stopwordsnew2 <- c("perhaps", "never")
stopwordsnew2

## [1] "perhaps" "never"

frequencies of words

#install.packages("ggplot2")
library(ggplot2)                  # load 'ggplot2' package

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

dim(corp.dtm)

## [1]  4 43

as.matrix(corp.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
##    1            0        0       0         0           0       0        0
##    2            1        0       1         0           0       1        0
##    3            1        1       1         1           1       1        1
##    4            0        1       0         0           0       1        0
##     Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
##    1        0         0         0      0    0     0      0   0      1      0
##    2        0         1         1      1    0     0      1   1      0      0
##    3        1         0         1      0    1     1      0   0      0      1
##    4        0         0         0      0    0     0      0   0      0      0
##     Terms
## Docs statement taking temporarily time whether yield
##    1         0      0           0    0       0     0
##    2         0      0           1    1       2     2
##    3         1      1           0    1       0     2
##    4         0      0           0    0       0     0

findFreqTerms(corp.dtm,1)

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "purposes"     "relation"     "relevancy"    "remaining"   
## [31] "remark"       "rise"         "rules"        "second"       "see"         
## [36] "spoken"       "stated"       "statement"    "taking"       "temporarily" 
## [41] "time"         "whether"      "yield"

findFreqTerms(corp.dtm,2)

##  [1] "can"          "desire"       "explain"      "explanation"  "floor"       
##  [6] "gentleman"    "member"       "newyork"      "nothing"      "pennsylvania"
## [11] "personal"     "portion"      "purpose"      "remaining"    "time"        
## [16] "whether"      "yield"

# Total count of every term over all documents (column sums of the matrix).
freq <- colSums(as.matrix(corp.dtm))

# Permutation that sorts the terms by frequency; order() sorts increasingly
# by default.
ord <- order(freq)

# The six least frequent terms together with their counts.
freq[head(ord)]

## anything    clerk   except    first    hence    house 
##        1        1        1        1        1        1

freq[tail(ord)]                       # last 6  value from the variable 'ord'

##      desire       floor     purpose       yield explanation   gentleman 
##           3           3           3           4           5           5

# Recompute the term totals and store them from most to least frequent.
freq <- sort(
  colSums(as.matrix(corp.dtm)),
  decreasing = TRUE
)
head(freq, 20)   # the 20 most frequent terms

##  explanation    gentleman        yield          can       desire        floor 
##            5            5            4            3            3            3 
##      purpose      explain       member      newyork      nothing pennsylvania 
##            3            2            2            2            2            2 
##     personal      portion    remaining         time      whether     anything 
##            2            2            2            2            2            1 
##        clerk       except 
##            1            1

# Two-column data frame (term, count) in the shape ggplot expects.
wf <- data.frame(
  word = names(freq),
  freq = freq
)
head(wf)

displaying frequencies

# Bar chart of the words that occur more than twice in 'wf'.
p <- ggplot(subset(wf, freq > 2), aes(word, freq)) +
  geom_bar(stat = "identity") +                                # bar height = the value in 'freq'
  theme(axis.text.x = element_text(angle = 45, hjust = 1))     # x labels at 45 degrees, right-justified
p

displaying word clouds

#install.packages("wordcloud")
library(wordcloud)                        # load 'wordcloud' package

## Loading required package: RColorBrewer

# Fix the random seed before drawing so the cloud can be reproduced
# (presumably the word placement is randomised; see the wordcloud docs).
set.seed(142)
wordcloud(
  names(freq),
  freq,
  min.freq = 1   # include every word that occurs at least once
)

## Warning in wordcloud(names(freq), freq, min.freq = 1): explanation could not be
## fit on page. It will not be plotted.

# Coloured cloud restricted to at most 7 words; rot.per = 0.2 and the
# six-colour "Dark2" palette are passed straight through to wordcloud().
dark2 <- brewer.pal(6, "Dark2")
set.seed(142)
wordcloud(
  names(freq),
  freq,
  max.words = 7,
  rot.per = 0.2,
  colors = dark2
)

finding associations

as.matrix(corp.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
##    1            0        0       0         0           0       0        0
##    2            1        0       1         0           0       1        0
##    3            1        1       1         1           1       1        1
##    4            0        1       0         0           0       1        0
##     Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
##    1        0         0         0      0    0     0      0   0      1      0
##    2        0         1         1      1    0     0      1   1      0      0
##    3        1         0         1      0    1     1      0   0      0      1
##    4        0         0         0      0    0     0      0   0      0      0
##     Terms
## Docs statement taking temporarily time whether yield
##    1         0      0           0    0       0     0
##    2         0      0           1    1       2     2
##    3         1      1           0    1       0     2
##    4         0      0           0    0       0     0

findAssocs(corp.dtm, "explanation", 0.5)

## $explanation
##     newyork    personal       floor     purpose      except   gentleman 
##        0.90        0.90        0.87        0.87        0.52        0.52 
##       hence       house        made      member   objection     pending 
##        0.52        0.52        0.52        0.52        0.52        0.52 
##   precluded proposition    purposes    relation        rise       rules 
##        0.52        0.52        0.52        0.52        0.52        0.52 
##      stated   statement      taking 
##        0.52        0.52        0.52

findAssocs(corp.dtm, "gentleman", 0.5)

## $gentleman
##       except        hence        house         made       member    objection 
##         1.00         1.00         1.00         1.00         1.00         1.00 
##      pending    precluded  proposition     purposes     relation         rise 
##         1.00         1.00         1.00         1.00         1.00         1.00 
##        rules       stated    statement       taking      newyork pennsylvania 
##         1.00         1.00         1.00         1.00         0.58         0.58 
##     personal      portion    remaining         time        yield  explanation 
##         0.58         0.58         0.58         0.58         0.58         0.52

weightBin creates indicator variables for presence of term

# Binary weighting: each entry becomes 1 if the term occurs in the document
# and 0 otherwise.
Bcorp.dtm <- weightBin(corp.dtm)
as.matrix(Bcorp.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       1           0     0     0         1
##    2        0   1     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           1     0     1         1
##    4        0   0     0      1      0       0           1     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       1         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      1       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
##    1            0        0       0         0           0       0        0
##    2            1        0       1         0           0       1        0
##    3            1        1       1         1           1       1        1
##    4            0        1       0         0           0       1        0
##     Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
##    1        0         0         0      0    0     0      0   0      1      0
##    2        0         1         1      1    0     0      1   1      0      0
##    3        1         0         1      0    1     1      0   0      0      1
##    4        0         0         0      0    0     0      0   0      0      0
##     Terms
## Docs statement taking temporarily time whether yield
##    1         0      0           0    0       0     0
##    2         0      0           1    1       1     1
##    3         1      1           0    1       0     1
##    4         0      0           0    0       0     0

findAssocs(Bcorp.dtm, "explanation", 0.5)

## $explanation
##        floor      purpose          can       desire      newyork pennsylvania 
##         1.00         1.00         0.58         0.58         0.58         0.58 
##     personal      portion    remaining         time        yield 
##         0.58         0.58         0.58         0.58         0.58

findAssocs(Bcorp.dtm, "gentleman", 0.5)

## $gentleman
## numeric(0)

sequence plotting

helpful as it shows whether certain terms occur together

# Count of "gentleman" in each document (one column of the dense matrix).
vecg <- as.matrix(corp.dtm)[, "gentleman"]
vecg

## 1 2 3 4 
## 1 1 2 1

# Count of "explanation" in each document (one column of the dense matrix).
vece <- as.matrix(corp.dtm)[, "explanation"]
vece

## 1 2 3 4 
## 0 1 2 2

# Draw both term-frequency sequences on one panel so they can be compared.
par(mfrow = c(1, 1))
plot(
  vecg,
  type = "l",
  lwd = 7,
  xlab = "document",
  ylab = "frequency",
  ylim = c(0, max(c(vecg, vece)))   # y axis must be tall enough for both series
)
lines(vece, type = "l", col = 10, lwd = 3)   # overlay "explanation" as a thinner line

bigrams

# Tokenizer that turns a document into its bigrams.
#   x        a document whose words can be extracted with words()
#   returns  an unnamed vector with one "word1 word2" string for every pair
#            of adjacent words
BigramTokenizer <- function(x) {
  word_pairs <- ngrams(words(x), 2)
  unlist(lapply(word_pairs, paste, collapse = " "), use.names = FALSE)
}

# Document-term matrix whose "terms" are the bigrams produced above.
bi.dtm <- DocumentTermMatrix(
  corpus5,
  control = list(tokenize = BigramTokenizer)
)
bi.dtm

## <<DocumentTermMatrix (documents: 4, terms: 59)>>
## Non-/sparse entries: 67/169
## Sparsity           : 72%
## Maximal term length: 22
## Weighting          : term frequency (tf)

as.matrix(bi.dtm)

##     Terms
## Docs anything explain can yield clerk first desire explanation desire floor
##    1                1         0           0                  0            0
##    2                0         2           1                  0            0
##    3                0         1           0                  0            0
##    4                0         0           0                  1            1
##     Terms
## Docs desire know except purposes explain nothing explain spoken
##    1           0               0               1              1
##    2           1               0               0              0
##    3           0               1               0              0
##    4           0               0               0              0
##     Terms
## Docs explanation desire explanation gentleman explanation hence
##    1                  0                     0                 0
##    2                  0                     0                 0
##    3                  0                     0                 1
##    4                  1                     1                 0
##     Terms
## Docs explanation relation explanation second first whether floor purpose
##    1                    0                  0             0             0
##    2                    0                  1             1             0
##    3                    1                  0             0             0
##    4                    0                  0             0             1
##     Terms
## Docs floor temporarily gentleman anything gentleman newyork
##    1                 0                  1                 0
##    2                 1                  0                 0
##    3                 0                  0                 1
##    4                 0                  0                 1
##     Terms
## Docs gentleman pennsylvania hence statement house gentleman know mr made can
##    1                      0               0               0       0        0
##    2                      1               0               0       1        0
##    3                      1               1               1       0        1
##    4                      0               0               0       0        0
##     Terms
## Docs member except member objection mr clerk newyork yield nothing explain
##    1             0                0        0             0               1
##    2             0                0        1             0               0
##    3             1                1        0             1               0
##    4             0                0        0             0               0
##     Terms
## Docs nothing patch objection made pending proposition pennsylvania purpose
##    1             1              0                   0                    0
##    2             0              0                   0                    1
##    3             0              1                   1                    0
##    4             0              0                   0                    0
##     Terms
## Docs pennsylvania stated personal explanation portion time precluded taking
##    1                   0                    0            0                0
##    2                   0                    0            1                0
##    3                   1                    1            1                1
##    4                   0                    1            0                0
##     Terms
## Docs proposition gentleman purpose explanation purpose personal
##    1                     0                   0                0
##    2                     0                   1                0
##    3                     1                   1                0
##    4                     0                   0                1
##     Terms
## Docs purposes personal relation pending relevancy remark remaining portion
##    1                 0                0                0                 0
##    2                 0                0                1                 1
##    3                 1                1                0                 1
##    4                 0                0                0                 0
##     Terms
## Docs remark desire rise purpose rules house second whether see relevancy
##    1             0            0           0              0             0
##    2             1            0           0              1             1
##    3             0            1           1              0             0
##    4             0            0           0              0             0
##     Terms
## Docs spoken nothing stated rise statement precluded taking floor
##    1              1           0                   0            0
##    2              0           0                   0            0
##    3              0           1                   1            1
##    4              0           0                   0            0
##     Terms
## Docs temporarily gentleman time member whether can yield floor yield member
##    1                     0           0           0           0            0
##    2                     1           0           2           1            0
##    3                     0           1           0           0            1
##    4                     0           0           0           0            0
##     Terms
## Docs yield remaining
##    1               0
##    2               1
##    3               1
##    4               0

# Term-document matrix of bigrams: terms in rows, documents in columns.
# The custom tokenizer is passed through the `control` list.
bi.tdm <- TermDocumentMatrix(
  corpus5,
  control = list(tokenize = BigramTokenizer)
)
bi.tdm # print the summary (dimensions, sparsity, weighting)

## <<TermDocumentMatrix (terms: 59, documents: 4)>>
## Non-/sparse entries: 67/169
## Sparsity           : 72%
## Maximal term length: 22
## Weighting          : term frequency (tf)

as.matrix(bi.tdm) # dense view: one row per bigram, one column per document

##                         Docs
## Terms                    1 2 3 4
##   anything explain       1 0 0 0
##   can yield              0 2 1 0
##   clerk first            0 1 0 0
##   desire explanation     0 0 0 1
##   desire floor           0 0 0 1
##   desire know            0 1 0 0
##   except purposes        0 0 1 0
##   explain nothing        1 0 0 0
##   explain spoken         1 0 0 0
##   explanation desire     0 0 0 1
##   explanation gentleman  0 0 0 1
##   explanation hence      0 0 1 0
##   explanation relation   0 0 1 0
##   explanation second     0 1 0 0
##   first whether          0 1 0 0
##   floor purpose          0 0 0 1
##   floor temporarily      0 1 0 0
##   gentleman anything     1 0 0 0
##   gentleman newyork      0 0 1 1
##   gentleman pennsylvania 0 1 1 0
##   hence statement        0 0 1 0
##   house gentleman        0 0 1 0
##   know mr                0 1 0 0
##   made can               0 0 1 0
##   member except          0 0 1 0
##   member objection       0 0 1 0
##   mr clerk               0 1 0 0
##   newyork yield          0 0 1 0
##   nothing explain        1 0 0 0
##   nothing patch          1 0 0 0
##   objection made         0 0 1 0
##   pending proposition    0 0 1 0
##   pennsylvania purpose   0 1 0 0
##   pennsylvania stated    0 0 1 0
##   personal explanation   0 0 1 1
##   portion time           0 1 1 0
##   precluded taking       0 0 1 0
##   proposition gentleman  0 0 1 0
##   purpose explanation    0 1 1 0
##   purpose personal       0 0 0 1
##   purposes personal      0 0 1 0
##   relation pending       0 0 1 0
##   relevancy remark       0 1 0 0
##   remaining portion      0 1 1 0
##   remark desire          0 1 0 0
##   rise purpose           0 0 1 0
##   rules house            0 0 1 0
##   second whether         0 1 0 0
##   see relevancy          0 1 0 0
##   spoken nothing         1 0 0 0
##   stated rise            0 0 1 0
##   statement precluded    0 0 1 0
##   taking floor           0 0 1 0
##   temporarily gentleman  0 1 0 0
##   time member            0 0 1 0
##   whether can            0 2 0 0
##   yield floor            0 1 0 0
##   yield member           0 0 1 0
##   yield remaining        0 1 1 0

displaying bigram frequencies

findFreqTerms(bi.dtm, lowfreq = 1) # bigrams that occur at least once

##  [1] "anything explain"       "can yield"              "clerk first"           
##  [4] "desire explanation"     "desire floor"           "desire know"           
##  [7] "except purposes"        "explain nothing"        "explain spoken"        
## [10] "explanation desire"     "explanation gentleman"  "explanation hence"     
## [13] "explanation relation"   "explanation second"     "first whether"         
## [16] "floor purpose"          "floor temporarily"      "gentleman anything"    
## [19] "gentleman newyork"      "gentleman pennsylvania" "hence statement"       
## [22] "house gentleman"        "know mr"                "made can"              
## [25] "member except"          "member objection"       "mr clerk"              
## [28] "newyork yield"          "nothing explain"        "nothing patch"         
## [31] "objection made"         "pending proposition"    "pennsylvania purpose"  
## [34] "pennsylvania stated"    "personal explanation"   "portion time"          
## [37] "precluded taking"       "proposition gentleman"  "purpose explanation"   
## [40] "purpose personal"       "purposes personal"      "relation pending"      
## [43] "relevancy remark"       "remaining portion"      "remark desire"         
## [46] "rise purpose"           "rules house"            "second whether"        
## [49] "see relevancy"          "spoken nothing"         "stated rise"           
## [52] "statement precluded"    "taking floor"           "temporarily gentleman" 
## [55] "time member"            "whether can"            "yield floor"           
## [58] "yield member"           "yield remaining"

findFreqTerms(bi.dtm, lowfreq = 2) # bigrams that occur at least twice

## [1] "can yield"              "gentleman newyork"      "gentleman pennsylvania"
## [4] "personal explanation"   "portion time"           "purpose explanation"   
## [7] "remaining portion"      "whether can"            "yield remaining"

# Total count of every bigram over all documents, plus the permutation
# that arranges those counts in ascending order.
freq <- colSums(as.matrix(bi.dtm))
ord <- order(freq)
head(freq[ord]) # the six least frequent bigrams

##   anything explain        clerk first desire explanation       desire floor 
##                  1                  1                  1                  1 
##        desire know    except purposes 
##                  1                  1

tail(freq[ord]) # the six most frequent bigrams (ascending order)

##        portion time purpose explanation   remaining portion         whether can 
##                   2                   2                   2                   2 
##     yield remaining           can yield 
##                   2                   3

# Recompute the bigram totals and keep them sorted from most to least frequent.
freq <- colSums(as.matrix(bi.dtm))
freq <- sort(freq, decreasing = TRUE)
head(freq, n = 20) # top 20 bigrams

##              can yield      gentleman newyork gentleman pennsylvania 
##                      3                      2                      2 
##   personal explanation           portion time    purpose explanation 
##                      2                      2                      2 
##      remaining portion            whether can        yield remaining 
##                      2                      2                      2 
##       anything explain            clerk first     desire explanation 
##                      1                      1                      1 
##           desire floor            desire know        except purposes 
##                      1                      1                      1 
##        explain nothing         explain spoken     explanation desire 
##                      1                      1                      1 
##  explanation gentleman      explanation hence 
##                      1                      1

# Data frame with one row per bigram: its text (`word`) and its count (`freq`).
wf <- data.frame(word = names(freq), freq = freq)
head(wf)

# Bar chart of the bigrams that occur more than once; the x-axis labels are
# rotated by 45 degrees and right-justified.
p <- ggplot(subset(wf, freq > 1), aes(word, freq)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
p

R Notebook

Chapter 1 R CODE: ProgramTest.docx

1.3 Introductory example, and its analysis using the R statistical Software

Reading in the data

Cleaning the data

omit meta variables from the text

creating corpus

stopwords

adding your own stopwords

frequencies of words

frequencies of words

displaying frequencies

displaying word clouds

finding associations

weightBin creates indicator variables for presence of term

sequence plotting

bigrams

bigrams

displaying bigram frequencies