rm(list = ls()) # clean the global environment
We read the data from the directory C:\Johannes Ledolter\2020March01Book\Chapter1WEB
data <- read.csv('C:\\Johannes Ledolter\\2020March01Book\\Chapter1WEB\\test.csv', header=FALSE, stringsAsFactors=F)
dim(data) # dimension of the data set
## [1] 4 1
data[1:4,1] # view all speeches
## [1] "Mr. STEVENS. The gentleman cannot have anything to explain. As he has not spoken there is nothing to explain, nothing to patch up."
## [2] "Mr. BROOKS. I do not see the relevancy of that remark. I desire to know, Mr. Clerk, first, whether I can yield the floor temporarily to the gentleman from Pennsylvania for the purpose of explanation; or second, whether I can yield the remaining portion of my time to him."
## [3] "The CLERK. Under the rules of the House the gentleman from New York cannot yield the remaining portion of his time to any member if objection is made to his doing so; nor can he yield to any other member, except for purposes of personal explanation in relation to the pending proposition. The gentleman from Pennsylvania has stated that he did not rise for the purpose of explanation. Hence by his own statement he is precluded from taking the floor."
## [4] "Mr. JOHNSON. I do not desire the floor for the purpose of personal explanation, but I desire to as an explanation of the gentleman from New York."
dim(data)[1] # the first element of dim(data) is the number of rows of the data frame, i.e. the number of speeches (here 4)
## [1] 4
for (i in 1:dim(data)[1]) { # i runs over the speeches, 1 to 4
txt=data[i,1] # assign each speech to variable 'txt'
txt=tolower(txt) # transform all text to lower case
txt=gsub("[.]","", ignore.case = TRUE,txt) # delete .
txt=gsub("[,]","", ignore.case = TRUE,txt) # delete ,
txt=gsub("[;]","", ignore.case = TRUE,txt) # delete ;
txt=gsub("new york","new-york", ignore.case = TRUE,txt) # the state of new-york
data[i,1]=txt # assign cleaned variable 'txt' back into the data frame 'data'
}
data[1:4,1] # check new data frame 'data'
## [1] "mr stevens the gentleman cannot have anything to explain as he has not spoken there is nothing to explain nothing to patch up"
## [2] "mr brooks i do not see the relevancy of that remark i desire to know mr clerk first whether i can yield the floor temporarily to the gentleman from pennsylvania for the purpose of explanation or second whether i can yield the remaining portion of my time to him"
## [3] "the clerk under the rules of the house the gentleman from new-york cannot yield the remaining portion of his time to any member if objection is made to his doing so nor can he yield to any other member except for purposes of personal explanation in relation to the pending proposition the gentleman from pennsylvania has stated that he did not rise for the purpose of explanation hence by his own statement he is precluded from taking the floor"
## [4] "mr johnson i do not desire the floor for the purpose of personal explanation but i desire to as an explanation of the gentleman from new-york"
Strip the speaker name from each speech: store the title word (mr/the) in meta1 and the speaker name in meta2, and determine the length of each speech. This works only if there are no missing values.
# create new variables, preparation for the for loop below
len=rep(NA,dim(data)[1]) # word count of each speech
meta1=rep(NA,dim(data)[1]) # title word of each speech (mr/the)
meta2=rep(NA,dim(data)[1]) # speaker name of each speech
for (i in 1:dim(data)[1]) {
txt=data[i,1] # assign each speech to variable 'txt'
temp=strsplit(txt, " ")[[1]] # split 'txt' on spaces; [[1]] extracts the resulting word vector
len[i]=length(temp)-2 # word count of the speech, excluding the two speaker-name words
meta1[i]=temp[1] # assign 1st word in 'temp' (mr/the) to variable meta1
meta2[i]=temp[2] # assign 2nd word in 'temp' (stevens/brooks/clerk/johnson) to variable meta2
tempr=rep(NA,len[i]) # container for the words of the speech without the speaker name
for (j in 1:len[i]) {
tempr[j]=temp[j+2] # assign each word in 'temp' excluding the speaker name into 'tempr'
}
data[i,1]=toString(tempr) # combine words back into one string (toString joins with ", ")
data[i,1]=gsub("[,]","",data[i,1]) # remove the commas that toString inserted
}
data[1:4,1] # check speeches after excluding speaker names
## [1] "the gentleman cannot have anything to explain as he has not spoken there is nothing to explain nothing to patch up"
## [2] "i do not see the relevancy of that remark i desire to know mr clerk first whether i can yield the floor temporarily to the gentleman from pennsylvania for the purpose of explanation or second whether i can yield the remaining portion of my time to him"
## [3] "under the rules of the house the gentleman from new-york cannot yield the remaining portion of his time to any member if objection is made to his doing so nor can he yield to any other member except for purposes of personal explanation in relation to the pending proposition the gentleman from pennsylvania has stated that he did not rise for the purpose of explanation hence by his own statement he is precluded from taking the floor"
## [4] "i do not desire the floor for the purpose of personal explanation but i desire to as an explanation of the gentleman from new-york"
len # number of words in each speech, after excluding the speaker names
## [1] 21 47 77 24
hist(len) # histogram of variable 'len'
boxplot(len) # boxplot of variable 'len'
quantile(len) # quantile of variable 'len'
## 0% 25% 50% 75% 100%
## 21.00 23.25 35.50 54.50 77.00
meta2 # values in variable 'meta2'
## [1] "stevens" "brooks" "clerk" "johnson"
Use the ‘tm’ package to create the corpus
#install.packages("tm")
library(tm)
## Loading required package: NLP
corpus <- VCorpus(VectorSource(data[,1]),readerControl = list(reader = readPlain)) # create the corpus from the character vector of speeches
corpus1 <- tm_map(corpus, stripWhitespace) # collapse runs of whitespace into single spaces
corpus2 <- tm_map(corpus1, content_transformer(tolower)) # transform to lower case
corpus3 <- tm_map(corpus2, removePunctuation) # remove punctuation
corpus4 <- tm_map(corpus3, removeNumbers) # remove numbers
corpus5 <- tm_map(corpus4, removeWords, stopwords("english")) # remove stopwords
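To verify the effect of the transformations, individual documents of the corpus can be inspected:
inspect(corpus5[[1]]) # print the first document after all transformations
content(corpus5[[1]]) # or extract its text as a plain character string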
Document-term matrix without stemming
corp.dtm <- DocumentTermMatrix(corpus5,control=list(stemming=FALSE)) # no stemming is the default
corp.dtm
## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
Term-document matrix without stemming
corp.tdm <- TermDocumentMatrix(corpus5,control=list(stemming=FALSE))
corp.tdm
## <<TermDocumentMatrix (terms: 43, documents: 4)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
Document-term matrix with stemming
corps.dtm <- DocumentTermMatrix(corpus5,control=list(stemming=TRUE))
corps.dtm
## <<DocumentTermMatrix (documents: 4, terms: 42)>>
## Non-/sparse entries: 60/108
## Sparsity : 64%
## Maximal term length: 12
## Weighting : term frequency (tf)
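Stemming in the ‘tm’ package is carried out by the Porter stemmer from the SnowballC package (which may need to be installed separately); a quick check on a few of our terms, with the expected stems shown as a comment:
#install.packages("SnowballC")
library(SnowballC)
wordStem(c("purpose","purposes","explanation","temporarily"),language="english")
# should give "purpos" "purpos" "explan" "temporarili"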
findFreqTerms(corp.dtm,1)
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "purposes" "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules" "second" "see"
## [36] "spoken" "stated" "statement" "taking" "temporarily"
## [41] "time" "whether" "yield"
The matrix ‘corps.dtm’ was built with stemming; note that ‘purpose’ and ‘purposes’ are combined into the single stem ‘purpos’.
findFreqTerms(corps.dtm,1)
## [1] "anyth" "can" "clerk" "desir" "except"
## [6] "explain" "explan" "first" "floor" "gentleman"
## [11] "henc" "hous" "know" "made" "member"
## [16] "newyork" "noth" "object" "patch" "pend"
## [21] "pennsylvania" "person" "portion" "preclud" "proposit"
## [26] "purpos" "relat" "relev" "remain" "remark"
## [31] "rise" "rule" "second" "see" "spoken"
## [36] "state" "statement" "take" "temporarili" "time"
## [41] "whether" "yield"
stopwords("english")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
stopwordsnew1=c(stopwords("english"),"occasionally") # append 'occasionally' to the English stopword list, forming a new stopword set
stopwordsnew1
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very" "occasionally"
stopwordsnew2=c("perhaps","never") # a custom stopword set consisting of just 'perhaps' and 'never'
stopwordsnew2
## [1] "perhaps" "never"
#install.packages("ggplot2")
library(ggplot2) # load 'ggplot2' package
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
dim(corp.dtm)
## [1] 4 43
as.matrix(corp.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 1 0
## 3 1 1 1 1 1 1 1
## 4 0 1 0 0 0 1 0
## Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
## 1 0 0 0 0 0 0 0 0 1 0
## 2 0 1 1 1 0 0 1 1 0 0
## 3 1 0 1 0 1 1 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs statement taking temporarily time whether yield
## 1 0 0 0 0 0 0
## 2 0 0 1 1 2 2
## 3 1 1 0 1 0 2
## 4 0 0 0 0 0 0
findFreqTerms(corp.dtm,1)
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "purposes" "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules" "second" "see"
## [36] "spoken" "stated" "statement" "taking" "temporarily"
## [41] "time" "whether" "yield"
findFreqTerms(corp.dtm,2)
## [1] "can" "desire" "explain" "explanation" "floor"
## [6] "gentleman" "member" "newyork" "nothing" "pennsylvania"
## [11] "personal" "portion" "purpose" "remaining" "time"
## [16] "whether" "yield"
freq=colSums(as.matrix(corp.dtm)) # column sums give the total frequency of each term
ord=order(freq) # order terms by frequency (increasing order is the default)
freq[head(ord)] # the six least frequent terms
## anything clerk except first hence house
## 1 1 1 1 1 1
freq[tail(ord)] # the six most frequent terms
## desire floor purpose yield explanation gentleman
## 3 3 3 4 5 5
freq=sort(colSums(as.matrix(corp.dtm)),decreasing=TRUE)
head(freq,20)
## explanation gentleman yield can desire floor
## 5 5 4 3 3 3
## purpose explain member newyork nothing pennsylvania
## 3 2 2 2 2 2
## personal portion remaining time whether anything
## 2 2 2 2 2 1
## clerk except
## 1 1
wf=data.frame(word=names(freq),freq=freq) # convert into a dataframe
head(wf)
p=ggplot(subset(wf,freq>2),aes(word,freq)) # plot only words with frequency greater than 2
p=p+geom_bar(stat="identity") # stat="identity" uses the frequencies as given rather than counting rows
p=p+theme(axis.text.x=element_text(angle=45,hjust=1)) # rotate x-axis labels 45 degrees and right-justify them
p
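By default the bars are ordered alphabetically; to order them by frequency instead, the factor can be reordered. A sketch, with 'p2' as an illustrative name:
p2=ggplot(subset(wf,freq>2),aes(reorder(word,-freq),freq)) # bars sorted by decreasing frequency
p2=p2+geom_bar(stat="identity")
p2=p2+theme(axis.text.x=element_text(angle=45,hjust=1))+xlab("word")
p2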
#install.packages("wordcloud")
library(wordcloud) # load 'wordcloud' package
## Loading required package: RColorBrewer
set.seed(142) # fix the random seed so the word cloud layout is reproducible
wordcloud(names(freq),freq,min.freq=1)
## Warning in wordcloud(names(freq), freq, min.freq = 1): explanation could not be
## fit on page. It will not be plotted.
set.seed(142)
dark2 <- brewer.pal(6,"Dark2")
wordcloud(names(freq),freq,max.words=7,rot.per=0.2,colors=dark2)
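The warning above means the most frequent word did not fit on the page at the default font sizes; shrinking the 'scale' argument (the largest and smallest font sizes) usually avoids this:
set.seed(142)
wordcloud(names(freq),freq,min.freq=1,scale=c(3,0.5)) # smaller maximum font size than the default c(4,0.5)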
as.matrix(corp.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 1 0
## 3 1 1 1 1 1 1 1
## 4 0 1 0 0 0 1 0
## Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
## 1 0 0 0 0 0 0 0 0 1 0
## 2 0 1 1 1 0 0 1 1 0 0
## 3 1 0 1 0 1 1 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs statement taking temporarily time whether yield
## 1 0 0 0 0 0 0
## 2 0 0 1 1 2 2
## 3 1 1 0 1 0 2
## 4 0 0 0 0 0 0
findAssocs(corp.dtm, "explanation", 0.5)
## $explanation
## newyork personal floor purpose except gentleman
## 0.90 0.90 0.87 0.87 0.52 0.52
## hence house made member objection pending
## 0.52 0.52 0.52 0.52 0.52 0.52
## precluded proposition purposes relation rise rules
## 0.52 0.52 0.52 0.52 0.52 0.52
## stated statement taking
## 0.52 0.52 0.52
findAssocs(corp.dtm, "gentleman", 0.5)
## $gentleman
## except hence house made member objection
## 1.00 1.00 1.00 1.00 1.00 1.00
## pending precluded proposition purposes relation rise
## 1.00 1.00 1.00 1.00 1.00 1.00
## rules stated statement taking newyork pennsylvania
## 1.00 1.00 1.00 1.00 0.58 0.58
## personal portion remaining time yield explanation
## 0.58 0.58 0.58 0.58 0.58 0.52
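findAssocs() reports the Pearson correlation between the columns of the document-term matrix; the value 0.90 reported above for 'explanation' and 'newyork' can be verified directly:
m=as.matrix(corp.dtm)
cor(m[,"explanation"],m[,"newyork"]) # about 0.90, matching findAssocs above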
Bcorp.dtm=weightBin(corp.dtm) # binary weighting: 1 if the term occurs in the document, 0 otherwise
as.matrix(Bcorp.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 1 0 0 0 1
## 2 0 1 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 1 0 1 1
## 4 0 0 0 1 0 0 1 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 1 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 1 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 1 0
## 3 1 1 1 1 1 1 1
## 4 0 1 0 0 0 1 0
## Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
## 1 0 0 0 0 0 0 0 0 1 0
## 2 0 1 1 1 0 0 1 1 0 0
## 3 1 0 1 0 1 1 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs statement taking temporarily time whether yield
## 1 0 0 0 0 0 0
## 2 0 0 1 1 1 1
## 3 1 1 0 1 0 1
## 4 0 0 0 0 0 0
findAssocs(Bcorp.dtm, "explanation", 0.5)
## $explanation
## floor purpose can desire newyork pennsylvania
## 1.00 1.00 0.58 0.58 0.58 0.58
## personal portion remaining time yield
## 0.58 0.58 0.58 0.58 0.58
findAssocs(Bcorp.dtm, "gentleman", 0.5)
## $gentleman
## numeric(0)
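The empty result has a simple explanation: under binary weighting 'gentleman' equals 1 in every document, so its column is constant and no correlation with it can be computed:
sd(as.matrix(Bcorp.dtm)[,"gentleman"]) # 0: a constant column has no defined correlation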
Plotting the frequencies of individual terms across the documents is helpful as it shows whether certain terms occur together.
vecg=as.matrix(corp.dtm)[,"gentleman"]
vecg
## 1 2 3 4
## 1 1 2 1
vece=as.matrix(corp.dtm)[,"explanation"]
vece
## 1 2 3 4
## 0 1 2 2
par(mfrow=c(1,1))
plot(vecg,type="l",lwd=7,xlab="document",ylab="frequency",ylim=c(0,max(c(vecg,vece))))
lines(vece,type="l",col=10,lwd=3)
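A legend makes the two lines easier to tell apart:
legend("topleft",legend=c("gentleman","explanation"),col=c(1,10),lwd=c(7,3)) # colors and widths match the plot calls above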
BigramTokenizer <- function(x) # custom tokenizer: return all adjacent word pairs (bigrams) of a document
unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
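The tokenizer relies on ngrams() from the NLP package (loaded with ‘tm’), which slides a window of length 2 over the word vector returned by words(); its effect on a short example, with the expected result shown as a comment:
unlist(lapply(ngrams(c("rules","of","the","house"),2),paste,collapse=" "))
# should give "rules of" "of the" "the house"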
bi.dtm <- DocumentTermMatrix(corpus5, control = list(tokenize = BigramTokenizer))
bi.dtm
## <<DocumentTermMatrix (documents: 4, terms: 59)>>
## Non-/sparse entries: 67/169
## Sparsity : 72%
## Maximal term length: 22
## Weighting : term frequency (tf)
as.matrix(bi.dtm)
## Terms
## Docs anything explain can yield clerk first desire explanation desire floor
## 1 1 0 0 0 0
## 2 0 2 1 0 0
## 3 0 1 0 0 0
## 4 0 0 0 1 1
## Terms
## Docs desire know except purposes explain nothing explain spoken
## 1 0 0 1 1
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 0 0
## Terms
## Docs explanation desire explanation gentleman explanation hence
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 1 1 0
## Terms
## Docs explanation relation explanation second first whether floor purpose
## 1 0 0 0 0
## 2 0 1 1 0
## 3 1 0 0 0
## 4 0 0 0 1
## Terms
## Docs floor temporarily gentleman anything gentleman newyork
## 1 0 1 0
## 2 1 0 0
## 3 0 0 1
## 4 0 0 1
## Terms
## Docs gentleman pennsylvania hence statement house gentleman know mr made can
## 1 0 0 0 0 0
## 2 1 0 0 1 0
## 3 1 1 1 0 1
## 4 0 0 0 0 0
## Terms
## Docs member except member objection mr clerk newyork yield nothing explain
## 1 0 0 0 0 1
## 2 0 0 1 0 0
## 3 1 1 0 1 0
## 4 0 0 0 0 0
## Terms
## Docs nothing patch objection made pending proposition pennsylvania purpose
## 1 1 0 0 0
## 2 0 0 0 1
## 3 0 1 1 0
## 4 0 0 0 0
## Terms
## Docs pennsylvania stated personal explanation portion time precluded taking
## 1 0 0 0 0
## 2 0 0 1 0
## 3 1 1 1 1
## 4 0 1 0 0
## Terms
## Docs proposition gentleman purpose explanation purpose personal
## 1 0 0 0
## 2 0 1 0
## 3 1 1 0
## 4 0 0 1
## Terms
## Docs purposes personal relation pending relevancy remark remaining portion
## 1 0 0 0 0
## 2 0 0 1 1
## 3 1 1 0 1
## 4 0 0 0 0
## Terms
## Docs remark desire rise purpose rules house second whether see relevancy
## 1 0 0 0 0 0
## 2 1 0 0 1 1
## 3 0 1 1 0 0
## 4 0 0 0 0 0
## Terms
## Docs spoken nothing stated rise statement precluded taking floor
## 1 1 0 0 0
## 2 0 0 0 0
## 3 0 1 1 1
## 4 0 0 0 0
## Terms
## Docs temporarily gentleman time member whether can yield floor yield member
## 1 0 0 0 0 0
## 2 1 0 2 1 0
## 3 0 1 0 0 1
## 4 0 0 0 0 0
## Terms
## Docs yield remaining
## 1 0
## 2 1
## 3 1
## 4 0
bi.tdm <- TermDocumentMatrix(corpus5, control = list(tokenize = BigramTokenizer))
bi.tdm
## <<TermDocumentMatrix (terms: 59, documents: 4)>>
## Non-/sparse entries: 67/169
## Sparsity : 72%
## Maximal term length: 22
## Weighting : term frequency (tf)
as.matrix(bi.tdm)
## Docs
## Terms 1 2 3 4
## anything explain 1 0 0 0
## can yield 0 2 1 0
## clerk first 0 1 0 0
## desire explanation 0 0 0 1
## desire floor 0 0 0 1
## desire know 0 1 0 0
## except purposes 0 0 1 0
## explain nothing 1 0 0 0
## explain spoken 1 0 0 0
## explanation desire 0 0 0 1
## explanation gentleman 0 0 0 1
## explanation hence 0 0 1 0
## explanation relation 0 0 1 0
## explanation second 0 1 0 0
## first whether 0 1 0 0
## floor purpose 0 0 0 1
## floor temporarily 0 1 0 0
## gentleman anything 1 0 0 0
## gentleman newyork 0 0 1 1
## gentleman pennsylvania 0 1 1 0
## hence statement 0 0 1 0
## house gentleman 0 0 1 0
## know mr 0 1 0 0
## made can 0 0 1 0
## member except 0 0 1 0
## member objection 0 0 1 0
## mr clerk 0 1 0 0
## newyork yield 0 0 1 0
## nothing explain 1 0 0 0
## nothing patch 1 0 0 0
## objection made 0 0 1 0
## pending proposition 0 0 1 0
## pennsylvania purpose 0 1 0 0
## pennsylvania stated 0 0 1 0
## personal explanation 0 0 1 1
## portion time 0 1 1 0
## precluded taking 0 0 1 0
## proposition gentleman 0 0 1 0
## purpose explanation 0 1 1 0
## purpose personal 0 0 0 1
## purposes personal 0 0 1 0
## relation pending 0 0 1 0
## relevancy remark 0 1 0 0
## remaining portion 0 1 1 0
## remark desire 0 1 0 0
## rise purpose 0 0 1 0
## rules house 0 0 1 0
## second whether 0 1 0 0
## see relevancy 0 1 0 0
## spoken nothing 1 0 0 0
## stated rise 0 0 1 0
## statement precluded 0 0 1 0
## taking floor 0 0 1 0
## temporarily gentleman 0 1 0 0
## time member 0 0 1 0
## whether can 0 2 0 0
## yield floor 0 1 0 0
## yield member 0 0 1 0
## yield remaining 0 1 1 0
findFreqTerms(bi.dtm,1)
## [1] "anything explain" "can yield" "clerk first"
## [4] "desire explanation" "desire floor" "desire know"
## [7] "except purposes" "explain nothing" "explain spoken"
## [10] "explanation desire" "explanation gentleman" "explanation hence"
## [13] "explanation relation" "explanation second" "first whether"
## [16] "floor purpose" "floor temporarily" "gentleman anything"
## [19] "gentleman newyork" "gentleman pennsylvania" "hence statement"
## [22] "house gentleman" "know mr" "made can"
## [25] "member except" "member objection" "mr clerk"
## [28] "newyork yield" "nothing explain" "nothing patch"
## [31] "objection made" "pending proposition" "pennsylvania purpose"
## [34] "pennsylvania stated" "personal explanation" "portion time"
## [37] "precluded taking" "proposition gentleman" "purpose explanation"
## [40] "purpose personal" "purposes personal" "relation pending"
## [43] "relevancy remark" "remaining portion" "remark desire"
## [46] "rise purpose" "rules house" "second whether"
## [49] "see relevancy" "spoken nothing" "stated rise"
## [52] "statement precluded" "taking floor" "temporarily gentleman"
## [55] "time member" "whether can" "yield floor"
## [58] "yield member" "yield remaining"
findFreqTerms(bi.dtm,2)
## [1] "can yield" "gentleman newyork" "gentleman pennsylvania"
## [4] "personal explanation" "portion time" "purpose explanation"
## [7] "remaining portion" "whether can" "yield remaining"
freq=colSums(as.matrix(bi.dtm))
ord=order(freq)
freq[head(ord)]
## anything explain clerk first desire explanation desire floor
## 1 1 1 1
## desire know except purposes
## 1 1
freq[tail(ord)]
## portion time purpose explanation remaining portion whether can
## 2 2 2 2
## yield remaining can yield
## 2 3
freq=sort(colSums(as.matrix(bi.dtm)),decreasing=TRUE)
head(freq,20)
## can yield gentleman newyork gentleman pennsylvania
## 3 2 2
## personal explanation portion time purpose explanation
## 2 2 2
## remaining portion whether can yield remaining
## 2 2 2
## anything explain clerk first desire explanation
## 1 1 1
## desire floor desire know except purposes
## 1 1 1
## explain nothing explain spoken explanation desire
## 1 1 1
## explanation gentleman explanation hence
## 1 1
wf=data.frame(word=names(freq),freq=freq)
head(wf)
p=ggplot(subset(wf,freq>1),aes(word,freq))
p=p+geom_bar(stat="identity")
p=p+theme(axis.text.x=element_text(angle=45,hjust=1))
p