# Start from a clean slate: ls() lists the visible (non-dot-prefixed) objects
# and rm() deletes them all.
# NOTE(review): this also wipes anything the caller had defined before running
# the script; confirm that is intended.
rm(list = ls()) # clean the global environment
# Code from Chapter 1 of the text to set up the document-term matrix (DTM).
# Read the speeches (first column, no header row) and normalise the raw text.
# NOTE(review): the path is an absolute, machine-specific Windows path; adjust
# it before running the script elsewhere.
data <- read.csv(
  'C:\\Johannes Ledolter\\2020March01Book\\Chapter1WEB\\test.csv',
  header = FALSE,
  stringsAsFactors = FALSE
)

# tolower() and gsub() are vectorised, so the whole column is cleaned in one
# pass instead of row by row.
txt <- tolower(data[, 1])      # lower-case every speech
txt <- gsub("[.,;]", "", txt)  # delete '.', ',' and ';' (nothing is inserted)
# Hyphenate "new york" so that the later split on spaces keeps it as one word.
txt <- gsub("new york", "new-york", txt, fixed = TRUE)
data[, 1] <- txt               # write the cleaned text back into 'data'
# Separate the two leading speaker words from each speech.
#   len   - number of words left in each speech once the speaker is removed
#   meta1 - first word of each speech  (e.g. "mr" / "the")
#   meta2 - second word of each speech (e.g. "stevens" / "clerk")
# data[, 1] is overwritten with the speech text without the speaker name.
n_docs <- nrow(data)
len <- numeric(n_docs)
meta1 <- character(n_docs)
meta2 <- character(n_docs)
for (i in seq_len(n_docs)) {
  txt <- data[i, 1]
  temp <- strsplit(txt, " ")[[1]] # split the speech into words on single spaces
  meta1[i] <- temp[1]
  meta2[i] <- temp[2]
  # Negative indexing drops the two speaker words and yields an empty vector
  # (not NA or an error) when the speech has two words or fewer.
  tempr <- temp[-(1:2)]
  len[i] <- length(tempr)
  # Commas were already removed from the text, so joining on a single space
  # gives the same string as toString() followed by stripping the commas.
  data[i, 1] <- paste(tempr, collapse = " ")
}
#install.packages("tm")
library(tm)
## Loading required package: NLP
# Wrap every cleaned speech in a tm document: one corpus entry per row of 'data'.
corpus <- VCorpus(
  VectorSource(data[, 1]),
  readerControl = list(reader = readPlain)
)

# Cleaning steps; each intermediate corpus is kept under its own name.
corpus1 <- tm_map(corpus, stripWhitespace)                    # strip extra whitespace
corpus2 <- tm_map(corpus1, content_transformer(tolower))      # lower-case
corpus3 <- tm_map(corpus2, removePunctuation)                 # drop punctuation
corpus4 <- tm_map(corpus3, removeNumbers)                     # drop numbers
corpus5 <- tm_map(corpus4, removeWords, stopwords("english")) # drop English stop words

# Matrices built from the fully cleaned corpus.
corp.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE)) # no stemming (the default)
corp.tdm <- TermDocumentMatrix(corpus5, control = list(stemming = FALSE)) # term-document layout
corps.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = TRUE)) # stemmed variant
corps.dtm
## <<DocumentTermMatrix (documents: 4, terms: 42)>>
## Non-/sparse entries: 60/108
## Sparsity : 64%
## Maximal term length: 12
## Weighting : term frequency (tf)
# Keep the unstemmed matrix under the name used as the starting point below.
corpOLD.dtm <- corp.dtm
corpOLD.dtm
## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
dim(corpOLD.dtm)
## [1] 4 43
labels(corpOLD.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "purposes" "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules" "second" "see"
## [36] "spoken" "stated" "statement" "taking" "temporarily"
## [41] "time" "whether" "yield"
# Logical mask over the term columns: TRUE only for the term to be removed.
ind <- dimnames(corpOLD.dtm)$Terms == "purposes"
ind
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Keep every column whose term is not flagged.
corpNEW.dtm <- corpOLD.dtm[, !ind]
dim(corpNEW.dtm)
## [1] 4 42
labels(corpNEW.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "relation" "relevancy" "remaining" "remark"
## [31] "rise" "rules" "second" "see" "spoken"
## [36] "stated" "statement" "taking" "temporarily" "time"
## [41] "whether" "yield"
as.matrix(corpNEW.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose relation
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 1 0
## 3 1 1 1 1 1 1 1
## 4 0 1 0 0 0 1 0
## Terms
## Docs relevancy remaining remark rise rules second see spoken stated statement
## 1 0 0 0 0 0 0 0 1 0 0
## 2 1 1 1 0 0 1 1 0 0 0
## 3 0 1 0 1 1 0 0 0 1 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs taking temporarily time whether yield
## 1 0 0 0 0 0
## 2 0 1 1 2 2
## 3 1 0 1 0 2
## 4 0 0 0 0 0
corpOLD.dtm
## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
dim(corpOLD.dtm)
## [1] 4 43
labels(corpOLD.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "purposes" "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules" "second" "see"
## [36] "spoken" "stated" "statement" "taking" "temporarily"
## [41] "time" "whether" "yield"
# Terms to drop from the document-term matrix.
delete <- c("floor", "proposition", "purposes")
length(delete)
## [1] 3
# Flag all unwanted terms in a single pass; %in% works for any number of
# terms, including none.
ind <- dimnames(corpOLD.dtm)$Terms %in% delete
corpTEMP.dtm <- corpOLD.dtm[, !ind]
corpNEW.dtm <- corpTEMP.dtm
dim(corpNEW.dtm)
## [1] 4 40
labels(corpNEW.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "gentleman" "hence"
## [11] "house" "know" "made" "member" "newyork"
## [16] "nothing" "objection" "patch" "pending" "pennsylvania"
## [21] "personal" "portion" "precluded" "purpose" "relation"
## [26] "relevancy" "remaining" "remark" "rise" "rules"
## [31] "second" "see" "spoken" "stated" "statement"
## [36] "taking" "temporarily" "time" "whether" "yield"
as.matrix(corpNEW.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first gentleman hence
## 1 1 0 0 0 0 2 0 0 1 0
## 2 0 2 1 1 0 0 1 1 1 0
## 3 0 1 0 0 1 0 2 0 2 1
## 4 0 0 0 2 0 0 2 0 1 0
## Terms
## Docs house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 2 0 1 0
## 2 0 1 0 0 0 0 0 0 0
## 3 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded purpose relation relevancy
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 1 0 1
## 3 1 1 1 1 1 1 0
## 4 0 1 0 0 1 0 0
## Terms
## Docs remaining remark rise rules second see spoken stated statement taking
## 1 0 0 0 0 0 0 1 0 0 0
## 2 1 1 0 0 1 1 0 0 0 0
## 3 1 0 1 1 0 0 0 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs temporarily time whether yield
## 1 0 0 0 0
## 2 1 1 2 2
## 3 0 1 0 2
## 4 0 0 0 0
# Combining terms. DTM matrix: corpOLD.dtm; terms to be combined: comb; new DTM matrix: corpNEW.dtm.
library(slam)
corpOLD.dtm
## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
dim(corpOLD.dtm)
## [1] 4 43
comb <- c("purpose", "purposes") # terms whose columns are merged
groupcomb <- toString(comb)      # shared label: the terms joined by ", "
mm <- length(comb)
corpNEW.dtm <- corpOLD.dtm
h <- labels(corpNEW.dtm)$Terms
ind1 <- seq_along(h)
# Relabel each term listed in 'comb' with the shared group label.
for (j in seq_len(mm)) {
  ind <- h == comb[j]
  index <- which(ind)
  h[index] <- groupcomb
}
dimnames(corpNEW.dtm)$Terms <- h
labels(corpNEW.dtm)$Terms
## [1] "anything" "can" "clerk"
## [4] "desire" "except" "explain"
## [7] "explanation" "first" "floor"
## [10] "gentleman" "hence" "house"
## [13] "know" "made" "member"
## [16] "newyork" "nothing" "objection"
## [19] "patch" "pending" "pennsylvania"
## [22] "personal" "portion" "precluded"
## [25] "proposition" "purpose, purposes" "purpose, purposes"
## [28] "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules"
## [34] "second" "see" "spoken"
## [37] "stated" "statement" "taking"
## [40] "temporarily" "time" "whether"
## [43] "yield"
# Sum the columns that now share a label (margin 2 = terms).
corpNEW.dtm <- rollup(corpNEW.dtm, 2L, labels(corpNEW.dtm)$Terms, FUN = sum)
dim(corpNEW.dtm)
## [1] 4 42
labels(corpNEW.dtm)$Terms
## [1] "anything" "can" "clerk"
## [4] "desire" "except" "explain"
## [7] "explanation" "first" "floor"
## [10] "gentleman" "hence" "house"
## [13] "know" "made" "member"
## [16] "newyork" "nothing" "objection"
## [19] "patch" "pending" "pennsylvania"
## [22] "personal" "portion" "precluded"
## [25] "proposition" "purpose, purposes" "relation"
## [28] "relevancy" "remaining" "remark"
## [31] "rise" "rules" "second"
## [34] "see" "spoken" "stated"
## [37] "statement" "taking" "temporarily"
## [40] "time" "whether" "yield"
as.matrix(corpNEW.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose, purposes
## 1 0 0 0 0 0 0
## 2 1 0 1 0 0 1
## 3 1 1 1 1 1 2
## 4 0 1 0 0 0 1
## Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
## 1 0 0 0 0 0 0 0 0 1 0
## 2 0 1 1 1 0 0 1 1 0 0
## 3 1 0 1 0 1 1 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs statement taking temporarily time whether yield
## 1 0 0 0 0 0 0
## 2 0 0 1 1 2 2
## 3 1 1 0 1 0 2
## 4 0 0 0 0 0 0
# Combining terms. DTM matrix: corpOLD.dtm; terms to be combined: comb; new DTM matrix: corpNEW.dtm.
# Merge several terms of a document-term matrix into one column.
#
# Every term listed in 'comb' is relabelled with one shared label; the columns
# that then carry the same label are summed with rollup().
#
# Arguments:
#   corpOLD.dtm  document-term matrix whose dimnames contain a 'Terms' entry
#   comb         character vector with the terms to merge
#   label        label of the merged column; defaults to the terms joined by
#                ", " (only the first element is used)
# Returns:
#   the result of rollup() over the term margin: one column per distinct
#   label, with the merged terms' counts added up. Terms in 'comb' that do not
#   occur in the matrix are ignored.
combine_terms <- function(corpOLD.dtm, comb, label = toString(comb)) {
  h <- dimnames(corpOLD.dtm)$Terms
  h[h %in% comb] <- label[1] # vectorised relabel of all matching terms

  corpNEW.dtm <- corpOLD.dtm
  dimnames(corpNEW.dtm)$Terms <- h

  # Margin 2L = terms: columns sharing a label are collapsed by summation.
  rollup(corpNEW.dtm, 2L, h, FUN = sum)
}
# library(slam)
corpOLD.dtm
## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
dim(corpOLD.dtm)
## [1] 4 43
labels(corpOLD.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "purposes" "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules" "second" "see"
## [36] "spoken" "stated" "statement" "taking" "temporarily"
## [41] "time" "whether" "yield"
as.matrix(corpOLD.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 1 0
## 3 1 1 1 1 1 1 1
## 4 0 1 0 0 0 1 0
## Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
## 1 0 0 0 0 0 0 0 0 1 0
## 2 0 1 1 1 0 0 1 1 0 0
## 3 1 0 1 0 1 1 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs statement taking temporarily time whether yield
## 1 0 0 0 0 0 0
## 2 0 0 1 1 2 2
## 3 1 1 0 1 0 2
## 4 0 0 0 0 0 0
# Merge the singular and plural form into one column via the helper.
comb <- c("purpose", "purposes")
corpNEW.dtm <- combine_terms(corpOLD.dtm, comb = comb)
dim(corpNEW.dtm)
## [1] 4 42
labels(corpNEW.dtm)$Terms
## [1] "anything" "can" "clerk"
## [4] "desire" "except" "explain"
## [7] "explanation" "first" "floor"
## [10] "gentleman" "hence" "house"
## [13] "know" "made" "member"
## [16] "newyork" "nothing" "objection"
## [19] "patch" "pending" "pennsylvania"
## [22] "personal" "portion" "precluded"
## [25] "proposition" "purpose, purposes" "relation"
## [28] "relevancy" "remaining" "remark"
## [31] "rise" "rules" "second"
## [34] "see" "spoken" "stated"
## [37] "statement" "taking" "temporarily"
## [40] "time" "whether" "yield"
as.matrix(corpNEW.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition purpose, purposes
## 1 0 0 0 0 0 0
## 2 1 0 1 0 0 1
## 3 1 1 1 1 1 2
## 4 0 1 0 0 0 1
## Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
## 1 0 0 0 0 0 0 0 0 1 0
## 2 0 1 1 1 0 0 1 1 0 0
## 3 1 0 1 0 1 1 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs statement taking temporarily time whether yield
## 1 0 0 0 0 0 0
## 2 0 0 1 1 2 2
## 3 1 1 0 1 0 2
## 4 0 0 0 0 0 0
# Combining terms. DTM matrix: corpOLD.dtm; terms to be combined: comb; new DTM matrix: corpNEW.dtm.
corpOLD.dtm
## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity : 65%
## Maximal term length: 12
## Weighting : term frequency (tf)
dim(corpOLD.dtm)
## [1] 4 43
labels(corpOLD.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "purpose" "purposes" "relation" "relevancy" "remaining"
## [31] "remark" "rise" "rules" "second" "see"
## [36] "spoken" "stated" "statement" "taking" "temporarily"
## [41] "time" "whether" "yield"
comb <- c("purpose", "purposes") # terms to merge
groupcomb <- toString(comb)
groupcomb # label for the merged column
## [1] "purpose, purposes"
mm <- length(comb)
h <- dimnames(corpOLD.dtm)$Terms
# TRUE for every column whose term is listed in 'comb'; %in% works for any
# number of terms, including a single one.
ind <- h %in% comb
corpR.dtm <- corpOLD.dtm[, !ind] # all columns except the terms being merged
dim(corpR.dtm)
## [1] 4 41
labels(corpR.dtm)$Terms
## [1] "anything" "can" "clerk" "desire" "except"
## [6] "explain" "explanation" "first" "floor" "gentleman"
## [11] "hence" "house" "know" "made" "member"
## [16] "newyork" "nothing" "objection" "patch" "pending"
## [21] "pennsylvania" "personal" "portion" "precluded" "proposition"
## [26] "relation" "relevancy" "remaining" "remark" "rise"
## [31] "rules" "second" "see" "spoken" "stated"
## [36] "statement" "taking" "temporarily" "time" "whether"
## [41] "yield"
as.matrix(corpR.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition relation relevancy
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 0 1
## 3 1 1 1 1 1 1 0
## 4 0 1 0 0 0 0 0
## Terms
## Docs remaining remark rise rules second see spoken stated statement taking
## 1 0 0 0 0 0 0 1 0 0 0
## 2 1 1 0 0 1 1 0 0 0 0
## 3 1 0 1 1 0 0 0 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs temporarily time whether yield
## 1 0 0 0 0
## 2 1 1 2 2
## 3 0 1 0 2
## 4 0 0 0 0
# Build the merged column: take the columns flagged by 'ind', collapse them
# into one column of per-document totals, and append it to 'corpR.dtm'.
corpN.dtm = corpOLD.dtm[ , ind]
dim(corpN.dtm)
## [1] 4 2
# Collapse only when at least one flagged column was found.
if(dim(corpN.dtm)[2] > 0) {
# Overwrite the first column with each document's total across the selected
# columns.
# NOTE(review): as.matrix() builds a dense copy first; presumably row_sums()
# could be applied to the sparse matrix directly -- confirm before changing.
corpN.dtm[ , 1] = row_sums(as.matrix(corpN.dtm))
# Keep just that total column and give it the combined label.
corpN.dtm = corpN.dtm[ , 1]
dimnames(corpN.dtm)$Terms = groupcomb
}
# Append the merged column after the untouched ones (so it ends up last) and
# convert the result back to a DocumentTermMatrix with term-frequency weighting.
corpNEW.dtm = as.DocumentTermMatrix(cbind(corpR.dtm, corpN.dtm), weighting = weightTf)
dim(corpNEW.dtm)
## [1] 4 42
labels(corpNEW.dtm)$Terms
## [1] "anything" "can" "clerk"
## [4] "desire" "except" "explain"
## [7] "explanation" "first" "floor"
## [10] "gentleman" "hence" "house"
## [13] "know" "made" "member"
## [16] "newyork" "nothing" "objection"
## [19] "patch" "pending" "pennsylvania"
## [22] "personal" "portion" "precluded"
## [25] "proposition" "relation" "relevancy"
## [28] "remaining" "remark" "rise"
## [31] "rules" "second" "see"
## [34] "spoken" "stated" "statement"
## [37] "taking" "temporarily" "time"
## [40] "whether" "yield" "purpose, purposes"
as.matrix(corpNEW.dtm)
## Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
## 1 1 0 0 0 0 2 0 0 0 1
## 2 0 2 1 1 0 0 1 1 1 1
## 3 0 1 0 0 1 0 2 0 1 2
## 4 0 0 0 2 0 0 2 0 1 1
## Terms
## Docs hence house know made member newyork nothing objection patch pending
## 1 0 0 0 0 0 0 2 0 1 0
## 2 0 0 1 0 0 0 0 0 0 0
## 3 1 1 0 1 2 1 0 1 0 1
## 4 0 0 0 0 0 1 0 0 0 0
## Terms
## Docs pennsylvania personal portion precluded proposition relation relevancy
## 1 0 0 0 0 0 0 0
## 2 1 0 1 0 0 0 1
## 3 1 1 1 1 1 1 0
## 4 0 1 0 0 0 0 0
## Terms
## Docs remaining remark rise rules second see spoken stated statement taking
## 1 0 0 0 0 0 0 1 0 0 0
## 2 1 1 0 0 1 1 0 0 0 0
## 3 1 0 1 1 0 0 0 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0
## Terms
## Docs temporarily time whether yield purpose, purposes
## 1 0 0 0 0 0
## 2 1 1 2 2 1
## 3 0 1 0 2 2
## 4 0 0 0 0 1