Chapter 1 extra code for document-term matrix

code from chapter 1 text to set up the dtm

# Read the raw documents; the file has no header row and the text sits in the
# first column. Strings are kept as character vectors (not factors).
# NOTE(review): the path is an absolute, machine-specific Windows path --
# adjust it before running this script on another computer.
data <- read.csv('C:\\Johannes Ledolter\\2020March01Book\\Chapter1WEB\\test.csv', header = FALSE, stringsAsFactors = FALSE)

# Normalise the text in column 1 of 'data':
#   * convert everything to lower case,
#   * delete the punctuation marks '.', ',' and ';',
#   * hyphenate "new york" so that it stays a single token when the text is
#     split on spaces further below.
# The work is done on the whole column at once, so a data frame with zero rows
# is simply left unchanged (a 1:nrow loop would iterate over c(1, 0) instead).
speech <- tolower(data[, 1])
speech <- gsub("[.,;]", "", speech)
speech <- gsub("new york", "new-york", speech, ignore.case = TRUE)
data[, 1] <- speech

# Separate the two-word speaker label at the start of each document from the
# speech that follows it.
#   len   - number of words remaining in each document once the label is removed
#   meta1 - first word of each document  (mr/the)
#   meta2 - second word of each document (stevens/brooks/clerk/johnson)
# Column 1 of 'data' is overwritten with the remaining words, joined by spaces.
n_docs <- nrow(data)
len <- numeric(n_docs)       # preallocated instead of grown inside the loop
meta1 <- character(n_docs)
meta2 <- character(n_docs)

for (i in seq_len(n_docs)) {
  words <- strsplit(data[i, 1], " ")[[1]]   # split the document on single spaces
  meta1[i] <- words[1]
  meta2[i] <- words[2]
  # Negative indexing returns character(0) for documents of two words or fewer;
  # the previous 1:len[i] loop counted backwards in that case and turned the
  # speech into the literal text "NA".
  body_words <- words[-(1:2)]
  len[i] <- length(body_words)
  # Commas are stripped so the result matches the former
  # toString() + gsub("[,]", "") round trip exactly.
  data[i, 1] <- paste(gsub("[,]", "", body_words), collapse = " ")
}


#install.packages("tm")
library(tm)

## Loading required package: NLP

# Wrap each cleaned document (column 1 of 'data') in a tm corpus.
corpus <- VCorpus(VectorSource(data[, 1]), readerControl = list(reader = readPlain))

# Clean-up chain; every stage is kept under its own name.
corpus1 <- tm_map(corpus, stripWhitespace)                     # strip extra whitespace
corpus2 <- tm_map(corpus1, content_transformer(tolower))       # convert to lower case
corpus3 <- tm_map(corpus2, removePunctuation)                  # remove punctuation
corpus4 <- tm_map(corpus3, removeNumbers)                      # remove numbers
corpus5 <- tm_map(corpus4, removeWords, stopwords("english"))  # remove English stop words

# Unstemmed document-term matrix (no stemming is the default anyway) ...
corp.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = FALSE))

# ... the matching term-document matrix ...
corp.tdm <- TermDocumentMatrix(corpus5, control = list(stemming = FALSE))

# ... and a stemmed document-term matrix for comparison.
corps.dtm <- DocumentTermMatrix(corpus5, control = list(stemming = TRUE))

corps.dtm

## <<DocumentTermMatrix (documents: 4, terms: 42)>>
## Non-/sparse entries: 60/108
## Sparsity           : 64%
## Maximal term length: 12
## Weighting          : term frequency (tf)

Different ways of omitting and combining words in a document-term matrix

OMITTING A SINGLE WORD FROM A DOCUMENT-TERM MATRIX

# Working copy of the unstemmed document-term matrix; the examples below all
# start from this object. The bare name prints its summary.
corpOLD.dtm <- corp.dtm
corpOLD.dtm

## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

dim(corpOLD.dtm)

## [1]  4 43

labels(corpOLD.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "purposes"     "relation"     "relevancy"    "remaining"   
## [31] "remark"       "rise"         "rules"        "second"       "see"         
## [36] "spoken"       "stated"       "statement"    "taking"       "temporarily" 
## [41] "time"         "whether"      "yield"

# Logical mask over the term labels: TRUE marks the column to be dropped.
ind <- labels(corpOLD.dtm)$Terms == "purposes"
ind

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Keep every column whose mask entry is FALSE.
corpNEW.dtm <- corpOLD.dtm[, !ind]
dim(corpNEW.dtm)

## [1]  4 42

labels(corpNEW.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "relation"     "relevancy"    "remaining"    "remark"      
## [31] "rise"         "rules"        "second"       "see"          "spoken"      
## [36] "stated"       "statement"    "taking"       "temporarily"  "time"        
## [41] "whether"      "yield"

as.matrix(corpNEW.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose relation
##    1            0        0       0         0           0       0        0
##    2            1        0       1         0           0       1        0
##    3            1        1       1         1           1       1        1
##    4            0        1       0         0           0       1        0
##     Terms
## Docs relevancy remaining remark rise rules second see spoken stated statement
##    1         0         0      0    0     0      0   0      1      0         0
##    2         1         1      1    0     0      1   1      0      0         0
##    3         0         1      0    1     1      0   0      0      1         1
##    4         0         0      0    0     0      0   0      0      0         0
##     Terms
## Docs taking temporarily time whether yield
##    1      0           0    0       0     0
##    2      0           1    1       2     2
##    3      1           0    1       0     2
##    4      0           0    0       0     0

OMITTING SEVERAL WORDS FROM A DOCUMENT-TERM MATRIX

corpOLD.dtm

## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

dim(corpOLD.dtm)

## [1]  4 43

labels(corpOLD.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "purposes"     "relation"     "relevancy"    "remaining"   
## [31] "remark"       "rise"         "rules"        "second"       "see"         
## [36] "spoken"       "stated"       "statement"    "taking"       "temporarily" 
## [41] "time"         "whether"      "yield"

delete <- c("floor", "proposition", "purposes")   # terms to delete
length(delete)

## [1] 3

# A single vectorised membership test replaces the per-term loop. Unlike
# 1:length(delete), it also behaves correctly when 'delete' is empty
# (nothing is removed).
ind <- labels(corpOLD.dtm)$Terms %in% delete
corpTEMP.dtm <- corpOLD.dtm[, !ind]
corpNEW.dtm <- corpTEMP.dtm
dim(corpNEW.dtm)

## [1]  4 40

labels(corpNEW.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "gentleman"    "hence"       
## [11] "house"        "know"         "made"         "member"       "newyork"     
## [16] "nothing"      "objection"    "patch"        "pending"      "pennsylvania"
## [21] "personal"     "portion"      "precluded"    "purpose"      "relation"    
## [26] "relevancy"    "remaining"    "remark"       "rise"         "rules"       
## [31] "second"       "see"          "spoken"       "stated"       "statement"   
## [36] "taking"       "temporarily"  "time"         "whether"      "yield"

as.matrix(corpNEW.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first gentleman hence
##    1        1   0     0      0      0       2           0     0         1     0
##    2        0   2     1      1      0       0           1     1         1     0
##    3        0   1     0      0      1       0           2     0         2     1
##    4        0   0     0      2      0       0           2     0         1     0
##     Terms
## Docs house know made member newyork nothing objection patch pending
##    1     0    0    0      0       0       2         0     1       0
##    2     0    1    0      0       0       0         0     0       0
##    3     1    0    1      2       1       0         1     0       1
##    4     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded purpose relation relevancy
##    1            0        0       0         0       0        0         0
##    2            1        0       1         0       1        0         1
##    3            1        1       1         1       1        1         0
##    4            0        1       0         0       1        0         0
##     Terms
## Docs remaining remark rise rules second see spoken stated statement taking
##    1         0      0    0     0      0   0      1      0         0      0
##    2         1      1    0     0      1   1      0      0         0      0
##    3         1      0    1     1      0   0      0      1         1      1
##    4         0      0    0     0      0   0      0      0         0      0
##     Terms
## Docs temporarily time whether yield
##    1           0    0       0     0
##    2           1    1       2     2
##    3           0    1       0     2
##    4           0    0       0     0

COMBINING FREQUENCIES ACROSS SEVERAL TERMS AND DELETING FREQUENCIES OF INDIVIDUAL TERMS

Combining terms in a document-term matrix. Original dtm: corpOLD.dtm; terms to be combined: comb; new dtm: corpNEW.dtm.

library(slam)

corpOLD.dtm

## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

dim(corpOLD.dtm)

## [1]  4 43

comb <- c("purpose", "purposes")   # words to combine
groupcomb <- toString(comb)        # label for the combined terms
mm <- length(comb)

corpNEW.dtm <- corpOLD.dtm
h <- labels(corpNEW.dtm)$Terms
# Give every term listed in 'comb' the shared group label in one vectorised
# step; this replaces the index loop over 1:mm, which is unsafe when 'comb'
# is empty.
h[h %in% comb] <- groupcomb

# Write the modified labels back; the columns to be merged now carry
# identical names.
dimnames(corpNEW.dtm)$Terms <- h
labels(corpNEW.dtm)$Terms

##  [1] "anything"          "can"               "clerk"            
##  [4] "desire"            "except"            "explain"          
##  [7] "explanation"       "first"             "floor"            
## [10] "gentleman"         "hence"             "house"            
## [13] "know"              "made"              "member"           
## [16] "newyork"           "nothing"           "objection"        
## [19] "patch"             "pending"           "pennsylvania"     
## [22] "personal"          "portion"           "precluded"        
## [25] "proposition"       "purpose, purposes" "purpose, purposes"
## [28] "relation"          "relevancy"         "remaining"        
## [31] "remark"            "rise"              "rules"            
## [34] "second"            "see"               "spoken"           
## [37] "stated"            "statement"         "taking"           
## [40] "temporarily"       "time"              "whether"          
## [43] "yield"

# Sum the columns (margin 2) that share a term label, leaving one column per
# distinct label.
corpNEW.dtm <- rollup(
  corpNEW.dtm,
  MARGIN = 2L,
  INDEX = labels(corpNEW.dtm)$Terms,
  FUN = sum
)

dim(corpNEW.dtm)

## [1]  4 42

labels(corpNEW.dtm)$Terms

##  [1] "anything"          "can"               "clerk"            
##  [4] "desire"            "except"            "explain"          
##  [7] "explanation"       "first"             "floor"            
## [10] "gentleman"         "hence"             "house"            
## [13] "know"              "made"              "member"           
## [16] "newyork"           "nothing"           "objection"        
## [19] "patch"             "pending"           "pennsylvania"     
## [22] "personal"          "portion"           "precluded"        
## [25] "proposition"       "purpose, purposes" "relation"         
## [28] "relevancy"         "remaining"         "remark"           
## [31] "rise"              "rules"             "second"           
## [34] "see"               "spoken"            "stated"           
## [37] "statement"         "taking"            "temporarily"      
## [40] "time"              "whether"           "yield"

as.matrix(corpNEW.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose, purposes
##    1            0        0       0         0           0                 0
##    2            1        0       1         0           0                 1
##    3            1        1       1         1           1                 2
##    4            0        1       0         0           0                 1
##     Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
##    1        0         0         0      0    0     0      0   0      1      0
##    2        0         1         1      1    0     0      1   1      0      0
##    3        1         0         1      0    1     1      0   0      0      1
##    4        0         0         0      0    0     0      0   0      0      0
##     Terms
## Docs statement taking temporarily time whether yield
##    1         0      0           0    0       0     0
##    2         0      0           1    1       2     2
##    3         1      1           0    1       0     2
##    4         0      0           0    0       0     0

FUNCTION combine_terms TO COMBINE TERMS (COLUMNS) IN A DT MATRIX AND DELETE THE FREQUENCIES OF INDIVIDUAL TERMS

Combining terms in a document-term matrix. Original dtm: corpOLD.dtm; terms to be combined: comb; new dtm: corpNEW.dtm.

# Merge several terms (columns) of a document-term matrix into one column.
#
# Every column whose term label appears in 'comb' is relabelled with a common
# group label (the entries of 'comb' joined by ", "). Columns that share a
# label are then summed with rollup(), so the individual terms disappear and
# a single combined column takes their place.
#
# Arguments:
#   corpOLD.dtm  document-term matrix; its term labels are read through
#                labels(corpOLD.dtm)$Terms
#   comb         character vector with the terms to combine
#
# Value:
#   The object returned by rollup(): one column per distinct term label, with
#   the frequencies of the combined terms added together. Entries of 'comb'
#   that are not terms of the matrix have no effect.
combine_terms <- function(corpOLD.dtm, comb) {
  groupcomb <- toString(comb)   # label for the combined column

  corpNEW.dtm <- corpOLD.dtm
  h <- labels(corpNEW.dtm)$Terms
  # Relabel all matching terms in one vectorised step; this replaces the index
  # loop over 1:length(comb), which is unsafe for an empty 'comb'.
  h[h %in% comb] <- groupcomb
  dimnames(corpNEW.dtm)$Terms <- h

  # Sum over the columns (margin 2) that now carry the same label.
  rollup(corpNEW.dtm, 2L, labels(corpNEW.dtm)$Terms, FUN = sum)
}

# library(slam)
corpOLD.dtm

## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

dim(corpOLD.dtm)

## [1]  4 43

labels(corpOLD.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "purposes"     "relation"     "relevancy"    "remaining"   
## [31] "remark"       "rise"         "rules"        "second"       "see"         
## [36] "spoken"       "stated"       "statement"    "taking"       "temporarily" 
## [41] "time"         "whether"      "yield"

as.matrix(corpOLD.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose purposes
##    1            0        0       0         0           0       0        0
##    2            1        0       1         0           0       1        0
##    3            1        1       1         1           1       1        1
##    4            0        1       0         0           0       1        0
##     Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
##    1        0         0         0      0    0     0      0   0      1      0
##    2        0         1         1      1    0     0      1   1      0      0
##    3        1         0         1      0    1     1      0   0      0      1
##    4        0         0         0      0    0     0      0   0      0      0
##     Terms
## Docs statement taking temporarily time whether yield
##    1         0      0           0    0       0     0
##    2         0      0           1    1       2     2
##    3         1      1           0    1       0     2
##    4         0      0           0    0       0     0

# Terms whose columns are to be merged into one.
comb <- c("purpose", "purposes")
# Merge them with the helper defined above and check the new dimensions.
corpNEW.dtm <- combine_terms(corpOLD.dtm, comb)
dim(corpNEW.dtm)

## [1]  4 42

labels(corpNEW.dtm)$Terms

##  [1] "anything"          "can"               "clerk"            
##  [4] "desire"            "except"            "explain"          
##  [7] "explanation"       "first"             "floor"            
## [10] "gentleman"         "hence"             "house"            
## [13] "know"              "made"              "member"           
## [16] "newyork"           "nothing"           "objection"        
## [19] "patch"             "pending"           "pennsylvania"     
## [22] "personal"          "portion"           "precluded"        
## [25] "proposition"       "purpose, purposes" "relation"         
## [28] "relevancy"         "remaining"         "remark"           
## [31] "rise"              "rules"             "second"           
## [34] "see"               "spoken"            "stated"           
## [37] "statement"         "taking"            "temporarily"      
## [40] "time"              "whether"           "yield"

as.matrix(corpNEW.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition purpose, purposes
##    1            0        0       0         0           0                 0
##    2            1        0       1         0           0                 1
##    3            1        1       1         1           1                 2
##    4            0        1       0         0           0                 1
##     Terms
## Docs relation relevancy remaining remark rise rules second see spoken stated
##    1        0         0         0      0    0     0      0   0      1      0
##    2        0         1         1      1    0     0      1   1      0      0
##    3        1         0         1      0    1     1      0   0      0      1
##    4        0         0         0      0    0     0      0   0      0      0
##     Terms
## Docs statement taking temporarily time whether yield
##    1         0      0           0    0       0     0
##    2         0      0           1    1       2     2
##    3         1      1           0    1       0     2
##    4         0      0           0    0       0     0

ANOTHER ALTERNATIVE WAY OF COMBINING TERMS IN A DT MATRIX

Combining terms in a document-term matrix. Original dtm: corpOLD.dtm; terms to be combined: comb; new dtm: corpNEW.dtm.

corpOLD.dtm

## <<DocumentTermMatrix (documents: 4, terms: 43)>>
## Non-/sparse entries: 61/111
## Sparsity           : 65%
## Maximal term length: 12
## Weighting          : term frequency (tf)

dim(corpOLD.dtm)

## [1]  4 43

labels(corpOLD.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "purpose"      "purposes"     "relation"     "relevancy"    "remaining"   
## [31] "remark"       "rise"         "rules"        "second"       "see"         
## [36] "spoken"       "stated"       "statement"    "taking"       "temporarily" 
## [41] "time"         "whether"      "yield"

comb <- c("purpose", "purposes")   # words to combine
groupcomb <- toString(comb)
groupcomb                          # label for the combined terms

## [1] "purpose, purposes"

mm <- length(comb)

h <- labels(corpOLD.dtm)$Terms
# TRUE for every term that is listed in 'comb'. The membership test replaces
# the OR-accumulating loop over 2:mm, which counted downwards (2, 1) when
# 'comb' held a single term and then mixed NA values into the mask.
ind <- h %in% comb

# Matrix of the terms that are NOT being combined.
corpR.dtm <- corpOLD.dtm[, !ind]
dim(corpR.dtm)

## [1]  4 41

labels(corpR.dtm)$Terms

##  [1] "anything"     "can"          "clerk"        "desire"       "except"      
##  [6] "explain"      "explanation"  "first"        "floor"        "gentleman"   
## [11] "hence"        "house"        "know"         "made"         "member"      
## [16] "newyork"      "nothing"      "objection"    "patch"        "pending"     
## [21] "pennsylvania" "personal"     "portion"      "precluded"    "proposition" 
## [26] "relation"     "relevancy"    "remaining"    "remark"       "rise"        
## [31] "rules"        "second"       "see"          "spoken"       "stated"      
## [36] "statement"    "taking"       "temporarily"  "time"         "whether"     
## [41] "yield"

as.matrix(corpR.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition relation relevancy
##    1            0        0       0         0           0        0         0
##    2            1        0       1         0           0        0         1
##    3            1        1       1         1           1        1         0
##    4            0        1       0         0           0        0         0
##     Terms
## Docs remaining remark rise rules second see spoken stated statement taking
##    1         0      0    0     0      0   0      1      0         0      0
##    2         1      1    0     0      1   1      0      0         0      0
##    3         1      0    1     1      0   0      0      1         1      1
##    4         0      0    0     0      0   0      0      0         0      0
##     Terms
## Docs temporarily time whether yield
##    1           0    0       0     0
##    2           1    1       2     2
##    3           0    1       0     2
##    4           0    0       0     0

# Columns of the terms that are being combined.
corpN.dtm <- corpOLD.dtm[, ind]
dim(corpN.dtm)

## [1] 4 2

# Only collapse when at least one of the requested terms was found.
if (ncol(corpN.dtm) > 0) {
  # Put the per-document total into the first column, keep just that column,
  # and name it with the group label.
  corpN.dtm[, 1] <- row_sums(as.matrix(corpN.dtm))
  corpN.dtm <- corpN.dtm[, 1]
  dimnames(corpN.dtm)$Terms <- groupcomb
}

# Bind the untouched terms and the combined column side by side and convert
# the result back to a term-frequency document-term matrix.
corpNEW.dtm <- as.DocumentTermMatrix(cbind(corpR.dtm, corpN.dtm), weighting = weightTf)
dim(corpNEW.dtm)

## [1]  4 42

labels(corpNEW.dtm)$Terms

##  [1] "anything"          "can"               "clerk"            
##  [4] "desire"            "except"            "explain"          
##  [7] "explanation"       "first"             "floor"            
## [10] "gentleman"         "hence"             "house"            
## [13] "know"              "made"              "member"           
## [16] "newyork"           "nothing"           "objection"        
## [19] "patch"             "pending"           "pennsylvania"     
## [22] "personal"          "portion"           "precluded"        
## [25] "proposition"       "relation"          "relevancy"        
## [28] "remaining"         "remark"            "rise"             
## [31] "rules"             "second"            "see"              
## [34] "spoken"            "stated"            "statement"        
## [37] "taking"            "temporarily"       "time"             
## [40] "whether"           "yield"             "purpose, purposes"

as.matrix(corpNEW.dtm)

##     Terms
## Docs anything can clerk desire except explain explanation first floor gentleman
##    1        1   0     0      0      0       2           0     0     0         1
##    2        0   2     1      1      0       0           1     1     1         1
##    3        0   1     0      0      1       0           2     0     1         2
##    4        0   0     0      2      0       0           2     0     1         1
##     Terms
## Docs hence house know made member newyork nothing objection patch pending
##    1     0     0    0    0      0       0       2         0     1       0
##    2     0     0    1    0      0       0       0         0     0       0
##    3     1     1    0    1      2       1       0         1     0       1
##    4     0     0    0    0      0       1       0         0     0       0
##     Terms
## Docs pennsylvania personal portion precluded proposition relation relevancy
##    1            0        0       0         0           0        0         0
##    2            1        0       1         0           0        0         1
##    3            1        1       1         1           1        1         0
##    4            0        1       0         0           0        0         0
##     Terms
## Docs remaining remark rise rules second see spoken stated statement taking
##    1         0      0    0     0      0   0      1      0         0      0
##    2         1      1    0     0      1   1      0      0         0      0
##    3         1      0    1     1      0   0      0      1         1      1
##    4         0      0    0     0      0   0      0      0         0      0
##     Terms
## Docs temporarily time whether yield purpose, purposes
##    1           0    0       0     0                 0
##    2           1    1       2     2                 1
##    3           0    1       0     2                 2
##    4           0    0       0     0                 1

combineterms

Chapter 1 extra code for document-term matrix

Different ways of omitting and combining words in a document-term matrix

OMITTING A SINGLE WORD FROM A DOCUMENT-TERM MATRIX

OMITTING SEVERAL WORDS FROM A DOCUMENT-TERM MATRIX

COMBINING FREQUENCIES ACROSS SEVERAL TERMS AND DELETING FREQUENCIES OF INDIVIDUAL TERMS

FUNCTION combine_terms TO COMBINE TERMS (COLUMNS) IN A DT MATRIX AND DELETE THE FREQUENCIES OF INDIVIDUAL TERMS

ANOTHER ALTERNATIVE WAY OF COMBINING TERMS IN A DT MATRIX