forked from bquast/Data-Science-Capstone
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tm.R
105 lines (82 loc) · 2.58 KB
/
tm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# tm.R
# Bastiaan Quast
# load the magrittr package for piping
library(magrittr)
# load the sampled text data
# NOTE(review): presumably this file provides sample_blogs, sample_news and
# sample_twitter, which are used below -- confirm against the script that
# writes sample.RData
load("sample.RData")

# load the text mining package 'tm'
library(tm)

# Build a whitespace-normalised VCorpus from one text sample.
#
# VectorSource() is used instead of data.frame() + DataframeSource():
# recent tm releases (0.7 series) require the data frame handed to
# DataframeSource() to carry `doc_id` and `text` columns, which a bare
# data.frame() of the sample does not have, so the original chain stops
# with an error there.
# NOTE(review): assumes `texts` is a character vector with one document per
# element -- TODO confirm.
build_corpus <- function(texts) {
  texts %>%
    VectorSource() %>%
    VCorpus() %>%
    tm_map(stripWhitespace)
}

# create the corpus for each source
vc_blogs <- build_corpus(sample_blogs)
vc_news <- build_corpus(sample_news)
vc_twitter <- build_corpus(sample_twitter)

# combine the three corpora into a single corpus
vc_all <- c(vc_blogs, vc_news, vc_twitter)
# Term-document and document-term matrices of single words, built with
# punctuation, numbers and stopwords removed.
#
# The original control lists also carried `removeSparseTerms = 0.8`. That
# entry is dropped: removeSparseTerms() is a standalone tm function and is
# not among the control options documented for termFreq(), so it appears to
# have been ignored. To really prune rare terms, call
# removeSparseTerms(tdm_sparse, 0.8) on the finished matrix instead.
unigram_control <- list(
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE
)

tdm_sparse <- TermDocumentMatrix(vc_all, control = unigram_control)
dtm_sparse <- DocumentTermMatrix(vc_all, control = unigram_control)

# save() names the stored objects after the expressions it is given, so
# piping into it stores the matrix under the name "." instead of
# "tdm_sparse". Pass the object by name so load() restores `tdm_sparse`.
save(tdm_sparse, file = "tdm_sparse.RData")

# the unigram term-document matrix is on disk now; free its memory
rm(tdm_sparse)
gc()
# n-grams

# load the RWeka library
library(RWeka)

# Tokenizers for two-word and three-word sequences. Each one calls
# NGramTokenizer() with min and max set to the same n-gram length.
BiGramTokenizer <- function(x) {
  two_words <- Weka_control(min = 2, max = 2)
  NGramTokenizer(x, two_words)
}
TriGramTokenizer <- function(x) {
  three_words <- Weka_control(min = 3, max = 3)
  NGramTokenizer(x, three_words)
}
# find bi-grams across the combined corpus (blogs, news and twitter)
#
# `removeSparseTerms = 0.8` is no longer passed in the control list:
# removeSparseTerms() is a standalone tm function and is not among the
# control options documented for termFreq(), so it appears to have been
# ignored.
tdm_bi_sparse <- TermDocumentMatrix(
  vc_all,
  control = list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    tokenize = BiGramTokenizer
  )
)

# pass the object by name: piping into save() stores it under the name "."
# instead of "tdm_bi_sparse"
save(tdm_bi_sparse, file = "tdm_bi_sparse.RData")

# list bi-grams with at least 2000 occurrences
bi_grams_2000 <- findFreqTerms(tdm_bi_sparse, lowfreq = 2000)
bi_grams_2000

# free the memory held by the bi-gram matrix
rm(tdm_bi_sparse)
gc()
# find tri-grams across the combined corpus
#
# `removeSparseTerms = 0.8` is no longer passed in the control list:
# removeSparseTerms() is a standalone tm function and is not among the
# control options documented for termFreq(), so it appears to have been
# ignored.
tdm_tri_sparse <- TermDocumentMatrix(
  vc_all,
  control = list(
    removePunctuation = TRUE,
    removeNumbers = TRUE,
    stopwords = TRUE,
    tokenize = TriGramTokenizer
  )
)

# list tri-grams with at least 1 occurrence (lowfreq = 1)
tri_grams <- findFreqTerms(tdm_tri_sparse, lowfreq = 1)
tri_grams

# pass the object by name: piping into save() stores it under the name "."
# instead of "tdm_tri_sparse"
save(tdm_tri_sparse, file = "tdm_tri_sparse.RData")

# free the memory held by the tri-gram matrix
rm(tdm_tri_sparse)
gc()