# analyze.R
library(tidytext)
library(tidyverse)
library(stringr)
library(gutenbergr)
library(gridExtra)
# Read in a local file to process, returning a tibble with a `text` column
# (one row per line) so the result can flow straight into process_text()
# (and further preprocessing could still happen here!)
slurp <- function(filename) {
  file_text <- tibble(text = read_lines(filename))
  return(file_text)
}
# Look up a Project Gutenberg ID by title and download the book
get_book <- function(pg_title) {
  pg_id <- gutenberg_metadata %>%
    filter(title == pg_title) %>%
    select(gutenberg_id)
  # A title may match more than one metadata row; gutenberg_download()
  # accepts the whole vector of matching IDs
  book <- gutenberg_download(pg_id[[1]])
  return(book)
}
# Basic, initial text processing: tokenise into words, strip characters
# outside [a-z'], and remove stop words
process_text <- function(file_text) {
  tidied <- file_text %>%
    unnest_tokens(word, text) %>%
    mutate(word = str_extract(word, "[a-z']+")) %>%
    filter(!is.na(word)) %>%  # drop tokens with no letters (e.g. numbers)
    anti_join(stop_words, by = "word")
  return(tidied)
}
# Plot word counts (takes pre-processed text from the function above)
word_counts <- function(ptext) {
  plt <- count(ptext, word, sort = TRUE) %>%
    top_n(10) %>%
    ggplot(aes(reorder(word, n), n)) +
    geom_col() +
    coord_flip() +
    labs(x = "Word", y = "Count")
  return(plt)
}
# Join each sentiment lexicon to the word tibble
bind_sentiments <- function(words) {
  # Use purrr and a parameter to facilitate arbitrary lexicons?
  # (A possible purrr-based version is sketched below.)
  nrc_sent <- inner_join(get_sentiments("nrc"), words, by = "word")
  afinn_sent <- inner_join(get_sentiments("afinn"), words, by = "word")
  bing_sent <- inner_join(get_sentiments("bing"), words, by = "word")
  # Return as a list
  return(list(nrc = nrc_sent, afinn = afinn_sent, bing = bing_sent))
}
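# A minimal sketch of the purrr idea raised in the comment above: take the
# lexicon names as a parameter and map over them, so arbitrary lexicons can
# be joined. Illustrative only and not used elsewhere in this script;
# `bind_sentiments2` and the `lexicons` argument are hypothetical names.
bind_sentiments2 <- function(words, lexicons = c("nrc", "afinn", "bing")) {
  lexicons %>%
    set_names() %>%
    map(~ inner_join(get_sentiments(.x), words, by = "word"))
}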
# Get NRC sentiment counts
get_nrc_count <- function(words_sent) {
  return(count(words_sent, sentiment, sort = TRUE))
}
# Get AFINN summary statistics and counts
# (recent tidytext releases name the AFINN column `value` rather than `score`)
get_afinn_counts <- function(words_sent) {
  total <- sum(words_sent$value)
  avg <- mean(words_sent$value)
  med <- median(words_sent$value)
  counts <- count(words_sent, value, sort = TRUE)
  return(list(total = total, avg = avg, med = med, counts = counts))
}
# Get Bing sentiment counts
get_bing_count <- function(words_sent) {
  return(count(words_sent, sentiment, sort = TRUE))
}
# Plot the sentiment counts from all three lexicons, stacked in one column
sentiment_counts <- function(ptext) {
  sents <- bind_sentiments(ptext)
  nrc_plt <- get_nrc_count(sents$nrc) %>%
    top_n(10) %>%
    ggplot(aes(reorder(sentiment, n), n)) +
    geom_col() +
    coord_flip() +
    labs(x = "Sentiment", y = "Count",
         title = "NRC - Emotions")
  afinn_plt <- get_afinn_counts(sents$afinn)$counts %>%
    top_n(10) %>%
    ggplot(aes(value, n)) +
    geom_col() +
    coord_flip() +
    labs(x = "Sentiment", y = "Count",
         title = "AFINN - +/- Point Scale")
  bing_plt <- get_bing_count(sents$bing) %>%
    top_n(10) %>%
    ggplot(aes(reorder(sentiment, n), n)) +
    geom_col() +
    coord_flip() +
    labs(x = "Sentiment", y = "Count",
         title = "BING - +/-")
  return(grid.arrange(nrc_plt, afinn_plt, bing_plt, ncol = 1))
}
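# A minimal usage sketch, commented out so the script can be sourced without
# side effects. It assumes the NRC/AFINN lexicons have already been downloaded
# (via the textdata package) and uses an example title; any title present in
# gutenberg_metadata would work.
# book <- get_book("Pride and Prejudice")
# tidy_words <- process_text(book)
# word_counts(tidy_words)       # top-10 word frequency plot
# sentiment_counts(tidy_words)  # NRC / AFINN / Bing sentiment plots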