-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_model.R
71 lines (61 loc) · 2.54 KB
/
test_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Load the packages this script depends on. library() is used instead of
# require() so that a missing package stops the script immediately with a
# clear error, rather than returning FALSE and failing later on the first
# call into that package.
library(tm)
library(NLP)
library(openNLP)

# Source the model-building script only when no list named `m` exists yet
# (build_model.R is expected to define `m`; the functions below pass `m` to
# predict.next.word()).
if (!exists("m", mode = "list")) {
  source("build_model.R")
}
# Split a piece of text into sentences.
#
# Args:
#   text: the text to split; it is coerced to an NLP `String`.
#   lang: language handed to the OpenNLP Maxent sentence detector
#         (defaults to "en").
#
# Returns:
#   The substrings of `text` selected by the sentence annotations.
convert_text_to_sentences <- function(text, lang = "en") {
  # Sentence detector for the requested language.
  detector <- Maxent_Sent_Token_Annotator(language = lang)
  # NLP::String allows subsetting by annotation spans.
  doc <- as.String(text)
  # Annotate sentence boundaries, then cut the text at those boundaries.
  spans <- annotate(doc, detector)
  doc[spans]
}
# Global tallies, updated in .GlobalEnv by process.single.sentence():
#   w1match / w1miss - the first prediction did / did not equal the actual
#                      next word
#   w3match / w3miss - the actual next word was / was not among the returned
#                      predictions
w1match <- 0L
w1miss <- 0L
w3match <- 0L
w3miss <- 0L
# Evaluate next-word prediction on a single sentence.
#
# The sentence is tokenized with tokenize.sentence(). For every prefix of the
# tokens (skipping the leading start marker and keeping the last word as a
# prediction target only), predict.next.word() is asked for candidates with
# num.possibilities = 3 and the candidates are compared with the word that
# actually follows. The outcome is accumulated in the global counters
# w1match/w1miss (first candidate) and w3match/w3miss (any candidate), and one
# progress line is printed per prediction.
#
# Args:
#   rawSentence: sentence text, passed unchanged to tokenize.sentence().
#
# Returns:
#   NULL, invisibly; the function is called for its side effects.
process.single.sentence <- function(rawSentence) {
  tokens <- tokenize.sentence(rawSentence)

  # tokens[1] is the start marker and the last token can only be a target,
  # so at least three tokens are needed. Without this guard `2:(n - 1)` would
  # count backwards (2, 1 or 2, 1, 0) for short sentences.
  last_prefix_end <- length(tokens) - 1L
  if (last_prefix_end < 2L) {
    return(invisible(NULL))
  }

  # Read and write the counters explicitly in the global environment so that
  # both sides of the update refer to the same variable.
  read_counter <- function(name) get(name, envir = .GlobalEnv)
  bump_counter <- function(name, by) {
    assign(name, read_counter(name) + by, envir = .GlobalEnv)
  }

  for (nToken in 2:last_prefix_end) {
    # Context handed to the model: the words so far followed by one space.
    testStr <- paste0(paste(tokens[2:nToken], collapse = " "), " ")
    predictions <- predict.next.word(m, testStr, num.possibilities = 3)

    if (is.null(predictions) || nrow(predictions) == 0) {
      # testStr must be an argument of sprintf(), not of print().
      print(sprintf("Error - No variants given, context = [%s]", testStr))
      next
    }

    nextWord <- tokens[nToken + 1]
    candidates <- predictions$token

    # `candidates[1]` is a single value for both data.frame and data.table
    # results, so the counters stay scalar.
    w1m <- nextWord == candidates[1]
    bump_counter("w1match", w1m)
    bump_counter("w1miss", 1 - w1m)

    w3m <- nextWord %in% candidates
    bump_counter("w3match", w3m)
    bump_counter("w3miss", 1 - w3m)

    hits <- as.numeric(read_counter("w3match"))
    misses <- as.numeric(read_counter("w3miss"))
    print(sprintf("%s \"%s\", variants = {%s}, success rate = %.4f",
                  if (w3m) "Match" else "Miss",
                  nextWord, paste(candidates, collapse = ", "),
                  hits / (hits + misses)))
  }
  invisible(NULL)
}
# Run the prediction evaluation over every sentence of a text file.
#
# The file is read line by line as UTF-8; each non-blank line is split into
# sentences with convert_text_to_sentences() and every sentence is passed to
# process.single.sentence().
#
# Args:
#   fileName: path of the text file to evaluate.
#
# Returns:
#   NULL, invisibly; the function is called for its side effects.
process.single.file <- function(fileName) {
  con <- file(fileName, "r", blocking = FALSE)
  # Release the connection even if reading or processing fails.
  on.exit(close(con), add = TRUE)
  text <- readLines(con, encoding = "UTF-8")

  # seq_along() yields an empty loop for an empty file or an empty sentence
  # list, where 1:length(x) would iterate over c(1, 0) and index out of bounds.
  for (lineNum in seq_along(text)) {
    line <- text[[lineNum]]
    # A blank line holds no sentence to evaluate.
    if (!nzchar(trimws(line))) {
      next
    }
    sentences <- convert_text_to_sentences(line)
    for (sentNum in seq_along(sentences)) {
      process.single.sentence(sentences[[sentNum]])
    }
  }
  invisible(NULL)
}
# Script entry point: evaluate the model on the test3.txt sample.
# The commented-out calls below are alternative input files, currently disabled.
process.single.file('./src_data/en_US/test3.txt')
#process.single.file('./src_data/en_US/en_US.news.txt')
#process.single.file('./src_data/en_US/en_US.twitter.txt')