# text_correcter_data_readers.py
# Forked from atpaino/deep-text-corrector.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

from data_reader import DataReader, PAD_TOKEN, EOS_TOKEN, GO_TOKEN


class PTBDataReader(DataReader):
    """
    DataReader used to read in the Penn Treebank dataset.
    """

    UNKNOWN_TOKEN = "<unk>"  # Already defined in the source data.

    DROPOUT_WORDS = {"a", "an", "the"}
    DROPOUT_PROB = 0.25

    REPLACEMENTS = {"there": "their", "their": "there"}
    REPLACEMENT_PROB = 0.25

    def __init__(self, config, train_path):
        super(PTBDataReader, self).__init__(
            config, train_path, special_tokens=[PAD_TOKEN, GO_TOKEN, EOS_TOKEN])

        self.UNKNOWN_ID = self.token_to_id[PTBDataReader.UNKNOWN_TOKEN]

    def read_samples_by_string(self, path):
        for line in self.read_tokens(path):
            source = []
            target = []
            for token in line:
                target.append(token)

                # Randomly drop some words from the input, and randomly swap
                # commonly confused words, so the model learns to map the
                # perturbed source sequence back to the clean target.
                dropout_word = (token in PTBDataReader.DROPOUT_WORDS and
                                random.random() < PTBDataReader.DROPOUT_PROB)
                replace_word = (token in PTBDataReader.REPLACEMENTS and
                                random.random() <
                                PTBDataReader.REPLACEMENT_PROB)

                if replace_word:
                    source.append(PTBDataReader.REPLACEMENTS[token])
                elif not dropout_word:
                    source.append(token)

            yield source, target

    def unknown_token(self):
        return PTBDataReader.UNKNOWN_TOKEN

    def read_tokens(self, path):
        with open(path, "r") as f:
            for line in f:
                yield line.strip().split()
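

# Illustrative example (not in the original source): for the PTB line
# "the cat sat on the mat", read_samples_by_string could yield
#     source: ["cat", "sat", "on", "the", "mat"]   (first "the" dropped)
#     target: ["the", "cat", "sat", "on", "the", "mat"]
# The perturbations depend on random.random(), so each pass over the data
# produces different source sequences.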


class MovieDialogReader(DataReader):
    """
    DataReader used to read and tokenize data from the Cornell open movie
    dialog dataset.
    """

    UNKNOWN_TOKEN = "UNK"

    DROPOUT_TOKENS = {"a", "an", "the", "'ll", "'s", "'m", "'ve"}  # TODO: add "to".

    REPLACEMENTS = {"there": "their", "their": "there", "then": "than",
                    "than": "then"}  # TODO: add "be": "to".

    def __init__(self, config, train_path, dropout_prob=0.25,
                 replacement_prob=0.25, dataset_copies=2):
        super(MovieDialogReader, self).__init__(
            config, train_path, special_tokens=[
                PAD_TOKEN, GO_TOKEN, EOS_TOKEN,
                MovieDialogReader.UNKNOWN_TOKEN],
            dataset_copies=dataset_copies)

        self.dropout_prob = dropout_prob
        self.replacement_prob = replacement_prob

        self.UNKNOWN_ID = self.token_to_id[MovieDialogReader.UNKNOWN_TOKEN]

    def read_samples_by_string(self, path):
        for tokens in self.read_tokens(path):
            source = []
            target = []
            for token in tokens:
                target.append(token)

                # Same perturbation scheme as PTBDataReader, but using the
                # per-instance probabilities instead of class constants.
                dropout_token = (token in MovieDialogReader.DROPOUT_TOKENS and
                                 random.random() < self.dropout_prob)
                replace_token = (token in MovieDialogReader.REPLACEMENTS and
                                 random.random() < self.replacement_prob)

                if replace_token:
                    source.append(MovieDialogReader.REPLACEMENTS[token])
                elif not dropout_token:
                    source.append(token)

            yield source, target

    def unknown_token(self):
        return MovieDialogReader.UNKNOWN_TOKEN

    def read_tokens(self, path):
        with open(path, "r") as f:
            for line in f:
                # Alternative tokenization: nltk.word_tokenize(line.lower().strip())
                yield line.lower().strip().split()
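

# A minimal usage sketch, not part of the original module. It assumes a
# `config` object and a training-file path compatible with
# data_reader.DataReader (both hypothetical here) and prints a few
# perturbed/clean sample pairs.
if __name__ == "__main__":
    import sys

    train_path = sys.argv[1]  # e.g. a preprocessed movie-dialog text file

    # DataReader's expected `config` is defined elsewhere in the project;
    # None is only a placeholder for this sketch.
    reader = MovieDialogReader(None, train_path)

    for i, (source, target) in enumerate(
            reader.read_samples_by_string(train_path)):
        print("source:", " ".join(source))
        print("target:", " ".join(target))
        if i >= 4:
            break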