remove_words.py
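# Clean the raw text corpus for the chosen dataset: remove NLTK English stop
# words and words that occur fewer than 5 times (the 'mr' dataset is kept
# intact), write the result to data/corpus/<dataset>.clean.txt, and print
# min/max/average document lengths of the cleaned corpus.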
from nltk.corpus import stopwords
import nltk
from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
from utils import clean_str, loadWord2Vec
import sys
if len(sys.argv) != 2:
    sys.exit("Use: python remove_words.py <dataset>")
datasets = ['20ng', 'R8', 'R52', 'ohsumed', 'mr']
dataset = sys.argv[1]
if dataset not in datasets:
    sys.exit("Unknown dataset name; expected one of: " + ', '.join(datasets))
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)
# Read Word Vectors
# word_vector_file = 'data/glove.6B/glove.6B.200d.txt'
# vocab, embd, word_vector_map = loadWord2Vec(word_vector_file)
# word_embeddings_dim = len(embd[0])
# dataset = '20ng'
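# Read the raw corpus; each line is one document.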
doc_content_list = []
f = open('data/corpus/' + dataset + '.txt', 'rb')
# f = open('data/wiki_long_abstracts_en_text.txt', 'r')
for line in f.readlines():
    doc_content_list.append(line.strip().decode('latin1'))
f.close()
word_freq = {} # to remove rare words
for doc_content in doc_content_list:
    temp = clean_str(doc_content)
    words = temp.split()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
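# Rebuild each document, keeping every token for 'mr' and otherwise dropping
# stop words and rare words (frequency < 5).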
clean_docs = []
for doc_content in doc_content_list:
    temp = clean_str(doc_content)
    words = temp.split()
    doc_words = []
    for word in words:
        # word not in stop_words and word_freq[word] >= 5
        if dataset == 'mr':
            doc_words.append(word)
        elif word not in stop_words and word_freq[word] >= 5:
            doc_words.append(word)
    doc_str = ' '.join(doc_words).strip()
    # if doc_str == '':
    #     doc_str = temp
    clean_docs.append(doc_str)
clean_corpus_str = '\n'.join(clean_docs)
f = open('data/corpus/' + dataset + '.clean.txt', 'w')
#f = open('data/wiki_long_abstracts_en_text.clean.txt', 'w')
f.write(clean_corpus_str)
f.close()
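# Report document length statistics of the cleaned corpus.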
#dataset = '20ng'
min_len = 10000
aver_len = 0
max_len = 0
f = open('data/corpus/' + dataset + '.clean.txt', 'r')
#f = open('data/wiki_long_abstracts_en_text.txt', 'r')
lines = f.readlines()
for line in lines:
    line = line.strip()
    temp = line.split()
    aver_len = aver_len + len(temp)
    if len(temp) < min_len:
        min_len = len(temp)
    if len(temp) > max_len:
        max_len = len(temp)
f.close()
aver_len = 1.0 * aver_len / len(lines)
print('min_len : ' + str(min_len))
print('max_len : ' + str(max_len))
print('average_len : ' + str(aver_len))