This repository has been archived by the owner on Oct 3, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 26
/
build_wordvec.py
102 lines (82 loc) · 3.21 KB
/
build_wordvec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Create word vector space from the crawled dataset
@author TaoPR (github.com/starcolon)
"""
import os
import sys
import codecs
import argparse
import word2vec
from termcolor import colored
from nltk.tokenize.punkt import PunktSentenceTokenizer
from pylib.knowledge.datasource import MineDB
from pylib.text.cleanser import *
arguments = argparse.ArgumentParser()
arguments.add_argument('--verbose', dest='verbose', action='store_true', help='Turn verbose output on.')
arguments.add_argument('--limit', type=int, default=100, help='Maximum number of topics we want to import')
arguments.add_argument('--out', type=str, default='./models/word2vec.bin', help='Output path of the word2vec binary model.')
args = vars(arguments.parse_args(sys.argv[1:]))
def model_from_crawl_collection(mineDB, output_path):
# Dump sentences out of the DB
print(colored('Exporting crawled data to text file...','cyan'))
text_path = export_crawl_to_text(mineDB)
print(colored('[Done]','green'))
# Train word2vec model
print(colored('Training word2vec...','cyan'))
model = create_model(text_path, output_path)
print(colored('[Done]','green'))
print('Word2Vec model is saved at : {}'.format(output_path))
return model
def export_crawl_to_text(mineDB):
# Prepare a naive sentence tokeniser utility
pst = PunktSentenceTokenizer()
text_path = os.path.realpath('./mine.txt')
with codecs.open(text_path, 'w', 'utf-8') as f:
m = 0
for wiki in mineDB.query({'downloaded': True},field=None):
# Skip empty content or the added one
if wiki['content'] is None or 'added_to_graph' in wiki:
continue
content = wiki['content']
# A wiki page may probably comprise of multiple content
for c in content['contents']:
# Explode content into sentences
sentences = pst.sentences_from_text(c)
print('... content #{} ==> {} sentences extracted.'.format(m, len(sentences)))
for s in sentences:
# Cleanse the sentence
s_ = cleanse(s)
# Filter out noise by length
if len(s_)<5 or len(s_.split(' '))<3:
continue
f.write(s_.lower() + '\n')
m += 1
if m>=args['limit']:
print(colored('[Ending] Maximum number of topics reached.','yellow'))
break
return text_path
def create_model(input_path, output_path):
word2vec.word2vec(\
input_path, \
output_path, \
size=10, binary=1, verbose=True)
assert(os.path.isfile(output_path))
#return word2vec.load(output_path)
return word2vec.WordVectors.from_binary(output_path, encoding='ISO-8859-1')
def repl(model):
while True:
w = input('Enter a word to try: ')
indexes, metrics = model.cosine(w)
print('... Similar words : {}', model.vocab[indexes])
if __name__ == '__main__':
mineDB = crawl_collection = MineDB('localhost','vor','crawl')
model = model_from_crawl_collection(mineDB, os.path.realpath(args['out']))
# Examine the model properties
if model is None:
print(colored('[ERROR] Model is empty.','red'))
else:
print(colored('[Word2Vec model spec]','cyan'))
print('... Model shape : {}'.format(model.vectors.shape))
# Execute a playground REPL
print(colored('[Word2Vec REPL]:','cyan'))
repl(model)