-
Notifications
You must be signed in to change notification settings - Fork 1
/
process_kaggle.py
44 lines (32 loc) · 1.28 KB
/
process_kaggle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
Script used to process the Kaggle dataset and extract the matches.
It also creates the vocabulary from it.
https://www.kaggle.com/milesh1/35-million-chess-games
"""
import os
import re
INPUT_PATH = "dataset/kaggle2.txt"
OUTPUT_PATH = "dataset/processed_kaggle2.txt"
VOCAB_PATH = "vocabs/kaggle2.txt"

# Side-and-move-number prefix attached to every move, e.g. "W12." or "B7.".
# The dot is escaped so that a rank-disambiguated bishop move such as "B1d2"
# is not mistaken for a prefix and truncated.
MOVE_PREFIX_RE = re.compile(r"[WB]\d+\.")


def extract_moves(line: str) -> str:
    """Return the move text of one dataset line with the prefixes removed.

    The move list is the segment that follows the first ``###`` separator.
    An empty string is returned when the line has no separator or when
    nothing is left after stripping whitespace and prefixes.
    """
    parts = line.split("###")
    if len(parts) < 2:
        # Lines without a separator carry no moves and are skipped.
        return ""
    return MOVE_PREFIX_RE.sub("", parts[1].strip())


def process_games(input_path: str, output_path: str) -> set:
    """Write one cleaned game per line to *output_path*; return the vocabulary.

    The vocabulary is the set of distinct space-separated tokens seen in the
    cleaned games. I/O errors are propagated rather than silently ignored.
    """
    vocab = set()
    # Open the input first so a missing input does not leave behind an
    # empty, truncated output file.
    with open(input_path, "r", encoding="utf-8") as inpf, \
            open(output_path, "w", encoding="utf-8") as outf:
        for line in inpf:
            moves = extract_moves(line)
            if not moves:
                continue
            outf.write(moves + "\n")
            vocab.update(token for token in moves.split(" ") if token)
    return vocab


def write_vocab(vocab, vocab_path: str) -> None:
    """Write the vocabulary to *vocab_path*, one token per line, sorted.

    Sorting makes the file reproducible between runs; the parent directory
    is created when it does not exist yet.
    """
    directory = os.path.dirname(vocab_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(token + "\n" for token in sorted(vocab))


if __name__ == "__main__":
    vocab_counter = process_games(INPUT_PATH, OUTPUT_PATH)
    write_vocab(vocab_counter, VOCAB_PATH)