forked from PaddlePaddle/PaddleNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_data.sh
55 lines (47 loc) · 1.71 KB
/
gen_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
echo "Downloading dataset..."
CUR_DIR=$PWD
mkdir -p gen_data
cd ./gen_data/
if [ ! -d "wikitext-103" ]; then
echo "Downloading wikitext-103..."
wget -O wikitext-103-v1.zip https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
echo "Unzip wikitext-103..."
unzip wikitext-103-v1.zip
cd wikitext-103
# Rename
mv wiki.train.tokens train.txt
mv wiki.valid.tokens valid.txt
mv wiki.test.tokens test.txt
cd -
fi
if [ ! -d 'enwik8' ]; then
mkdir -p enwik8
cd enwik8
echo "Downloading enwik8..."
wget -O enwik8.zip http://mattmahoney.net/dc/enwik8.zip
wget -O prep_enwik8.py https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py
python3 prep_enwik8.py
rm -f prep_enwik8.py
cd -
fi
if [ ! -d 'text8' ]; then
mkdir -p text8
cd text8
echo "Downloading text8..."
wget -O text8.zip http://mattmahoney.net/dc/text8.zip
python ${CUR_DIR}/utils/preprocess_text8.py 5000000
cd -
fi
if [ ! -d 'one-billion-words' ]; then
mkdir -p one-billion-words
cd one-billion-words
echo "Downloading one-billion-words..."
wget -O 1-billion-word-language-modeling-benchmark-r13output.tar.gz http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
tar xzf 1-billion-word-language-modeling-benchmark-r13output.tar.gz
dir="./1-billion-word-language-modeling-benchmark-r13output/heldout-monolingual.tokenized.shuffled/"
cat ${dir}/news.en.heldout-00000-of-00050 > valid.txt
cat ${dir}/news.en.heldout-00000-of-00050 > test.txt
wget -O 1b_word_vocab.txt https://github.com/rafaljozefowicz/lm/raw/master/1b_word_vocab.txt
cd -
fi
echo "All done. "