-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_word_frequencies.py
58 lines (52 loc) · 2.14 KB
/
get_word_frequencies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
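"""
Save approximate word frequencies to a CSV file.

A word's frequency is approximated by the average number of times it is
tweeted per day. Words that already have a row in the output file are skipped,
so an interrupted run can be resumed by invoking the script again.
"""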
import argparse
import os
import time

import numpy as np
import pandas as pd

## Internal imports
from twitter_api import *
import config
from utils import make_sure_df_exists
if __name__ == '__main__':
    # Script entry point: for every selected word of the requested type, look
    # up its approximate tweet frequency and append one CSV row
    # (frequency, word, year, type) to the frequency file for that type/year.

    # Request budget used to decide when to pause.
    # NOTE(review): presumably the API's cap per 15-minute window -- confirm.
    REQUEST_LIMIT = 300
    PATHS = config.FREQ_FILE_NAMES

    parser = argparse.ArgumentParser()
    parser.add_argument("--type", type=str, default="slang") #{"slang","nonslang","both","sample"}
    parser.add_argument("--year", type=int, default=2010)
    parser.add_argument("--save-dir", type=str, default="data/frequencies/")
    parser.add_argument("--num-dates", type=int, default=40)
    parser.add_argument("--words", type=str, default="data/all_words.csv")
    args = parser.parse_args()

    # Keep only the words whose `type` column matches the requested type.
    selected_words_df = pd.read_csv(args.words)
    words_list = list(selected_words_df[selected_words_df.type == args.type].word)

    freq_file_path = os.path.join(args.save_dir, PATHS[args.type + str(args.year)])
    make_sure_df_exists(freq_file_path)
    print("saving word frequencies under", freq_file_path)

    # Each lookup is budgeted at `num_dates` requests, so this many lookups fit
    # in the budget before a pause. Stored as an int and compared with `>=` so
    # the pause can never be skipped by stepping over the threshold.
    num_words_until_pause = int(np.ceil(REQUEST_LIMIT / args.num_dates))
    hour_gap = 24
    num_words_since_pause = 0

    # Read the already-saved words once instead of re-parsing the whole CSV on
    # every iteration; the set is kept in sync as new rows are appended.
    saved_words = set(pd.read_csv(freq_file_path).word)

    for word in words_list:
        word = word.lower()
        if word in saved_words:
            continue

        # Pause *before* the next lookup rather than right after a save, so no
        # time is wasted sleeping once the last word has been processed.
        if num_words_since_pause >= num_words_until_pause:
            ## Wait 15 minutes to avoid request rate restrictions
            num_words_since_pause = 0
            print("will wait 15 minutes now")
            time.sleep(15*60)
            print("finished waiting")

        print("getting frequency for", word)
        freq = approx_freq(word, year=args.year,
                           num_dates=args.num_dates,
                           hour_gap=hour_gap,)
        # Count every lookup, including failed ones: a failed lookup may still
        # have consumed requests, so it is charged against the budget too.
        num_words_since_pause += 1
        if freq == -1:
            # -1 is treated as "no frequency available"; nothing is written,
            # so the word will be retried on a later run.
            continue

        # pandas reads the file back as UTF-8 by default, so write it as UTF-8
        # instead of relying on the platform's locale encoding.
        with open(freq_file_path, "a", encoding="utf-8") as freq_file:
            row = ",".join(["%.2f" % freq, word, str(args.year), args.type])
            # Rows are newline-prefixed: the file is expected not to end with
            # a trailing newline.
            freq_file.write("\n" + row)
        saved_words.add(word)
        print("saved tweets for", word)