clean_data_from_bigquery.py
import sys

import pandas as pd
from nltk.tokenize import wordpunct_tokenize
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()


def normalise_search_terms(terms):
    """Tokenise a query and stem each token, so variants of the same term collapse together."""
    tokens = wordpunct_tokenize(terms)
    return ' '.join([porter_stemmer.stem(token) for token in tokens])
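# Illustrative examples (outputs assumed; exact stems depend on the NLTK version):
#   normalise_search_terms('Renewing passports') -> 'renew passport'
#   normalise_search_terms('self-assessment')    -> 'self - assess'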


def filter_out_queries_with_not_enough_sessions(df):
    """
    The number of sessions retrieved will vary by search term.
    The volume of searches for a particular term may vary over time.
    I'm just going to ignore anything where I haven't pulled back enough sessions.
    """
    queries_before = df.searchTerm.nunique()
    # Count the distinct sessions seen for each normalised search term
    term_summary = df.groupby('searchTerm').agg({'searchSessionId': lambda x: x.nunique()})
    enough_sessions = term_summary[term_summary.searchSessionId > 1000]
    queries_after = len(enough_sessions.index)
    if queries_after < queries_before:
        print(f'Filtered out {queries_before - queries_after} queries with 1000 or fewer sessions')
        print(f'There are now {queries_after} queries in the dataset')
    return df[df.searchTerm.isin(enough_sessions.index)]
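# A terser equivalent of the aggregation above (a sketch; same result on any
# reasonably recent pandas):
#   session_counts = df.groupby('searchTerm')['searchSessionId'].nunique()
#   enough_sessions = session_counts[session_counts > 1000]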


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: python clean_data_from_bigquery.py [input_filename] [output_filename]')
        sys.exit(1)
    # There are a handful of searches for literally "null".
    # Don't interpret those as missing values.
    df = pd.read_csv(sys.argv[1], na_filter=False)
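    # Without na_filter=False, pandas would convert strings like "null" and "NA"
    # to NaN on read. A quick way to see the difference (hypothetical snippet):
    #   import io
    #   pd.read_csv(io.StringIO('searchTerm\nnull')).searchTerm.isna().any()                   # True
    #   pd.read_csv(io.StringIO('searchTerm\nnull'), na_filter=False).searchTerm.isna().any()  # False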
    print('Renaming columns')
    df = df.rename(
        {
            'ga:productSku': 'contentIdOrPath',
            'linkPosition': 'rank',
            'sessionId': 'searchSessionId',
            'searchTerm': 'originalSearchTerm'
        }, axis='columns'
    )
    df['searchTerm'] = df.originalSearchTerm.apply(normalise_search_terms)
    # Disabled for now: optionally drop queries without enough sessions
    # df = filter_out_queries_with_not_enough_sessions(df)
    print(f'There are {df.searchSessionId.nunique()} unique sessions in the dataset')
    df.to_csv(sys.argv[2], index=False)
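# Example invocation (filenames are hypothetical):
#   python clean_data_from_bigquery.py raw_bigquery_export.csv cleaned_searches.csv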