forked from rostro36/Partisan-Responses
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
63 lines (50 loc) · 2 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
import spacy
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import torch
from allennlp.predictors.predictor import Predictor
import tensorflow_hub as hub
from tqdm import tqdm
# Device index passed to the AllenNLP predictors below: GPU 0 when CUDA is
# available, -1 otherwise (AllenNLP's setting for running on the CPU).
# TODO: is there a non hard-code way?
cuda_device = 0 if torch.cuda.is_available() else -1

# Shared NLP resources, loaded once when the module is imported.
# spaCy pipeline; used by lemmatize().
sp = spacy.load('en_core_web_sm')

# Universal Sentence Encoder (v4) loaded from TensorFlow Hub.
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# AllenNLP predictor built from the public open-information-extraction archive.
open_info_extractor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz",
    cuda_device=cuda_device,
)

# AllenNLP predictor built from the public SpanBERT coreference archive.
coref_extractor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz",
    cuda_device=cuda_device,
)
def remove_wordy(s, wordy_list):
    """
    Strip every occurrence of each given substring from a string.

    The substrings are removed one after another, in the order they appear
    in ``wordy_list``; a later entry is therefore matched against the text
    that is left after the earlier removals.

    :param s: text to clean
    :param wordy_list: iterable of substrings to delete from the text
    :return: the text with all listed substrings removed
    """
    cleaned = s
    for phrase in wordy_list:
        cleaned = cleaned.replace(phrase, "")
    return cleaned
def add_stemmed_col_to_df(df, speeches_col, stemmed_col):
    """
    Stem the speeches of one dataframe column and store them in another column.

    Every speech is tokenized with NLTK's ``word_tokenize``; each token is
    lower-cased and stemmed with a ``PorterStemmer``, and the stems are joined
    back together with single spaces. Progress is reported through tqdm's
    pandas integration.

    :param df: dataframe containing at least a column with speeches; the
        stemmed column is assigned onto this same object (it is not copied)
    :param speeches_col: name of the column holding the speeches
        (assumed to contain strings -- TODO confirm against callers)
    :param stemmed_col: name of the column that receives the stemmed speeches
    :return: the dataframe that was passed in, with the stemmed column added
    """
    stemmer = PorterStemmer()

    def _stem_speech(speech):
        # Tokenize, lower-case and stem each token, then re-join with spaces.
        return " ".join(stemmer.stem(token.lower()) for token in word_tokenize(speech))

    # Registers ``progress_apply`` on pandas objects.
    tqdm.pandas()
    df[stemmed_col] = df[speeches_col].progress_apply(_stem_speech)
    return df
def lemmatize(phrase):
    """
    Lemmatize a piece of text with the module-level spaCy pipeline ``sp``.

    :param phrase: text
    :return: the lemma of every token in the text, joined by single spaces
    """
    parsed = sp(phrase)
    return " ".join(token.lemma_ for token in parsed)
if __name__ == "__main__":
    # Stem the "Questions" column of the filtered speeches and save the result.
    source_speeches = pd.read_pickle("all_speech_sentence_filtered.pkl")
    stemmed_speeches = add_stemmed_col_to_df(source_speeches, "Questions", "Stemmed")
    stemmed_speeches.to_pickle("all_speech_filtered_stemmed.pkl")

    # Check the new dataset: reload it from disk, then print its size and the
    # original and stemmed text of the first row.
    reloaded = pd.read_pickle("all_speech_filtered_stemmed.pkl")
    print(len(reloaded))
    first_row = reloaded.iloc[0]
    print(first_row.Questions)
    print(first_row.Stemmed)