pickle_dumps2.py
import os
import pickle

import nltk
import pandas as pd
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')  # alternative word-only tokenizer (currently unused)

dir_path = "C:\\Users\\Krishna\\Desktop\\Projects\\BechdelPlusPlus\\New_Bechdel"
movie_character_df = pd.read_csv(os.path.join(dir_path, 'mcm_csv.csv'))
movie_conversations_df = pd.read_csv(os.path.join(dir_path, 'mc_csv.csv'))
movie_lines_df = pd.read_csv(os.path.join(dir_path, 'ml_csv.csv'))

# Translation table that maps every non-ASCII ordinal (128-255) to '?'.
trans_table = ''.join([chr(i) for i in range(128)] + ['?'] * 128)

# Tokenize each line of dialogue, keyed by its line id.
list_of_utterances = {}
for i in range(3500):  # use len(movie_lines_df) to process the full dataset
    try:
        list_of_utterances[movie_lines_df.Line_Id[i]] = nltk.word_tokenize(str(movie_lines_df.Dialogue[i]))
    except UnicodeError:
        # Fall back to replacing non-ASCII characters with '?' before tokenizing.
        list_of_utterances[movie_lines_df.Line_Id[i]] = nltk.word_tokenize(
            str(movie_lines_df.Dialogue[i]).translate(trans_table))

# POS-tag all tokenized utterances in one batch.
pos_tags = nltk.pos_tag_sents(list_of_utterances.values())

# Persist the tags and the tokenized utterances for later reuse.
with open('pos_tags_utterances.pickle', 'wb') as pickle_out:
    pickle.dump([pos_tags, list_of_utterances], pickle_out)
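
# Example of reading the dump back (a minimal sketch; it assumes the script above
# has been run and 'pos_tags_utterances.pickle' exists in the working directory):
#
#     with open('pos_tags_utterances.pickle', 'rb') as pickle_in:
#         pos_tags, list_of_utterances = pickle.load(pickle_in)
#     # each element of pos_tags is a list of (token, POS-tag) tuples for one utterance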