comprehend_helper.py

import os
import time
import boto3
import pandas as pd
from collections import Counter

start_time = time.time()
comprehend = boto3.client("comprehend", region_name='us-east-1')
accepted_entities = ['EVENT', 'LOCATION', 'ORGANIZATION', 'PERSON', 'COMMERCIAL_ITEM']
# skipping DATE, OTHER, QUANTITY as they don't make sense in this context

def process_csv_file(file_path, max_rows=50):
    if os.path.splitext(file_path)[1] != '.csv':
        print("Wrong input format, only CSV files supported. \nExiting.")
        return
    print("Reading input file...")
    input_df = pd.read_csv(file_path, nrows=max_rows)
    if input_df.shape[1] > 1:
        print("Input format is wrong. Please input a file with single column containing tweets/text.\nExiting.")
        return
    input_df.columns = ["Text"]
    text_list = list(input_df.Text.values)
    
    sentiments = []
    entities = []
    
    for i in range(0, len(text_list), 20):  #batch accepts ony 25 docs at a time
        start = i
        end = min(i+19, len(text_list))
        response = comprehend.batch_detect_sentiment(
            LanguageCode="en",
            TextList=text_list[start: end]
            )
        sentiments.extend([result['Sentiment'] for result in response['ResultList']])
        
        response_ent = comprehend.batch_detect_entities(
            LanguageCode="en",
            TextList=text_list[start: end]
            )
        for ent in response_ent['ResultList']:
            if ent['Entities']:  # entities detected
                for e in ent['Entities']:
                    if e['Type'] in accepted_entities:
                        if (len(e['Text']) > 2) and not (str(e['Text']).replace('@', '').isdecimal()):
                            entities.append(
                                [e['Text'], e['Type']]
                                )

    sentiment_counts = Counter(sentiments)
    # entities has both type and text if you like!
    top_ents = Counter([ent[0] for ent in entities]).most_common(15)
    print("Time taken: ", time.time() - start_time)
    # print(results)
    data_output = {
        "entity": 
        {
            "label": [x for (x,y) in top_ents],
            "count": [y for (x,y) in top_ents]
        },
        "sentiment":
        {
            "label": list(sentiment_counts.keys()),
            "count": list(sentiment_counts.values())
        }
    }
    # print(data_output)
    return data_output
    
if __name__ == "__main__":
    process_csv_file("sampleData/sample_tweets.csv")