Merge pull request #764 from aws-solutions/release/v5.1.8
Release v5.1.8
amzn-gaod authored Aug 26, 2024
2 parents 83a7393 + 42290f1 commit da597a9
Showing 20 changed files with 243 additions and 31 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [5.1.8] - 2024-08-26

### Changed

- Upgrading NLTK version

## [5.1.7] - 2024-06-11

### Changed
11 changes: 11 additions & 0 deletions SECURITY.md
@@ -0,0 +1,11 @@
# Reporting Security Issues

We take all security reports seriously.
When we receive such reports,
we will investigate and subsequently address
any potential vulnerabilities as quickly as possible.
If you discover a potential security issue in this project,
please notify AWS/Amazon Security via our
[vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/)
or directly via email to [AWS Security](mailto:[email protected]).
Please do *not* create a public GitHub issue in this project.
13 changes: 12 additions & 1 deletion deployment/build-s3-dist.sh
@@ -25,6 +25,8 @@
#
###############################################################################

source "$(dirname "${BASH_SOURCE[0]}")/nltk_download_functions.sh"

usage() {
msg "$1"
cat <<EOF
@@ -63,6 +65,8 @@ cleanup() {
echo "------------------------------------------------------------------------------"
fi
[ -n "$VENV" ] && [ -d "$VENV" ] && rm -rf "$VENV"

cleanup_punkt "$source_dir"
}

msg() {
@@ -228,7 +232,7 @@ pip3 install wheel
# See the following issues for more details:
# - https://github.com/aws/aws-cdk/issues/26300
# - https://github.com/python-jsonschema/jsonschema/issues/1117
pip3 install --quiet boto3 chalice==1.31.0 docopt pyyaml jsonschema==4.17.3 aws_xray_sdk
pip3 install --quiet boto3 chalice==1.31.2 docopt pyyaml jsonschema==4.17.3 aws_xray_sdk
export PYTHONPATH="$PYTHONPATH:$source_dir/lib/MediaInsightsEngineLambdaHelper/"
if [ $? -ne 0 ]; then
echo "ERROR: Failed to install required Python libraries."
@@ -257,6 +261,7 @@ cd "$source_dir"/lib/MediaInsightsEngineLambdaHelper || exit 1
rm -rf build
rm -rf dist
rm -rf Media_Insights_Engine_Lambda_Helper.egg-info
python3 -m pip install -r requirements.txt
python3 setup.py bdist_wheel > /dev/null
echo -n "Created: "
find "$source_dir"/lib/MediaInsightsEngineLambdaHelper/dist/
@@ -297,6 +302,7 @@ else
cp -R "$source_dir"/lib/MediaInsightsEngineLambdaHelper .
cd MediaInsightsEngineLambdaHelper/ || exit 1
echo "Building Media Insights on AWS Lambda Helper python library"
python3 -m pip install -r requirements.txt
python3 setup.py bdist_wheel > /dev/null
cp dist/*.whl ../
cp dist/*.whl "$source_dir"/lib/MediaInsightsEngineLambdaHelper/dist/
@@ -452,6 +458,7 @@ rm -rf ./dist ./package
# ------------------------------------------------------------------------------"

echo "Building Translate function"
download_punkt "$source_dir"
cd "$source_dir/operators/translate" || exit 1
[ -e dist ] && rm -rf dist
mkdir -p dist
@@ -466,12 +473,16 @@ touch ./setup.cfg
echo "[install]" > ./setup.cfg
echo "prefix= " >> ./setup.cfg
pip3 install --quiet -r ../requirements.txt --target .
# copy downloaded nltk_data pickles to the package
cp -r ../nltk_data nltk_data

if ! [ -d ../dist/start_translate.zip ]; then
zip -q -r9 ../dist/start_translate.zip .

elif [ -d ../dist/start_translate.zip ]; then
echo "Package already present"
fi

popd || exit 1
zip -q -g ./dist/start_translate.zip ./start_translate.py
cp "./dist/start_translate.zip" "$regional_dist_dir/start_translate.zip"
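With the added cp step, the Translate package now bundles the pre-downloaded punkt pickles alongside the handler code. A quick sanity check after the build (a sketch; the dist path comes from the zip commands above):

    unzip -l source/operators/translate/dist/start_translate.zip | grep "nltk_data/tokenizers/punkt"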
2 changes: 1 addition & 1 deletion deployment/lambda_layer_factory/Dockerfile
@@ -1,4 +1,4 @@
FROM amazonlinux
FROM public.ecr.aws/amazonlinux/amazonlinux:2023.5.20240819.0

WORKDIR /
RUN yum update -y
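Switching from the floating amazonlinux tag to a dated ECR Public tag makes the layer build reproducible. To pull the same build environment locally (a sketch; assumes Docker is installed and can reach ECR Public):

    docker pull public.ecr.aws/amazonlinux/amazonlinux:2023.5.20240819.0
    docker run --rm public.ecr.aws/amazonlinux/amazonlinux:2023.5.20240819.0 cat /etc/os-release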
114 changes: 114 additions & 0 deletions deployment/nltk_download_functions.sh
@@ -0,0 +1,114 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

###############################################################################
# PURPOSE: This script provides utility functions to download and verify the
# hash of NLTK pickles.
#
# PRELIMINARY:
# Sourced from within another script.
#
# USAGE:
# source "${path}/nltk_download_functions.sh"
#
# Where ${path} is the directory containing this script. If the calling
# script is in the same directory, use "$(dirname "${BASH_SOURCE[0]}")".
# Example: source "$(dirname "${BASH_SOURCE[0]}")/nltk_download_functions.sh"
#
###############################################################################

verify_hash() {
local file_path="$1"
local expected_hash="$2"
local actual_hash=$(sha256sum "$file_path" | awk '{print $1}')

if [ "$actual_hash" != "$expected_hash" ]; then
echo "Hash mismatch found for $file_path"
echo "Expected: $expected_hash"
echo "Found: $actual_hash"
return 1
else
return 0
fi
}

function download_punkt() {
cleanup_punkt "${1}"
echo "Starting download of punkt zip file..."

local -r source_dir="${1}"
local -r zip_url="https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip"
local -r zip_file="punkt.zip"
# This hash was verified to be working; it may need to be updated if nltk ever updates the punkt pickles
local -r expected_hash="51c3078994aeaf650bfc8e028be4fb42b4a0d177d41c012b6a983979653660ec"
local temp_dir

# Download the punkt zip file
if ! curl -sSL "$zip_url" -o "$zip_file"; then
echo "Error: Failed to download the zip file."
return 1
fi

echo "Verifying hash against the last known valid hash..."

# Verify the hash
if ! verify_hash "$zip_file" "$expected_hash"; then
echo "Hash verification failed. Please check if the new punkt file is valid. Exiting."
rm -f "$zip_file"
return 1
fi

# Create a temporary directory for extraction
temp_dir=$(mktemp -d)
if [[ ! -d "$temp_dir" ]]; then
echo "Error: Failed to create a temporary directory."
rm -f "$zip_file"
return 1
fi

# Extract the zip file into the temporary directory
if ! unzip -q "$zip_file" -d "$temp_dir"; then
echo "Error: Failed to unzip the file."
rm -rf "$temp_dir"
rm -f "$zip_file"
return 1
fi

# Create the destination directory and copy files
# interacting directly with the app under /operators/translate for now until more apps
# within the solution need punkt pickles
local -r dest_dir="$source_dir/operators/translate/nltk_data/tokenizers/punkt"
mkdir -p "$dest_dir"
cp -r "$temp_dir/punkt/PY3/"* "$dest_dir/"

# Clean up temporary files
rm -rf "$temp_dir"
rm -f "$zip_file"

echo "Punkt zip file downloaded and extracted successfully."
}

function cleanup_punkt() {
local -r source_dir="${1}"
local -r target_dir="$source_dir/operators/translate/nltk_data"

echo "Starting cleanup process for punkt data..."

# Check if the target directory exists
if [[ -d "$target_dir" ]]; then
# Attempt to remove the directory
if rm -rf "$target_dir"; then
echo "Successfully removed the nltk_data directory at $target_dir."
else
echo "Error: Failed to remove the nltk_data directory at $target_dir."
return 1
fi
else
echo "The directory $target_dir does not exist. No cleanup necessary."
fi

echo "Cleanup process completed."
return 0
}
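Callers are expected to follow the pattern used by build-s3-dist.sh above and run-unit-tests.sh below: source the helpers, download before packaging or testing, and clean up afterwards. A minimal sketch, assuming it runs as a script in the deployment/ directory so ../source is the solution source tree:

    source "$(dirname "${BASH_SOURCE[0]}")/nltk_download_functions.sh"
    download_punkt ../source || exit 1
    # ... package or test operators/translate, which now expects its bundled nltk_data/ ...
    cleanup_punkt ../source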
21 changes: 19 additions & 2 deletions deployment/run-unit-tests.sh
@@ -4,6 +4,8 @@
# cd deployment
# ./run-unit-tests.sh

source "$(dirname "${BASH_SOURCE[0]}")/nltk_download_functions.sh"

# Run unit tests
echo "Running unit tests"

@@ -12,9 +14,24 @@ echo "Installing Dependencies And Testing CDK"
echo "------------------------------------------------------------------------------"
chmod +x ../source/cdk/run-tests.sh && ../source/cdk/run-tests.sh || exit $?

echo "cd ../test/unit"
cd ../test/unit

build_dir="$(dirname "${BASH_SOURCE[0]}")"
source_dir="$build_dir/../source"

download_punkt "$source_dir"

echo "pushd ../test/unit"
pushd ../test/unit
echo "------------------------------------------------------------------------------"
echo "Installing Dependencies And Testing Modules"
echo "------------------------------------------------------------------------------"
./run_unit.sh

if [ $? -ne 0 ]; then
echo "ERROR: Unit test script failed"
exit 1
fi

echo "popd"
popd
cleanup_punkt "$source_dir"
2 changes: 1 addition & 1 deletion solution-manifest.yaml
@@ -1,6 +1,6 @@
id: SO0163
name: media-insights-on-aws
version: 5.1.7
version: 5.1.8
cloudformation_templates:
- template: media-insights-on-aws-stack.template
main_template: true
4 changes: 2 additions & 2 deletions source/cdk/package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion source/cdk/package.json
@@ -1,6 +1,6 @@
{
"name": "media-insights-on-aws",
"version": "5.1.7",
"version": "5.1.8",
"description": "Synthesize templates for Media Insights on AWS using AWS Cloud Development Kit (CDK).",
"license": "Apache-2.0",
"private": true,
@@ -2077,7 +2077,7 @@ exports[`Snapshot media-insights root stack test 1`] = `
"CodeKeyPrefix",
],
},
"media_insights_on_aws_lambda_layer_python3.10-v5.1.7.zip",
"media_insights_on_aws_lambda_layer_python3.10-v5.1.8.zip",
],
],
},
@@ -2124,7 +2124,7 @@ exports[`Snapshot media-insights root stack test 1`] = `
"CodeKeyPrefix",
],
},
"media_insights_on_aws_lambda_layer_python3.11-v5.1.7.zip",
"media_insights_on_aws_lambda_layer_python3.11-v5.1.8.zip",
],
],
},
@@ -2171,7 +2171,7 @@ exports[`Snapshot media-insights root stack test 1`] = `
"CodeKeyPrefix",
],
},
"media_insights_on_aws_lambda_layer_python3.9-v5.1.7.zip",
"media_insights_on_aws_lambda_layer_python3.9-v5.1.8.zip",
],
],
},
2 changes: 2 additions & 0 deletions source/lib/MediaInsightsEngineLambdaHelper/requirements.txt
@@ -0,0 +1,2 @@
setuptools>=72.1.0
urllib3==1.26.19
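The helper library previously had no requirements file; both entries are new, and the pins appear security-motivated (recent setuptools and urllib3 releases fixed published advisories). A quick local check that the pins resolve without conflicts (a sketch):

    python3 -m pip install -r requirements.txt
    python3 -m pip check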
2 changes: 1 addition & 1 deletion source/operators/translate/requirements.txt
@@ -1 +1 @@
nltk>=3.8.1
nltk==3.9.1
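The open-ended floor becomes an exact pin. The nltk 3.9 line stopped trusting remotely downloaded pickled tokenizers, which is the likely motivation here and is consistent with the build now shipping hash-verified pickles instead of calling nltk.download at runtime. To confirm what a build environment resolved (a sketch):

    python3 -c "import nltk; print(nltk.__version__)"   # expect 3.9.1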
45 changes: 38 additions & 7 deletions source/operators/translate/start_translate.py
@@ -9,6 +9,8 @@
from aws_xray_sdk.core import patch_all
import tempfile
import nltk.data
import pickle
from nltk.tokenize.punkt import PunktSentenceTokenizer

from MediaInsightsEngineLambdaHelper import DataPlane
from MediaInsightsEngineLambdaHelper import MediaInsightsOperationHelper
@@ -21,6 +23,40 @@
translate_client = boto3.client('translate', config=config)
s3 = boto3.client('s3', config=config)

def _load_tokenizer(lang: str) -> PunktSentenceTokenizer:
"""
Load a PunktSentenceTokenizer for a given language from pre-downloaded pickles.
Pickles found at: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip
These pickles are downloaded and packaged during build. See deployment/nltk_download_functions.sh
Args:
lang (str): The language for which to load the tokenizer.
Returns:
PunktSentenceTokenizer: The tokenizer for the specified language.
"""
try:
# Get the directory of the current file
current_directory = os.path.dirname(os.path.abspath(__file__))

# Construct the path to the pickle file
pickle_path = os.path.join(current_directory, 'nltk_data', 'tokenizers', 'punkt', f'{lang.lower()}.pickle')

# Open the file and unpickle the tokenizer
with open(pickle_path, 'rb') as f:
tokenizer = pickle.load(f)

return tokenizer

except FileNotFoundError as e:
print("Error: Tokenizer file for '%s' not found." % lang)
raise e
except pickle.UnpicklingError as e:
print("Error: Failed to unpickle the tokenizer for '%s'." % lang)
raise e
except Exception as e:
print("An error occurred while loading the tokenizer: %s" % e)
raise e

def lambda_handler(event, _context):
print("We got the following event:\n", event)
@@ -73,12 +109,6 @@ def lambda_handler(event, _context):
operator_object.update_workflow_status("Complete")
return operator_object.return_output_object()

# Tell the NLTK data loader to look for files in the tmp directory
tmp_dir = tempfile.gettempdir()
nltk.data.path.append(tmp_dir)
# Download NLTK tokenizers to the tmp directory
# We use tmp because that's where AWS Lambda provides write access to the local file system.
nltk.download('punkt', download_dir=tmp_dir)
# Create language tokenizer according to user-specified source language.
# Default to English.
lang_options = {
@@ -91,7 +121,8 @@
}
lang = lang_options.get(source_lang, 'English')
print("Using {} dictionary to find sentence boundaries.".format(lang))
tokenizer = nltk.data.load('tokenizers/punkt/{}.pickle'.format(lang.lower()))

tokenizer = _load_tokenizer(lang)

# Split input text into a list of sentences
sentences = tokenizer.tokenize(transcript)
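Net effect of the start_translate.py changes: the handler no longer downloads punkt into /tmp on every invocation; it unpickles a bundled, hash-verified tokenizer from its package directory. The core of _load_tokenizer can be exercised standalone once download_punkt has populated nltk_data/ (a sketch, run from source/operators/translate with nltk installed; the sample text is illustrative):

    python3 -c "import pickle; tok = pickle.load(open('nltk_data/tokenizers/punkt/english.pickle', 'rb')); print(tok.tokenize('First sentence. Second sentence.'))"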
2 changes: 1 addition & 1 deletion test/e2e/requirements.txt
@@ -2,6 +2,6 @@ boto3==1.28.5
botocore==1.31.7
pytest==7.2.0
requests==2.32.2
urllib3==1.26.18
urllib3==1.26.19
jsonschema==4.17.0
requests_aws4auth==1.2.0