From 8690c44ee067257634df0e30c9d516f0e3cfbf07 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:42:47 -0400 Subject: [PATCH] Make use of AUTOMATION env var consistent across demo2 notebooks (#167) * These changes make use of AUTOMATION env var consistent across notebooks. * Restore previous contents so we can fix CL1->CL2 references and Demo3 refactor as separate PRs. * Run notebooks preserving output cells (using samples_1 instead of samples_145 S3 pipeline_run folder). --- README.md | 2 +- notebooks/demo2/config.py | 2 +- notebooks/demo2/create_results_table.ipynb | 157 ++-- notebooks/demo2/infer_kpi.ipynb | 753 +++++++++++++++-- notebooks/demo2/infer_relevance.ipynb | 897 ++++++++++----------- notebooks/demo2/pdf_text_extraction.ipynb | 235 +++++- 6 files changed, 1413 insertions(+), 633 deletions(-) diff --git a/README.md b/README.md index b47b26b..da2c04c 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The following demos provide examples of how to use the tools available with [Ope * [Ingest raw data from S3 as tables on Trino](notebooks/demo1/demo1-create-tables.ipynb) * [Run SQL queries from a Jupyter Notebook environment](notebooks/demo1/demo1-join-tables.ipynb) * [Demo 1 Elyra Pipeline](https://github.com/os-climate/aicoe-osc-demo/blob/master/notebooks/demo1/demo1.pipeline) -* [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3/) +* [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3) * [Video on creating Elyra Pipelines and Superset Dashboard](https://youtu.be/TFgsR7UlcHA) diff --git a/notebooks/demo2/config.py b/notebooks/demo2/config.py index 95938c3..f1e9c06 100644 --- a/notebooks/demo2/config.py +++ b/notebooks/demo2/config.py @@ -21,7 +21,7 @@ BASE_INFER_KPI_FOLDER = DATA_FOLDER / "infer_KPI" CHECKPOINT_S3_PREFIX = "aicoe-osc-demo/saved_models" -DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_145" +DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_1" BASE_PDF_S3_PREFIX = f"{DATA_S3_PREFIX}/pdfs" BASE_ANNOTATION_S3_PREFIX = f"{DATA_S3_PREFIX}/annotations" BASE_EXTRACTION_S3_PREFIX = f"{DATA_S3_PREFIX}/extraction" diff --git a/notebooks/demo2/create_results_table.ipynb b/notebooks/demo2/create_results_table.ipynb index 6b8c270..56cd8c0 100644 --- a/notebooks/demo2/create_results_table.ipynb +++ b/notebooks/demo2/create_results_table.ipynb @@ -65,7 +65,9 @@ "outputs": [], "source": [ "# Load credentials\n", - "dotenv_dir = \"/opt/app-root/src/aicoe-osc-demo\"\n", + "dotenv_dir = os.environ.get(\n", + " \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n", + ")\n", "dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n", "if os.path.exists(dotenv_path):\n", " load_dotenv(dotenv_path=dotenv_path, override=True)" @@ -106,7 +108,13 @@ "source": [ "if os.getenv(\"AUTOMATION\"):\n", " if not os.path.exists(config.BASE_INFER_KPI_FOLDER):\n", - " pathlib.Path(config.BASE_INFER_KPI_FOLDER).mkdir(parents=True, exist_ok=True)" + " pathlib.Path(config.BASE_INFER_KPI_FOLDER).mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Download a sample dataset file from s3\n", + " s3c.download_files_in_prefix_to_dir(\n", + " s3_prefix=config.BASE_INFER_KPI_S3_PREFIX,\n", + " destination_dir=config.BASE_INFER_KPI_FOLDER\n", + " )" ] }, { @@ -151,66 +159,66 @@ " \n", " \n", " 0\n", - " 413749035_Eversource Energy_2019-12-31\n", + " sustainability-report-2019\n", " In which year was the annual report or the sus...\n", " <NA>\n", " 2019\n", - " 7\n", - " • Our core utility operations performed very w...\n", + " 3\n", + " This report focuses on the sustainability topi...\n", " Text\n", - " 13.372849\n", - " -10.76948\n", - " -25.76948\n", + " 12.819071\n", + " -11.384018\n", + " -26.384018\n", " \n", " \n", " 1\n", - " 413749035_Eversource Energy_2019-12-31\n", + " sustainability-report-2019\n", " In which year was the annual report or the sus...\n", " <NA>\n", - " 2019\n", - " 34\n", - " The American Council for an Energy-Efficient E...\n", + " 2018\n", + " 7\n", + " According to IPCC’s 1.5 C report from 2018 and...\n", " Text\n", - " 12.66205\n", - " -9.417558\n", - " -24.417558\n", + " 12.50875\n", + " -6.967497\n", + " -21.967497\n", " \n", " \n", " 2\n", - " 413749035_Eversource Energy_2019-12-31\n", + " sustainability-report-2019\n", " In which year was the annual report or the sus...\n", " <NA>\n", " 2019\n", - " 12\n", - " The Eversource Internal Audit Department perfo...\n", + " 26\n", + " Equinor Sustainability report 2019 High value ...\n", " Text\n", - " 12.373636\n", - " -10.899869\n", - " -25.899869\n", + " 12.427496\n", + " -9.680325\n", + " -24.680325\n", " \n", " \n", " 3\n", - " 413749035_Eversource Energy_2019-12-31\n", + " sustainability-report-2019\n", " In which year was the annual report or the sus...\n", " <NA>\n", " 2019\n", - " 118\n", - " These are referenced throughout our 2019 Susta...\n", + " 8\n", + " Equinor Sustainability report 2019Low carbon —...\n", " Text\n", - " 12.245757\n", - " -10.556628\n", - " -25.556628\n", + " 12.356202\n", + " -8.748007\n", + " -23.748007\n", " \n", " \n", " 4\n", - " 413749035_Eversource Energy_2019-12-31\n", + " sustainability-report-2019\n", " What is the annual total production from coal?\n", " <NA>\n", " no_answer\n", " <NA>\n", " <NA>\n", " Text\n", - " 2.720188\n", + " 2.840454\n", " <NA>\n", " <NA>\n", " \n", @@ -219,32 +227,32 @@ "" ], "text/plain": [ - " pdf_name \\\n", - "0 413749035_Eversource Energy_2019-12-31 \n", - "1 413749035_Eversource Energy_2019-12-31 \n", - "2 413749035_Eversource Energy_2019-12-31 \n", - "3 413749035_Eversource Energy_2019-12-31 \n", - "4 413749035_Eversource Energy_2019-12-31 \n", + " pdf_name \\\n", + "0 sustainability-report-2019 \n", + "1 sustainability-report-2019 \n", + "2 sustainability-report-2019 \n", + "3 sustainability-report-2019 \n", + "4 sustainability-report-2019 \n", "\n", " kpi kpi_id answer page \\\n", - "0 In which year was the annual report or the sus... 2019 7 \n", - "1 In which year was the annual report or the sus... 2019 34 \n", - "2 In which year was the annual report or the sus... 2019 12 \n", - "3 In which year was the annual report or the sus... 2019 118 \n", + "0 In which year was the annual report or the sus... 2019 3 \n", + "1 In which year was the annual report or the sus... 2018 7 \n", + "2 In which year was the annual report or the sus... 2019 26 \n", + "3 In which year was the annual report or the sus... 2019 8 \n", "4 What is the annual total production from coal? no_answer \n", "\n", " paragraph source score \\\n", - "0 • Our core utility operations performed very w... Text 13.372849 \n", - "1 The American Council for an Energy-Efficient E... Text 12.66205 \n", - "2 The Eversource Internal Audit Department perfo... Text 12.373636 \n", - "3 These are referenced throughout our 2019 Susta... Text 12.245757 \n", - "4 Text 2.720188 \n", + "0 This report focuses on the sustainability topi... Text 12.819071 \n", + "1 According to IPCC’s 1.5 C report from 2018 and... Text 12.50875 \n", + "2 Equinor Sustainability report 2019 High value ... Text 12.427496 \n", + "3 Equinor Sustainability report 2019Low carbon —... Text 12.356202 \n", + "4 Text 2.840454 \n", "\n", " no_ans_score no_answer_score_plus_boost \n", - "0 -10.76948 -25.76948 \n", - "1 -9.417558 -24.417558 \n", - "2 -10.899869 -25.899869 \n", - "3 -10.556628 -25.556628 \n", + "0 -11.384018 -26.384018 \n", + "1 -6.967497 -21.967497 \n", + "2 -9.680325 -24.680325 \n", + "3 -8.748007 -23.748007 \n", "4 " ] }, @@ -254,12 +262,6 @@ } ], "source": [ - "# Download a sample dataset file from s3\n", - "s3c.download_files_in_prefix_to_dir(\n", - " s3_prefix=config.BASE_INFER_KPI_S3_PREFIX,\n", - " destination_dir=config.BASE_INFER_KPI_FOLDER\n", - ")\n", - "\n", "all_files = glob.glob(str(config.BASE_INFER_KPI_FOLDER / \"*.csv\"))\n", "list_of_files = []\n", "\n", @@ -315,22 +317,22 @@ "output_type": "stream", "text": [ "\n", - "RangeIndex: 689 entries, 0 to 688\n", + "RangeIndex: 96 entries, 0 to 95\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", - " 0 pdf_name 689 non-null string \n", - " 1 kpi 689 non-null string \n", + " 0 pdf_name 96 non-null string \n", + " 1 kpi 96 non-null string \n", " 2 kpi_id 0 non-null Int64 \n", - " 3 answer 689 non-null string \n", - " 4 page 555 non-null Int64 \n", - " 5 paragraph 555 non-null string \n", - " 6 source 689 non-null string \n", - " 7 score 689 non-null Float64\n", - " 8 no_ans_score 555 non-null Float64\n", - " 9 no_answer_score_plus_boost 555 non-null Float64\n", + " 3 answer 96 non-null string \n", + " 4 page 79 non-null Int64 \n", + " 5 paragraph 79 non-null string \n", + " 6 source 96 non-null string \n", + " 7 score 96 non-null Float64\n", + " 8 no_ans_score 79 non-null Float64\n", + " 9 no_answer_score_plus_boost 79 non-null Float64\n", "dtypes: Float64(3), Int64(2), string(5)\n", - "memory usage: 57.3 KB\n" + "memory usage: 8.1 KB\n" ] } ], @@ -359,15 +361,6 @@ "name": "stdout", "output_type": "stream", "text": [ - "200\n", - "200\n", - "200\n", - "200\n", - "200\n", - "200\n", - "200\n", - "200\n", - "200\n", "200\n" ] } @@ -464,16 +457,16 @@ { "data": { "text/plain": [ - "['sustainability-report-2019',\n", + "['90044053_Fisher & Paykel Hl_2017-11-07',\n", " 'In which year was the annual report or the sustainability report published?',\n", " None,\n", - " '2019',\n", - " 26,\n", - " 'Equinor Sustainability report 2019 High value — creating shared value',\n", + " '2017',\n", + " 2,\n", + " 'Corporate Responsibility and Sustainability Report 2017Fisher & Paykel Healthcare Corporation Limited',\n", " 'Text',\n", - " 12.427505493164062,\n", - " -9.680328369140623,\n", - " -24.680328369140625]" + " 11.549626350402832,\n", + " -8.787019729614258,\n", + " -23.787019729614254]" ] }, "execution_count": 11, @@ -514,7 +507,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/notebooks/demo2/infer_kpi.ipynb b/notebooks/demo2/infer_kpi.ipynb index a684866..e2928f7 100644 --- a/notebooks/demo2/infer_kpi.ipynb +++ b/notebooks/demo2/infer_kpi.ipynb @@ -3,7 +3,16 @@ { "cell_type": "markdown", "id": "0e85812e-ba28-45e1-9e01-cde16ef0b44b", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.004919, + "end_time": "2022-07-02T00:30:29.987143", + "exception": false, + "start_time": "2022-07-02T00:30:29.982224", + "status": "completed" + }, + "tags": [] + }, "source": [ "# KPI Inference\n", "This notebook takes in the relevant paragraphs to KPIs found in the relevance infer stage, the fine tuned KPI EXTRACTION model from the training stage, and performs inference to return specific answers to the KPIs." @@ -13,13 +22,28 @@ "cell_type": "code", "execution_count": 1, "id": "f79c4be0-b1f8-4902-8950-05841cdc41d6", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:29.997006Z", + "iopub.status.busy": "2022-07-02T00:30:29.996416Z", + "iopub.status.idle": "2022-07-02T00:30:33.477927Z", + "shell.execute_reply": "2022-07-02T00:30:33.477230Z" + }, + "papermill": { + "duration": 3.488551, + "end_time": "2022-07-02T00:30:33.479940", + "exception": false, + "start_time": "2022-07-02T00:30:29.991389", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "04/08/2022 18:55:16 - INFO - farm.modeling.prediction_head - Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n" + "07/02/2022 00:30:33 - INFO - farm.modeling.prediction_head - Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n" ] } ], @@ -39,7 +63,22 @@ "cell_type": "code", "execution_count": 2, "id": "1dfe6b89-357c-4c83-848f-d2f43faaf805", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:33.491033Z", + "iopub.status.busy": "2022-07-02T00:30:33.490325Z", + "iopub.status.idle": "2022-07-02T00:30:33.501491Z", + "shell.execute_reply": "2022-07-02T00:30:33.500816Z" + }, + "papermill": { + "duration": 0.0182, + "end_time": "2022-07-02T00:30:33.503253", + "exception": false, + "start_time": "2022-07-02T00:30:33.485053", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# Load credentials\n", @@ -55,14 +94,29 @@ "cell_type": "code", "execution_count": 3, "id": "f0982f03-be7f-4abc-ba30-ff1e842cf147", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:33.512469Z", + "iopub.status.busy": "2022-07-02T00:30:33.511997Z", + "iopub.status.idle": "2022-07-02T00:30:33.603193Z", + "shell.execute_reply": "2022-07-02T00:30:33.602437Z" + }, + "papermill": { + "duration": 0.098028, + "end_time": "2022-07-02T00:30:33.605260", + "exception": false, + "start_time": "2022-07-02T00:30:33.507232", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# init s3 connector\n", "s3c = S3Communication(\n", " s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n", - " aws_access_key_id=os.getenv(\"AWS_ACCESS_KEY_ID\"),\n", - " aws_secret_access_key=os.getenv(\"AWS_SECRET_ACCESS_KEY\"),\n", + " aws_access_key_id=os.getenv(\"S3_LANDING_ACCESS_KEY\"),\n", + " aws_secret_access_key=os.getenv(\"S3_LANDING_SECRET_KEY\"),\n", " s3_bucket=os.getenv(\"S3_BUCKET\"),\n", ")" ] @@ -71,7 +125,22 @@ "cell_type": "code", "execution_count": 4, "id": "2e5136cb-7e32-4ca2-a533-db0b990f288e", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:33.615567Z", + "iopub.status.busy": "2022-07-02T00:30:33.615059Z", + "iopub.status.idle": "2022-07-02T00:30:33.619075Z", + "shell.execute_reply": "2022-07-02T00:30:33.618421Z" + }, + "papermill": { + "duration": 0.011196, + "end_time": "2022-07-02T00:30:33.620983", + "exception": false, + "start_time": "2022-07-02T00:30:33.609787", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "#Settings data files and checkpoints parameters\n", @@ -83,7 +152,22 @@ "cell_type": "code", "execution_count": 5, "id": "e5607a79-d139-40e4-ae56-646cd54a1838", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:33.630728Z", + "iopub.status.busy": "2022-07-02T00:30:33.630228Z", + "iopub.status.idle": "2022-07-02T00:30:33.775792Z", + "shell.execute_reply": "2022-07-02T00:30:33.775113Z" + }, + "papermill": { + "duration": 0.152377, + "end_time": "2022-07-02T00:30:33.777607", + "exception": false, + "start_time": "2022-07-02T00:30:33.625230", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# When running in Automation using Elyra and Kubeflow Pipelines,\n", @@ -113,7 +197,22 @@ "cell_type": "code", "execution_count": 6, "id": "7c24a0d7-40c2-4da8-9e7f-66ce78386ec3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:33.787253Z", + "iopub.status.busy": "2022-07-02T00:30:33.786729Z", + "iopub.status.idle": "2022-07-02T00:31:18.002091Z", + "shell.execute_reply": "2022-07-02T00:31:18.001179Z" + }, + "papermill": { + "duration": 44.222679, + "end_time": "2022-07-02T00:31:18.004580", + "exception": false, + "start_time": "2022-07-02T00:30:33.781901", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "model_root = pathlib.Path(file_config.saved_models_dir).parent\n", @@ -126,7 +225,16 @@ { "cell_type": "markdown", "id": "bda4681d-2d84-414b-a701-b507acc99552", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.004738, + "end_time": "2022-07-02T00:31:18.014443", + "exception": false, + "start_time": "2022-07-02T00:31:18.009705", + "status": "completed" + }, + "tags": [] + }, "source": [ "## Inference" ] @@ -134,7 +242,16 @@ { "cell_type": "markdown", "id": "2af4fd06-c86d-42aa-b4f6-6992b60cf512", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.003976, + "end_time": "2022-07-02T00:31:18.023076", + "exception": false, + "start_time": "2022-07-02T00:31:18.019100", + "status": "completed" + }, + "tags": [] + }, "source": [ "We can use the saved model and test it on some real examples.

\n", "First let's load the model:" @@ -144,7 +261,22 @@ "cell_type": "code", "execution_count": 7, "id": "38a70174-65b3-4365-ba6f-d020457de4b7", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:31:18.032756Z", + "iopub.status.busy": "2022-07-02T00:31:18.032315Z", + "iopub.status.idle": "2022-07-02T00:31:18.039824Z", + "shell.execute_reply": "2022-07-02T00:31:18.039108Z" + }, + "papermill": { + "duration": 0.014463, + "end_time": "2022-07-02T00:31:18.041568", + "exception": false, + "start_time": "2022-07-02T00:31:18.027105", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "data": { @@ -165,14 +297,35 @@ "cell_type": "code", "execution_count": 8, "id": "b840d5cf-3d42-4e9e-be2d-d95634c4ac96", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:31:18.051110Z", + "iopub.status.busy": "2022-07-02T00:31:18.050661Z", + "iopub.status.idle": "2022-07-02T00:31:31.483495Z", + "shell.execute_reply": "2022-07-02T00:31:31.482128Z" + }, + "papermill": { + "duration": 13.440374, + "end_time": "2022-07-02T00:31:31.486055", + "exception": false, + "start_time": "2022-07-02T00:31:18.045681", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "04/08/2022 18:56:03 - WARNING - farm.modeling.prediction_head - Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {\"training\": false, \"num_labels\": 2, \"ph_output_type\": \"per_token_squad\", \"model_type\": \"span_classification\", \"label_tensor_name\": \"question_answering_label_ids\", \"label_list\": [\"start_token\", \"end_token\"], \"metric\": \"squad\", \"name\": \"QuestionAnsweringHead\"}\n", - "04/08/2022 18:56:07 - WARNING - farm.infer - QAInferencer always has task_type='question_answering' even if another value is provided to Inferencer.load() or QAInferencer()\n" + "07/02/2022 00:31:27 - WARNING - farm.modeling.prediction_head - Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {\"training\": false, \"num_labels\": 2, \"ph_output_type\": \"per_token_squad\", \"model_type\": \"span_classification\", \"label_tensor_name\": \"question_answering_label_ids\", \"label_list\": [\"start_token\", \"end_token\"], \"metric\": \"squad\", \"name\": \"QuestionAnsweringHead\"}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "07/02/2022 00:31:31 - WARNING - farm.infer - QAInferencer always has task_type='question_answering' even if another value is provided to Inferencer.load() or QAInferencer()\n" ] } ], @@ -183,7 +336,16 @@ { "cell_type": "markdown", "id": "c534da63-f0b2-4e5b-bef1-d4d0a217e433", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.004276, + "end_time": "2022-07-02T00:31:31.495925", + "exception": false, + "start_time": "2022-07-02T00:31:31.491649", + "status": "completed" + }, + "tags": [] + }, "source": [ "Now, let's make prediction on a pair of paragraph and question." ] @@ -192,7 +354,22 @@ "cell_type": "code", "execution_count": 9, "id": "d506a8b3-f013-4eef-98fc-d1cdd1525a14", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:31:31.508001Z", + "iopub.status.busy": "2022-07-02T00:31:31.507345Z", + "iopub.status.idle": "2022-07-02T00:31:31.513823Z", + "shell.execute_reply": "2022-07-02T00:31:31.512912Z" + }, + "papermill": { + "duration": 0.014743, + "end_time": "2022-07-02T00:31:31.515812", + "exception": false, + "start_time": "2022-07-02T00:31:31.501069", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "context = \"\"\"the paris agreement on climate change drafted in 2015 aims to reduce worldwide emissions of greenhouse\n", @@ -210,13 +387,44 @@ "cell_type": "code", "execution_count": 10, "id": "e92e6380-5440-499b-b4e8-6871dfc0ec13", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:31:31.525722Z", + "iopub.status.busy": "2022-07-02T00:31:31.525260Z", + "iopub.status.idle": "2022-07-02T00:31:32.053399Z", + "shell.execute_reply": "2022-07-02T00:31:32.052411Z" + }, + "papermill": { + "duration": 0.535398, + "end_time": "2022-07-02T00:31:32.055442", + "exception": false, + "start_time": "2022-07-02T00:31:31.520044", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.79 Batches/s]" + "\r", + "Inferencing Samples: 0%| | 0/1 [00:00\n", " \n", " \n", - " page\n", - " pdf_name\n", - " text\n", - " text_b\n", - " source\n", " \n", " \n", " \n", - " \n", - " 0\n", - " 1\n", - " sustainability-report-2019\n", - " What is the company name?\n", - " Equinor supports the Paris agreement and a net...\n", - " Text\n", - " \n", - " \n", - " 1\n", - " 1\n", - " sustainability-report-2019\n", - " What is the company name?\n", - " broad energy company is founded on a strong co...\n", - " Text\n", - " \n", - " \n", - " 2\n", - " 1\n", - " sustainability-report-2019\n", - " What is the company name?\n", - " Equinor and partners reached a final investmen...\n", - " Text\n", - " \n", - " \n", - " 3\n", - " 1\n", - " sustainability-report-2019\n", - " What is the company name?\n", - " awarded five major contracts. Equinor is posit...\n", - " Text\n", - " \n", - " \n", - " 4\n", - " 1\n", - " sustainability-report-2019\n", - " What is the company name?\n", - " Equinor is a values-based company. How we deli...\n", - " Text\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 554\n", - " 29\n", - " sustainability-report-2019\n", - " What is the total amount of scope 1, scope 2 a...\n", - " GHG emissions associated with the production a...\n", - " Text\n", - " \n", - " \n", - " 555\n", - " 29\n", - " sustainability-report-2019\n", - " What is the total amount of scope 1, scope 2 a...\n", - " Indirect GHG emissions from energy imported fr...\n", - " Text\n", - " \n", - " \n", - " 556\n", - " 29\n", - " sustainability-report-2019\n", - " What is the total amount of scope 1, scope 2 a...\n", - " Upstream carbon dioxide (CO₂ ) emission intensity\n", - " Text\n", - " \n", - " \n", - " 557\n", - " 29\n", - " sustainability-report-2019\n", - " What is the total amount of scope 1, scope 2 a...\n", - " Total scope one emissions of CO₂ (kg CO₂) from...\n", - " Text\n", - " \n", - " \n", - " 558\n", - " 30\n", - " sustainability-report-2019\n", - " What is the total amount of scope 1, scope 2 a...\n", - " b) Disclose Scope 1, Scope 2, and, if appropri...\n", - " Text\n", - " \n", " \n", "\n", - "

559 rows × 5 columns

\n", "" ], "text/plain": [ - " page pdf_name \\\n", - "0 1 sustainability-report-2019 \n", - "1 1 sustainability-report-2019 \n", - "2 1 sustainability-report-2019 \n", - "3 1 sustainability-report-2019 \n", - "4 1 sustainability-report-2019 \n", - ".. ... ... \n", - "554 29 sustainability-report-2019 \n", - "555 29 sustainability-report-2019 \n", - "556 29 sustainability-report-2019 \n", - "557 29 sustainability-report-2019 \n", - "558 30 sustainability-report-2019 \n", - "\n", - " text \\\n", - "0 What is the company name? \n", - "1 What is the company name? \n", - "2 What is the company name? \n", - "3 What is the company name? \n", - "4 What is the company name? \n", - ".. ... \n", - "554 What is the total amount of scope 1, scope 2 a... \n", - "555 What is the total amount of scope 1, scope 2 a... \n", - "556 What is the total amount of scope 1, scope 2 a... \n", - "557 What is the total amount of scope 1, scope 2 a... \n", - "558 What is the total amount of scope 1, scope 2 a... \n", - "\n", - " text_b source \n", - "0 Equinor supports the Paris agreement and a net... Text \n", - "1 broad energy company is founded on a strong co... Text \n", - "2 Equinor and partners reached a final investmen... Text \n", - "3 awarded five major contracts. Equinor is posit... Text \n", - "4 Equinor is a values-based company. How we deli... Text \n", - ".. ... ... \n", - "554 GHG emissions associated with the production a... Text \n", - "555 Indirect GHG emissions from energy imported fr... Text \n", - "556 Upstream carbon dioxide (CO₂ ) emission intensity Text \n", - "557 Total scope one emissions of CO₂ (kg CO₂) from... Text \n", - "558 b) Disclose Scope 1, Scope 2, and, if appropri... Text \n", - "\n", - "[559 rows x 5 columns]" + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -614,305 +735,147 @@ { "cell_type": "markdown", "id": "2d78421f-ace9-4c19-ab0f-65e31fabcd06", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.00472, + "end_time": "2022-07-02T00:30:27.269127", + "exception": false, + "start_time": "2022-07-02T00:30:27.264407", + "status": "completed" + }, + "tags": [] + }, "source": [ "The results are saved in a CSV. For each table, the extracted text, as well as the page number from the source pdf file are saved." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "4ef54093-09ae-4fb9-bb77-6d71693a01ea", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:27.280244Z", + "iopub.status.busy": "2022-07-02T00:30:27.279731Z", + "iopub.status.idle": "2022-07-02T00:30:27.296605Z", + "shell.execute_reply": "2022-07-02T00:30:27.295956Z" + }, + "papermill": { + "duration": 0.024431, + "end_time": "2022-07-02T00:30:27.298328", + "exception": false, + "start_time": "2022-07-02T00:30:27.273897", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Unnamed: 0pagepdf_nametexttext_bsource
001sustainability-report-2019What is the company name?Equinor supports the Paris agreement and a net...Text
111sustainability-report-2019What is the company name?broad energy company is founded on a strong co...Text
221sustainability-report-2019What is the company name?Equinor and partners reached a final investmen...Text
331sustainability-report-2019What is the company name?awarded five major contracts. Equinor is posit...Text
441sustainability-report-2019What is the company name?Equinor is a values-based company. How we deli...Text
551sustainability-report-2019What is the company name?2019 marked the start-up of Johan Sverdrup – t...Text
661sustainability-report-2019What is the company name?For almost 50 years, Equinor has dedicated its...Text
771sustainability-report-2019What is the company name?Equinor is partnering with SSE Renewables to d...Text
881sustainability-report-2019What is the company name?Equinor Sustainability report 2019IntroductionText
992sustainability-report-2019What is the company name?We are Equinor, an international energy compan...Text
10103sustainability-report-2019What is the company name?Equinor.com For further information about sust...Text
11114sustainability-report-2019What is the company name?Sustainability is embedded in Equinor's:Text
12124sustainability-report-2019What is the company name?Equinor’s purpose is to turn natural resources...Text
13135sustainability-report-2019What is the company name?To be an industry leader in safety and securityText
14148sustainability-report-2019What is the company name?us. Equinor has low carbon as one of the main ...Text
151510sustainability-report-2019What is the company name?In 2019, Equinor reviewed its climate ambition...Text
161610sustainability-report-2019What is the company name?Equinor’s Climate Roadmap sets out new short-,...Text
171710sustainability-report-2019What is the company name?Equinor aims to reduce the CO₂ intensity of it...Text
181810sustainability-report-2019What is the company name?Equinor Sustainability report 2019 Equinor Sus...Text
191911sustainability-report-2019What is the company name?Global offshore wind major The past few years ...Text
\n", - "
" - ], - "text/plain": [ - " Unnamed: 0 page pdf_name text \\\n", - "0 0 1 sustainability-report-2019 What is the company name? \n", - "1 1 1 sustainability-report-2019 What is the company name? \n", - "2 2 1 sustainability-report-2019 What is the company name? \n", - "3 3 1 sustainability-report-2019 What is the company name? \n", - "4 4 1 sustainability-report-2019 What is the company name? \n", - "5 5 1 sustainability-report-2019 What is the company name? \n", - "6 6 1 sustainability-report-2019 What is the company name? \n", - "7 7 1 sustainability-report-2019 What is the company name? \n", - "8 8 1 sustainability-report-2019 What is the company name? \n", - "9 9 2 sustainability-report-2019 What is the company name? \n", - "10 10 3 sustainability-report-2019 What is the company name? \n", - "11 11 4 sustainability-report-2019 What is the company name? \n", - "12 12 4 sustainability-report-2019 What is the company name? \n", - "13 13 5 sustainability-report-2019 What is the company name? \n", - "14 14 8 sustainability-report-2019 What is the company name? \n", - "15 15 10 sustainability-report-2019 What is the company name? \n", - "16 16 10 sustainability-report-2019 What is the company name? \n", - "17 17 10 sustainability-report-2019 What is the company name? \n", - "18 18 10 sustainability-report-2019 What is the company name? \n", - "19 19 11 sustainability-report-2019 What is the company name? \n", - "\n", - " text_b source \n", - "0 Equinor supports the Paris agreement and a net... Text \n", - "1 broad energy company is founded on a strong co... Text \n", - "2 Equinor and partners reached a final investmen... Text \n", - "3 awarded five major contracts. Equinor is posit... Text \n", - "4 Equinor is a values-based company. How we deli... Text \n", - "5 2019 marked the start-up of Johan Sverdrup – t... Text \n", - "6 For almost 50 years, Equinor has dedicated its... Text \n", - "7 Equinor is partnering with SSE Renewables to d... Text \n", - "8 Equinor Sustainability report 2019Introduction Text \n", - "9 We are Equinor, an international energy compan... Text \n", - "10 Equinor.com For further information about sust... Text \n", - "11 Sustainability is embedded in Equinor's: Text \n", - "12 Equinor’s purpose is to turn natural resources... Text \n", - "13 To be an industry leader in safety and security Text \n", - "14 us. Equinor has low carbon as one of the main ... Text \n", - "15 In 2019, Equinor reviewed its climate ambition... Text \n", - "16 Equinor’s Climate Roadmap sets out new short-,... Text \n", - "17 Equinor aims to reduce the CO₂ intensity of it... Text \n", - "18 Equinor Sustainability report 2019 Equinor Sus... Text \n", - "19 Global offshore wind major The past few years ... Text " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "for file /opt/app-root/src/aicoe-osc-demo/data/infer_relevance/sustainability-report-2019_predictions_relevant.csv:\n", + " Unnamed: 0 page pdf_name text \\\n", + "0 0 1 sustainability-report-2019 What is the company name? \n", + "1 1 1 sustainability-report-2019 What is the company name? \n", + "2 2 1 sustainability-report-2019 What is the company name? \n", + "3 3 1 sustainability-report-2019 What is the company name? \n", + "4 4 1 sustainability-report-2019 What is the company name? \n", + "5 5 1 sustainability-report-2019 What is the company name? \n", + "6 6 1 sustainability-report-2019 What is the company name? \n", + "7 7 1 sustainability-report-2019 What is the company name? \n", + "8 8 1 sustainability-report-2019 What is the company name? \n", + "9 9 2 sustainability-report-2019 What is the company name? \n", + "10 10 3 sustainability-report-2019 What is the company name? \n", + "11 11 4 sustainability-report-2019 What is the company name? \n", + "12 12 4 sustainability-report-2019 What is the company name? \n", + "13 13 5 sustainability-report-2019 What is the company name? \n", + "14 14 8 sustainability-report-2019 What is the company name? \n", + "15 15 10 sustainability-report-2019 What is the company name? \n", + "16 16 10 sustainability-report-2019 What is the company name? \n", + "17 17 10 sustainability-report-2019 What is the company name? \n", + "18 18 10 sustainability-report-2019 What is the company name? \n", + "19 19 11 sustainability-report-2019 What is the company name? \n", + "\n", + " text_b source \n", + "0 Equinor supports the Paris agreement and a net... Text \n", + "1 broad energy company is founded on a strong co... Text \n", + "2 Equinor and partners reached a final investmen... Text \n", + "3 awarded five major contracts. Equinor is posit... Text \n", + "4 Equinor is a values-based company. How we deli... Text \n", + "5 2019 marked the start-up of Johan Sverdrup – t... Text \n", + "6 For almost 50 years, Equinor has dedicated its... Text \n", + "7 Equinor is partnering with SSE Renewables to d... Text \n", + "8 Equinor Sustainability report 2019Introduction Text \n", + "9 We are Equinor, an international energy compan... Text \n", + "10 Equinor.com For further information about sust... Text \n", + "11 Sustainability is embedded in Equinor's: Text \n", + "12 Equinor’s purpose is to turn natural resources... Text \n", + "13 To be an industry leader in safety and security Text \n", + "14 us. Equinor has low carbon as one of the main ... Text \n", + "15 In 2019, Equinor reviewed its climate ambition... Text \n", + "16 Equinor’s Climate Roadmap sets out new short-,... Text \n", + "17 Equinor aims to reduce the CO₂ intensity of it... Text \n", + "18 Equinor Sustainability report 2019 Equinor Sus... Text \n", + "19 Global offshore wind major The past few years ... Text \n" + ] } ], "source": [ - "df_table_results = pd.read_csv(infer_config.result_dir['Text'] + \"/sustainability-report-2019_predictions_relevant.csv\")\n", - "df_table_results.head(20)" + "csvfiles = [f for f in glob.glob(infer_config.result_dir['Text'] + \"/*.csv\")]\n", + "for i in range(len(csvfiles)):\n", + " f = csvfiles[i]\n", + " df_table_results = pd.read_csv(f)\n", + " if i < 5:\n", + " print(f\"for file {f}:\")\n", + " print(df_table_results.head(20))\n", + " else:\n", + " print (f\"and file {f} (len = {len(df_table_results)})\")" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "015c992b-5460-4af8-ae15-09e68b4d62c9", - "metadata": {}, + "execution_count": 14, + "id": "14ae0386-ae1c-4eff-b826-5e07d7f46f47", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:27.309919Z", + "iopub.status.busy": "2022-07-02T00:30:27.309443Z", + "iopub.status.idle": "2022-07-02T00:30:27.474525Z", + "shell.execute_reply": "2022-07-02T00:30:27.473821Z" + }, + "papermill": { + "duration": 0.172658, + "end_time": "2022-07-02T00:30:27.476387", + "exception": false, + "start_time": "2022-07-02T00:30:27.303729", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ - "# upload the predicted files to s3\n", - "s3c.upload_files_in_dir_to_prefix(\n", - " infer_config.result_dir['Text'],\n", - " config.BASE_INFER_RELEVANCE_S3_PREFIX\n", - ")" + "if os.getenv(\"AUTOMATION\"):\n", + " # upload the predicted files to s3\n", + " s3c.upload_files_in_dir_to_prefix(\n", + " infer_config.result_dir['Text'],\n", + " config.BASE_INFER_RELEVANCE_S3_PREFIX\n", + " )" ] }, { "cell_type": "markdown", "id": "5226fc3d-87dc-432a-a571-5109d0ecfc86", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.004923, + "end_time": "2022-07-02T00:30:27.486643", + "exception": false, + "start_time": "2022-07-02T00:30:27.481720", + "status": "completed" + }, + "tags": [] + }, "source": [ "# Conclusion\n", "This notebook ran the _Relevance_ inference on a sample dataset and stored the output in a csv format." @@ -936,8 +899,20 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" + }, + "papermill": { + "default_parameters": {}, + "duration": 29.248741, + "end_time": "2022-07-02T00:30:28.613627", + "environment_variables": {}, + "exception": null, + "input_path": "/opt/app-root/src/aicoe-osc-demo/notebooks/demo2/infer_relevance.ipynb", + "output_path": "/opt/app-root/src/aicoe-osc-demo/notebooks/demo2/infer_relevance.ipynb", + "parameters": {}, + "start_time": "2022-07-02T00:29:59.364886", + "version": "2.3.4" } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/notebooks/demo2/pdf_text_extraction.ipynb b/notebooks/demo2/pdf_text_extraction.ipynb index 97d3f5f..2907243 100644 --- a/notebooks/demo2/pdf_text_extraction.ipynb +++ b/notebooks/demo2/pdf_text_extraction.ipynb @@ -2,7 +2,15 @@ "cells": [ { "cell_type": "markdown", + "id": "4bcaf08a", "metadata": { + "papermill": { + "duration": 0.00316, + "end_time": "2022-07-02T03:31:57.477214", + "exception": false, + "start_time": "2022-07-02T03:31:57.474054", + "status": "completed" + }, "tags": [] }, "source": [ @@ -13,7 +21,23 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "id": "8a565c11", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T03:31:57.485976Z", + "iopub.status.busy": "2022-07-02T03:31:57.485522Z", + "iopub.status.idle": "2022-07-02T03:32:07.113061Z", + "shell.execute_reply": "2022-07-02T03:32:07.112253Z" + }, + "papermill": { + "duration": 9.636404, + "end_time": "2022-07-02T03:32:07.116204", + "exception": false, + "start_time": "2022-07-02T03:31:57.479800", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# Author: ALLIANZ NLP esg data pipeline\n", @@ -28,7 +52,17 @@ }, { "cell_type": "markdown", - "metadata": {}, + "id": "ea8fcf11", + "metadata": { + "papermill": { + "duration": 0.002822, + "end_time": "2022-07-02T03:32:07.123649", + "exception": false, + "start_time": "2022-07-02T03:32:07.120827", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Injecting Credentials\n", "\n", @@ -52,7 +86,23 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "id": "a74d1cb5", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T03:32:07.130136Z", + "iopub.status.busy": "2022-07-02T03:32:07.129454Z", + "iopub.status.idle": "2022-07-02T03:32:07.140631Z", + "shell.execute_reply": "2022-07-02T03:32:07.139980Z" + }, + "papermill": { + "duration": 0.016427, + "end_time": "2022-07-02T03:32:07.142377", + "exception": false, + "start_time": "2022-07-02T03:32:07.125950", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# Load credentials\n", @@ -67,14 +117,30 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "id": "8585042a", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T03:32:07.148350Z", + "iopub.status.busy": "2022-07-02T03:32:07.147745Z", + "iopub.status.idle": "2022-07-02T03:32:07.229730Z", + "shell.execute_reply": "2022-07-02T03:32:07.228969Z" + }, + "papermill": { + "duration": 0.08687, + "end_time": "2022-07-02T03:32:07.231521", + "exception": false, + "start_time": "2022-07-02T03:32:07.144651", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# init s3 connector\n", "s3c = S3Communication(\n", " s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n", - " aws_access_key_id=os.getenv(\"AWS_ACCESS_KEY_ID\"),\n", - " aws_secret_access_key=os.getenv(\"AWS_SECRET_ACCESS_KEY\"),\n", + " aws_access_key_id=os.getenv(\"S3_LANDING_ACCESS_KEY\"),\n", + " aws_secret_access_key=os.getenv(\"S3_LANDING_SECRET_KEY\"),\n", " s3_bucket=os.getenv(\"S3_BUCKET\"),\n", ")" ] @@ -82,7 +148,23 @@ { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "id": "9b56396a", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T03:32:07.237858Z", + "iopub.status.busy": "2022-07-02T03:32:07.237339Z", + "iopub.status.idle": "2022-07-02T03:32:07.706584Z", + "shell.execute_reply": "2022-07-02T03:32:07.705710Z" + }, + "papermill": { + "duration": 0.476193, + "end_time": "2022-07-02T03:32:07.710208", + "exception": false, + "start_time": "2022-07-02T03:32:07.234015", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "# When running in Automation using Elyra and Kubeflow Pipelines,\n", @@ -95,33 +177,52 @@ " config.BASE_PDF_FOLDER.mkdir(parents=True, exist_ok=True)\n", "\n", " if not os.path.exists(config.BASE_EXTRACTION_FOLDER):\n", - " config.BASE_EXTRACTION_FOLDER.mkdir(parents=True, exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# download all files from which text is to be extracted\n", - "s3c.download_files_in_prefix_to_dir(\n", - " config.BASE_PDF_S3_PREFIX,\n", - " config.BASE_PDF_FOLDER,\n", - ")" + " config.BASE_EXTRACTION_FOLDER.mkdir(parents=True, exist_ok=True)\n", + "\n", + " # download all files from which text is to be extracted\n", + " s3c.download_files_in_prefix_to_dir(\n", + " config.BASE_PDF_S3_PREFIX,\n", + " config.BASE_PDF_FOLDER,\n", + " )" ] }, { "cell_type": "markdown", - "metadata": {}, + "id": "8c1214bf", + "metadata": { + "papermill": { + "duration": 0.002302, + "end_time": "2022-07-02T03:32:07.716069", + "exception": false, + "start_time": "2022-07-02T03:32:07.713767", + "status": "completed" + }, + "tags": [] + }, "source": [ "### Call text extracter" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, + "execution_count": 5, + "id": "b1f4fcc1", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T03:32:07.722836Z", + "iopub.status.busy": "2022-07-02T03:32:07.722162Z", + "iopub.status.idle": "2022-07-02T03:32:07.726137Z", + "shell.execute_reply": "2022-07-02T03:32:07.725503Z" + }, + "papermill": { + "duration": 0.026113, + "end_time": "2022-07-02T03:32:07.744804", + "exception": false, + "start_time": "2022-07-02T03:32:07.718691", + "status": "completed" + }, + "tags": [] + }, "outputs": [], "source": [ "PDFTextExtractor_kwargs = {\n", @@ -133,14 +234,35 @@ }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, + "execution_count": 6, + "id": "af666e22", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T03:32:07.751249Z", + "iopub.status.busy": "2022-07-02T03:32:07.750723Z" + }, + "papermill": { + "duration": null, + "end_time": null, + "exception": false, + "start_time": "2022-07-02T03:32:07.747600", + "status": "running" + }, + "tags": [] + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "['/opt/app-root/src/aicoe-osc-demo/data/pdfs/sustainability-report-2019.pdf']\n" + "['/opt/app-root/data/pdfs/04_NOVATEK_AR_2016_ENG_11.pdf', '/opt/app-root/data/pdfs/04_NOVATEK_AR_2018_ENG_15.pdf', '/opt/app-root/data/pdfs/2013_book_mol_ar_eng_fin.pdf', '/opt/app-root/data/pdfs/2015_BASF_Report.pdf', '/opt/app-root/data/pdfs/2017 Sustainability Report.pdf', '/opt/app-root/data/pdfs/2017-Sustainability-Report.pdf', '/opt/app-root/data/pdfs/2017_SustainabilityReport_2_9_Web.pdf', '/opt/app-root/data/pdfs/2017_sustainability_report.pdf', '/opt/app-root/data/pdfs/2017_sustainability_report_tcm14-130393.pdf', '/opt/app-root/data/pdfs/2018 Annual Report.pdf', '/opt/app-root/data/pdfs/2018_sustainability_report.pdf', '/opt/app-root/data/pdfs/2019 Annual Report.pdf', '/opt/app-root/data/pdfs/2019_global_sustainability_plan_tcm14-148662.pdf', '/opt/app-root/data/pdfs/28022019-Repsol-Annual-Financial-Report-2018_tcm14-147383.pdf', '/opt/app-root/data/pdfs/2_LOTOS_Group Directors Report 2019.pdf', '/opt/app-root/data/pdfs/AGL Energy Ltd Annual Report 2019.pdf', '/opt/app-root/data/pdfs/AGL Energy Ltd FY19 Carbon Scenario Analysis.pdf', '/opt/app-root/data/pdfs/AKERBP-Annual-Report-2016.pdf', '/opt/app-root/data/pdfs/AKERBP-Annual-Report-2017.pdf', '/opt/app-root/data/pdfs/ANNUAL REPORT 2017.pdf', '/opt/app-root/data/pdfs/AR_FS_2017_ENG.pdf', '/opt/app-root/data/pdfs/Adani Group Adani Enterprises Annual Report 2019.pdf', '/opt/app-root/data/pdfs/Adani Group Adani Power Annual Report 2019.pdf', '/opt/app-root/data/pdfs/Aker-BP-Annual-report-2018.pdf', '/opt/app-root/data/pdfs/Aker-BP-Sustainability-Report-2018-1.pdf', '/opt/app-root/data/pdfs/Aker-BP-Sustainability-Report-2019.pdf', '/opt/app-root/data/pdfs/Aker-BP-Sustainability-report-2017.pdf', '/opt/app-root/data/pdfs/Ameren Corporation Climate Risk Report 2019.pdf', '/opt/app-root/data/pdfs/Ameren Corporation Sustainability Template 2019.pdf', '/opt/app-root/data/pdfs/Ameren_2019_Annual_Report.pdf', '/opt/app-root/data/pdfs/Annual Report 2016_0.pdf', '/opt/app-root/data/pdfs/Annual-Report-2018.pdf', '/opt/app-root/data/pdfs/Annual-report-2019.pdf', '/opt/app-root/data/pdfs/Annual_Report_and_Accounts_Galp_2017_1.pdf', '/opt/app-root/data/pdfs/BASF_Report_2016.pdf', '/opt/app-root/data/pdfs/BASF_Report_2017.pdf', '/opt/app-root/data/pdfs/BASF_Report_2018.pdf', '/opt/app-root/data/pdfs/China Resources Power Holdings Co Ltd Annual Report 2018.pdf', '/opt/app-root/data/pdfs/China Resources Power Holdings Co Ltd Sustainable Development Report 2018.pdf', '/opt/app-root/data/pdfs/Coal India Ltd Annual Report 2018-2019.pdf', '/opt/app-root/data/pdfs/DTE Energy Co ESG Report 2019.pdf', '/opt/app-root/data/pdfs/DTEK BV Group Integrated Report 2017.pdf', '/opt/app-root/data/pdfs/DTEK BV Group Integrated Report 2018.pdf', '/opt/app-root/data/pdfs/Duke Energy Corporation Annual Report 2018.pdf', '/opt/app-root/data/pdfs/Duke Energy Corporation Sustainability Report 2018.pdf', '/opt/app-root/data/pdfs/Electricity Generating Authority of Thailand (EGAT) Corporate Website.pdf', '/opt/app-root/data/pdfs/Endesa SA Annual Report 2018.pdf', '/opt/app-root/data/pdfs/Endesa SA Sustainability Report 2018.pdf', '/opt/app-root/data/pdfs/Enel SA sustainability-report-2018.pdf', '/opt/app-root/data/pdfs/Enel SpA Annual Report 2019.pdf', '/opt/app-root/data/pdfs/Enel annual-report-2018.pdf', '/opt/app-root/data/pdfs/EniFor-2018-eng.pdf', '/opt/app-root/data/pdfs/Ervia-Annual-Report-2018.pdf', '/opt/app-root/data/pdfs/Eskom Holdings SOC Ltd Integrated Report 2019.pdf', '/opt/app-root/data/pdfs/FUGRO_JV2019_Clickable.pdf', '/opt/app-root/data/pdfs/GO SNG 2018 Eng annual.pdf', '/opt/app-root/data/pdfs/Galp_Integrated_Report_2018.pdf', '/opt/app-root/data/pdfs/Galp_PartI_IntegratedManagementReport_2019_ENG.pdf', '/opt/app-root/data/pdfs/Great Plains Energy Inc Sustainability Template 2019.pdf', '/opt/app-root/data/pdfs/J-POWER (Electric Power Development Co., Ltd.)_report.pdf', '/opt/app-root/data/pdfs/LSE_WG_2016.pdf', '/opt/app-root/data/pdfs/LUKOIL_ANNUAL_REPORT_2018_ENG.pdf', '/opt/app-root/data/pdfs/LUKOIL_SUSTAINABILITY_REPORT_2018.pdf', '/opt/app-root/data/pdfs/Maharashtra State Power Generation Co. Ltd._website.pdf', '/opt/app-root/data/pdfs/NRG Energy Inc_2018-nrg-sustainability-report.pdf', '/opt/app-root/data/pdfs/NRG Energy Inc_Form 10 K.pdf', '/opt/app-root/data/pdfs/NTPC Limited-Report-FY-19.pdf', '/opt/app-root/data/pdfs/NYSE_TOT_2015 annual.pdf', '/opt/app-root/data/pdfs/NYSE_TOT_2016 annual.pdf', '/opt/app-root/data/pdfs/NYSE_TOT_2017 annual.pdf', '/opt/app-root/data/pdfs/NYSE_TOT_2018 annual.pdf', '/opt/app-root/data/pdfs/ORI20022020_2019_annual_financial_report_tcm14-174953.pdf', '/opt/app-root/data/pdfs/PAO OGK-2_AR 2018.pdf', '/opt/app-root/data/pdfs/PGE-Presentation_Q42018_EN.pdf', '/opt/app-root/data/pdfs/PGE-in-transition_june_2019v_with comments.pdf', '/opt/app-root/data/pdfs/PGE_Mngmnt_Board_consolidated_report_PGE_CG_2018_with comments.pdf', '/opt/app-root/data/pdfs/PJSC Tatneft annual report 2015.pdf', '/opt/app-root/data/pdfs/PJSC Tatneft annual report 2017.pdf', '/opt/app-root/data/pdfs/PJSC Tatneft annual report 2018.pdf', '/opt/app-root/data/pdfs/RN_SR2018_eng_web_1 sustainability 2017.pdf', '/opt/app-root/data/pdfs/RN_SR_2016_EN(2) sustainabilitz 2016.pdf', '/opt/app-root/data/pdfs/RN_SR_2016_EN_2_ sustainabilitz 2016.pdf', '/opt/app-root/data/pdfs/RWE ESG presentation.pdf', '/opt/app-root/data/pdfs/RWE Investor Presentation.pdf', '/opt/app-root/data/pdfs/RWE-annual-report-2019.pdf', '/opt/app-root/data/pdfs/RWE-csr_overall-report-2019.pdf', '/opt/app-root/data/pdfs/RWE-factbook_report-2018.pdf', '/opt/app-root/data/pdfs/Rosneft_CSR18_EN_Book sustainabilitz 2018 .pdf', '/opt/app-root/data/pdfs/SAIPEM SUSTAINABILITY 2016.pdf', '/opt/app-root/data/pdfs/SG-RSE2016-2017-EN.pdf', '/opt/app-root/data/pdfs/SUEK_AR19_ENG.pdf', '/opt/app-root/data/pdfs/SaipemSustainability2018.pdf', '/opt/app-root/data/pdfs/Sustainability Report 2012_EN.pdf', '/opt/app-root/data/pdfs/Sustainability Report 2013_EN.pdf', '/opt/app-root/data/pdfs/Sustainability Report 2014_EN.pdf', '/opt/app-root/data/pdfs/Sustainability Report 2015_EN.pdf', '/opt/app-root/data/pdfs/Sustainability Report 2016_EN.pdf', '/opt/app-root/data/pdfs/Sustainability Report 2017_EN.pdf', '/opt/app-root/data/pdfs/Sustainability_Report_2017_Eng_small_1_.pdf', '/opt/app-root/data/pdfs/TGS-AR2019-FINAL-WEB-PAGES-reduced annual .pdf', '/opt/app-root/data/pdfs/Teekay-Corporation-2018-Sustainability-Report.pdf', '/opt/app-root/data/pdfs/Transocean_Sustain_digital_FN_4 2017_2018.pdf', '/opt/app-root/data/pdfs/Uniper_Sustainability_Report_2018.pdf', '/opt/app-root/data/pdfs/Uniper_Sustainability_Report_2019_EN.pdf', '/opt/app-root/data/pdfs/University of Plymouth Sustainability_Report_2018.pdf', '/opt/app-root/data/pdfs/VERBUND-Integrated-Annual-Report-2018-English.pdf', '/opt/app-root/data/pdfs/VERBUND-Integrated-Annual-Report-2019-Englisch.pdf', '/opt/app-root/data/pdfs/Vietnam Electricity Corporation (EVN)_AnnualReport2018(1).pdf', '/opt/app-root/data/pdfs/Wintershall Dea annual report 2019.pdf', '/opt/app-root/data/pdfs/Wintershall-Dea_Sustainability_Report_2019.pdf', '/opt/app-root/data/pdfs/Wood_Sustainability_Report_2018-19_web-1.pdf', '/opt/app-root/data/pdfs/XCEL Energy Inc_AR 2018.pdf', '/opt/app-root/data/pdfs/XCEL Energy Inc_CRR-Performance-Summary.pdf', '/opt/app-root/data/pdfs/annual 2015.pdf', '/opt/app-root/data/pdfs/annual 2016.pdf', '/opt/app-root/data/pdfs/annual 2017.pdf', '/opt/app-root/data/pdfs/annual 2018.pdf', '/opt/app-root/data/pdfs/annual-report-and-accounts.pdf', '/opt/app-root/data/pdfs/annual_report_2019_eng.pdf', '/opt/app-root/data/pdfs/ar_2016_book_eng_0606.pdf', '/opt/app-root/data/pdfs/ar_2019_e.pdf', '/opt/app-root/data/pdfs/dload_Sustainability Report 2018_en.pdf', '/opt/app-root/data/pdfs/dload_sustainabilityreport2019_EN.pdf', '/opt/app-root/data/pdfs/equinor-2019-annual-report-and-form-20f.pdf', '/opt/app-root/data/pdfs/gazprom-environmental-report-2018-en.pdf', '/opt/app-root/data/pdfs/gb_2019-en_interaktiv.pdf', '/opt/app-root/data/pdfs/hunting-annual-report-2015_Tu.pdf', '/opt/app-root/data/pdfs/mol_group_integrated_annual_report_2018.pdf', '/opt/app-root/data/pdfs/mol_plc_s_parent_company_annual_report_2017_en.pdf', '/opt/app-root/data/pdfs/national-grid-plc-annual-report-and-accounts-2013-14.pdf', '/opt/app-root/data/pdfs/national-grid-plc-annual-report-and-accounts.pdf', '/opt/app-root/data/pdfs/ng-40940-ng-ar-final-lores-2013-05-23.pdf', '/opt/app-root/data/pdfs/ng-annual-report-and-accounts-2018-19.pdf', '/opt/app-root/data/pdfs/odfjell-se-annual-report-2018.pdf', '/opt/app-root/data/pdfs/shell_annual_report_2018.pdf', '/opt/app-root/data/pdfs/shell_annual_report_2019.pdf', '/opt/app-root/data/pdfs/shell_sustainability_report_2018.pdf', '/opt/app-root/data/pdfs/shell_sustainability_report_2019.pdf', '/opt/app-root/data/pdfs/sr_2019_e.pdf', '/opt/app-root/data/pdfs/sustainability 2015.pdf', '/opt/app-root/data/pdfs/sustainability-report-2019.pdf', '/opt/app-root/data/pdfs/sustainability-report-repsol-2016-eng-april-baja_tcm14-63403.pdf', '/opt/app-root/data/pdfs/sustainable development 2017.pdf', '/opt/app-root/data/pdfs/vopak_annual_report_2019.pdf']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The PDF <_io.BufferedReader name='/opt/app-root/data/pdfs/2015_BASF_Report.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case\n" ] } ], @@ -151,20 +273,47 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, + "execution_count": null, + "id": "ae8a7c00", + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:29:45.138429Z", + "iopub.status.busy": "2022-07-02T00:29:45.137771Z", + "iopub.status.idle": "2022-07-02T00:29:58.517963Z", + "shell.execute_reply": "2022-07-02T00:29:58.517240Z" + }, + "papermill": { + "duration": null, + "end_time": null, + "exception": null, + "start_time": null, + "status": "pending" + }, + "tags": [] + }, "outputs": [], "source": [ - "# upload the extracted files to s3\n", - "s3c.upload_files_in_dir_to_prefix(\n", - " config.BASE_EXTRACTION_FOLDER,\n", - " config.BASE_EXTRACTION_S3_PREFIX\n", - ")" + "if os.getenv(\"AUTOMATION\"):\n", + " # upload the extracted files to s3\n", + " s3c.upload_files_in_dir_to_prefix(\n", + " config.BASE_EXTRACTION_FOLDER,\n", + " config.BASE_EXTRACTION_S3_PREFIX\n", + " )" ] }, { "cell_type": "markdown", - "metadata": {}, + "id": "3a3fb75c", + "metadata": { + "papermill": { + "duration": null, + "end_time": null, + "exception": null, + "start_time": null, + "status": "pending" + }, + "tags": [] + }, "source": [ "### Conclusion\n", "We called the Extractor class to extract text from the PDF and store the ouput in the `ROOT/data/extraction` folder." @@ -190,9 +339,21 @@ "pygments_lexer": "ipython3", "version": "3.8.8" }, + "papermill": { + "default_parameters": {}, + "duration": null, + "end_time": null, + "environment_variables": {}, + "exception": null, + "input_path": "/opt/app-root/src/aicoe-osc-demo/notebooks/demo2/pdf_text_extraction.ipynb", + "output_path": "/opt/app-root/src/aicoe-osc-demo/notebooks/demo2/pdf_text_extraction.ipynb", + "parameters": {}, + "start_time": "2022-07-02T03:31:56.146806", + "version": "2.3.4" + }, "requirements": "{\"packages\":{\"config\":\"*\",\"src\":\"*\"},\"requires\":{\"python_version\":\"3.8\"},\"sources\":[{\"name\":\"pypi\",\"url\":\"https://pypi.org/simple\",\"verify_ssl\":true}]}", "requirements_lock": "{\"_meta\":{\"sources\":[{\"url\":\"https://pypi.org/simple\",\"verify_ssl\":true,\"name\":\"pypi\"}],\"requires\":{\"python_version\":\"3.8\"},\"hash\":{\"sha256\":\"e8a991415fecbbd714573539a77d69175edbedb517f135bd18e59629878bbc23\"},\"pipfile-spec\":6},\"default\":{\"config\":{\"version\":\"==0.5.1\",\"hashes\":[\"sha256:2dd4a03aa383d30711d5a3325a1858de225328d61950a85be5b74c100f63016d\",\"sha256:79ffa009ff2663cc8ca29e56bcec031c044609d4bafaa4f884132a413101ce84\"],\"index\":\"pypi\"},\"src\":{\"version\":\"==0.0.7\",\"hashes\":[\"sha256:2a33bd2995800b2d3fc80efb94a300951413e02ede94691aef58bffbd69e5b0e\"],\"index\":\"pypi\"}},\"develop\":{}}" }, "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat_minor": 5 +} \ No newline at end of file