From 8690c44ee067257634df0e30c9d516f0e3cfbf07 Mon Sep 17 00:00:00 2001 From: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:42:47 -0400 Subject: [PATCH] Make use of AUTOMATION env var consistent across demo2 notebooks (#167) * These changes make use of AUTOMATION env var consistent across notebooks. * Restore previous contents so we can fix CL1->CL2 references and Demo3 refactor as separate PRs. * Run notebooks preserving output cells (using samples_1 instead of samples_145 S3 pipeline_run folder). --- README.md | 2 +- notebooks/demo2/config.py | 2 +- notebooks/demo2/create_results_table.ipynb | 157 ++-- notebooks/demo2/infer_kpi.ipynb | 753 +++++++++++++++-- notebooks/demo2/infer_relevance.ipynb | 897 ++++++++++----------- notebooks/demo2/pdf_text_extraction.ipynb | 235 +++++- 6 files changed, 1413 insertions(+), 633 deletions(-) diff --git a/README.md b/README.md index b47b26b..da2c04c 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The following demos provide examples of how to use the tools available with [Ope * [Ingest raw data from S3 as tables on Trino](notebooks/demo1/demo1-create-tables.ipynb) * [Run SQL queries from a Jupyter Notebook environment](notebooks/demo1/demo1-join-tables.ipynb) * [Demo 1 Elyra Pipeline](https://github.com/os-climate/aicoe-osc-demo/blob/master/notebooks/demo1/demo1.pipeline) -* [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3/) +* [Results visualized on a Superset Dashboard](https://superset-secure-odh-superset.apps.odh-cl1.apps.os-climate.org/superset/dashboard/3) * [Video on creating Elyra Pipelines and Superset Dashboard](https://youtu.be/TFgsR7UlcHA) diff --git a/notebooks/demo2/config.py b/notebooks/demo2/config.py index 95938c3..f1e9c06 100644 --- a/notebooks/demo2/config.py +++ b/notebooks/demo2/config.py @@ -21,7 +21,7 @@ BASE_INFER_KPI_FOLDER = DATA_FOLDER / "infer_KPI" CHECKPOINT_S3_PREFIX = "aicoe-osc-demo/saved_models" -DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_145" +DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_1" BASE_PDF_S3_PREFIX = f"{DATA_S3_PREFIX}/pdfs" BASE_ANNOTATION_S3_PREFIX = f"{DATA_S3_PREFIX}/annotations" BASE_EXTRACTION_S3_PREFIX = f"{DATA_S3_PREFIX}/extraction" diff --git a/notebooks/demo2/create_results_table.ipynb b/notebooks/demo2/create_results_table.ipynb index 6b8c270..56cd8c0 100644 --- a/notebooks/demo2/create_results_table.ipynb +++ b/notebooks/demo2/create_results_table.ipynb @@ -65,7 +65,9 @@ "outputs": [], "source": [ "# Load credentials\n", - "dotenv_dir = \"/opt/app-root/src/aicoe-osc-demo\"\n", + "dotenv_dir = os.environ.get(\n", + " \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n", + ")\n", "dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n", "if os.path.exists(dotenv_path):\n", " load_dotenv(dotenv_path=dotenv_path, override=True)" @@ -106,7 +108,13 @@ "source": [ "if os.getenv(\"AUTOMATION\"):\n", " if not os.path.exists(config.BASE_INFER_KPI_FOLDER):\n", - " pathlib.Path(config.BASE_INFER_KPI_FOLDER).mkdir(parents=True, exist_ok=True)" + " pathlib.Path(config.BASE_INFER_KPI_FOLDER).mkdir(parents=True, exist_ok=True)\n", + "\n", + " # Download a sample dataset file from s3\n", + " s3c.download_files_in_prefix_to_dir(\n", + " s3_prefix=config.BASE_INFER_KPI_S3_PREFIX,\n", + " destination_dir=config.BASE_INFER_KPI_FOLDER\n", + " )" ] }, { @@ -151,66 +159,66 @@ "
\n", "559 rows × 5 columns
\n", "" ], "text/plain": [ - " page pdf_name \\\n", - "0 1 sustainability-report-2019 \n", - "1 1 sustainability-report-2019 \n", - "2 1 sustainability-report-2019 \n", - "3 1 sustainability-report-2019 \n", - "4 1 sustainability-report-2019 \n", - ".. ... ... \n", - "554 29 sustainability-report-2019 \n", - "555 29 sustainability-report-2019 \n", - "556 29 sustainability-report-2019 \n", - "557 29 sustainability-report-2019 \n", - "558 30 sustainability-report-2019 \n", - "\n", - " text \\\n", - "0 What is the company name? \n", - "1 What is the company name? \n", - "2 What is the company name? \n", - "3 What is the company name? \n", - "4 What is the company name? \n", - ".. ... \n", - "554 What is the total amount of scope 1, scope 2 a... \n", - "555 What is the total amount of scope 1, scope 2 a... \n", - "556 What is the total amount of scope 1, scope 2 a... \n", - "557 What is the total amount of scope 1, scope 2 a... \n", - "558 What is the total amount of scope 1, scope 2 a... \n", - "\n", - " text_b source \n", - "0 Equinor supports the Paris agreement and a net... Text \n", - "1 broad energy company is founded on a strong co... Text \n", - "2 Equinor and partners reached a final investmen... Text \n", - "3 awarded five major contracts. Equinor is posit... Text \n", - "4 Equinor is a values-based company. How we deli... Text \n", - ".. ... ... \n", - "554 GHG emissions associated with the production a... Text \n", - "555 Indirect GHG emissions from energy imported fr... Text \n", - "556 Upstream carbon dioxide (CO₂ ) emission intensity Text \n", - "557 Total scope one emissions of CO₂ (kg CO₂) from... Text \n", - "558 b) Disclose Scope 1, Scope 2, and, if appropri... Text \n", - "\n", - "[559 rows x 5 columns]" + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -614,305 +735,147 @@ { "cell_type": "markdown", "id": "2d78421f-ace9-4c19-ab0f-65e31fabcd06", - "metadata": {}, + "metadata": { + "papermill": { + "duration": 0.00472, + "end_time": "2022-07-02T00:30:27.269127", + "exception": false, + "start_time": "2022-07-02T00:30:27.264407", + "status": "completed" + }, + "tags": [] + }, "source": [ "The results are saved in a CSV. For each table, the extracted text, as well as the page number from the source pdf file are saved." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "4ef54093-09ae-4fb9-bb77-6d71693a01ea", - "metadata": {}, + "metadata": { + "execution": { + "iopub.execute_input": "2022-07-02T00:30:27.280244Z", + "iopub.status.busy": "2022-07-02T00:30:27.279731Z", + "iopub.status.idle": "2022-07-02T00:30:27.296605Z", + "shell.execute_reply": "2022-07-02T00:30:27.295956Z" + }, + "papermill": { + "duration": 0.024431, + "end_time": "2022-07-02T00:30:27.298328", + "exception": false, + "start_time": "2022-07-02T00:30:27.273897", + "status": "completed" + }, + "tags": [] + }, "outputs": [ { - "data": { - "text/html": [ - "\n", - " | Unnamed: 0 | \n", - "page | \n", - "pdf_name | \n", - "text | \n", - "text_b | \n", - "source | \n", - "
---|---|---|---|---|---|---|
0 | \n", - "0 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor supports the Paris agreement and a net... | \n", - "Text | \n", - "
1 | \n", - "1 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "broad energy company is founded on a strong co... | \n", - "Text | \n", - "
2 | \n", - "2 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor and partners reached a final investmen... | \n", - "Text | \n", - "
3 | \n", - "3 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "awarded five major contracts. Equinor is posit... | \n", - "Text | \n", - "
4 | \n", - "4 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor is a values-based company. How we deli... | \n", - "Text | \n", - "
5 | \n", - "5 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "2019 marked the start-up of Johan Sverdrup – t... | \n", - "Text | \n", - "
6 | \n", - "6 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "For almost 50 years, Equinor has dedicated its... | \n", - "Text | \n", - "
7 | \n", - "7 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor is partnering with SSE Renewables to d... | \n", - "Text | \n", - "
8 | \n", - "8 | \n", - "1 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor Sustainability report 2019Introduction | \n", - "Text | \n", - "
9 | \n", - "9 | \n", - "2 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "We are Equinor, an international energy compan... | \n", - "Text | \n", - "
10 | \n", - "10 | \n", - "3 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor.com For further information about sust... | \n", - "Text | \n", - "
11 | \n", - "11 | \n", - "4 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Sustainability is embedded in Equinor's: | \n", - "Text | \n", - "
12 | \n", - "12 | \n", - "4 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor’s purpose is to turn natural resources... | \n", - "Text | \n", - "
13 | \n", - "13 | \n", - "5 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "To be an industry leader in safety and security | \n", - "Text | \n", - "
14 | \n", - "14 | \n", - "8 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "us. Equinor has low carbon as one of the main ... | \n", - "Text | \n", - "
15 | \n", - "15 | \n", - "10 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "In 2019, Equinor reviewed its climate ambition... | \n", - "Text | \n", - "
16 | \n", - "16 | \n", - "10 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor’s Climate Roadmap sets out new short-,... | \n", - "Text | \n", - "
17 | \n", - "17 | \n", - "10 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor aims to reduce the CO₂ intensity of it... | \n", - "Text | \n", - "
18 | \n", - "18 | \n", - "10 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Equinor Sustainability report 2019 Equinor Sus... | \n", - "Text | \n", - "
19 | \n", - "19 | \n", - "11 | \n", - "sustainability-report-2019 | \n", - "What is the company name? | \n", - "Global offshore wind major The past few years ... | \n", - "Text | \n", - "