From cbe7686c922513e4bc51fc75cc7eb034f364d5e8 Mon Sep 17 00:00:00 2001 From: Shreyanand Date: Mon, 28 Feb 2022 21:06:57 +0000 Subject: [PATCH] Add move data util nb and update config Update model prefix in infer_kpi nb Co-authored-by: Karanraj Chauhan Signed-off-by: Shreyanand --- notebooks/demo2/config.py | 6 +- notebooks/demo2/infer_kpi.ipynb | 338 ++++++++++++++++++++-------- notebooks/move_data.ipynb | 203 +++++++++++++++++ src/components/utils/kpi_mapping.py | 2 +- 4 files changed, 453 insertions(+), 96 deletions(-) create mode 100644 notebooks/move_data.ipynb diff --git a/notebooks/demo2/config.py b/notebooks/demo2/config.py index f231129..7ad28f8 100644 --- a/notebooks/demo2/config.py +++ b/notebooks/demo2/config.py @@ -22,8 +22,8 @@ BASE_CURATION_FOLDER = DATA_FOLDER / "curation" BASE_INFER_KPI_FOLDER = DATA_FOLDER / "infer_KPI" -CHECKPOINT_S3_PREFIX = "corpdata/saved_models" -DATA_S3_PREFIX = "corpdata/ESG/pipeline_run/samples_10" +CHECKPOINT_S3_PREFIX = "aicoe-osc-demo/saved_models" +DATA_S3_PREFIX = "aicoe-osc-demo/pipeline_run/samples_4" BASE_PDF_S3_PREFIX = f"{DATA_S3_PREFIX}/pdfs" BASE_ANNOTATION_S3_PREFIX = f"{DATA_S3_PREFIX}/annotations" BASE_EXTRACTION_S3_PREFIX = f"{DATA_S3_PREFIX}/extraction" @@ -31,7 +31,7 @@ BASE_INFER_RELEVANCE_S3_PREFIX = f"{DATA_S3_PREFIX}/infer_relevance" BASE_INFER_KPI_S3_PREFIX = f"{DATA_S3_PREFIX}/infer_KPI" -BASE_INFER_KPI_TABLE_S3_PREFIX = "corpdata/ESG/KPI_table" +BASE_INFER_KPI_TABLE_S3_PREFIX = "aicoe-osc-demo/KPI_table" ckpt = "icdar_19b2_v2.pth" config_file = "cascade_mask_rcnn_hrnetv2p_w32_20e_v2.py" diff --git a/notebooks/demo2/infer_kpi.ipynb b/notebooks/demo2/infer_kpi.ipynb index bd8a4ed..46c5ce6 100644 --- a/notebooks/demo2/infer_kpi.ipynb +++ b/notebooks/demo2/infer_kpi.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "f79c4be0-b1f8-4902-8950-05841cdc41d6", "metadata": {}, "outputs": [ @@ -19,7 +19,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "11/09/2021 16:35:58 - INFO - farm.modeling.prediction_head - Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n" + "03/18/2022 15:57:45 - INFO - farm.modeling.prediction_head - Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n" ] } ], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "1dfe6b89-357c-4c83-848f-d2f43faaf805", "metadata": {}, "outputs": [], @@ -53,7 +53,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "f0982f03-be7f-4abc-ba30-ff1e842cf147", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "2e5136cb-7e32-4ca2-a533-db0b990f288e", "metadata": {}, "outputs": [], @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "e5607a79-d139-40e4-ae56-646cd54a1838", "metadata": {}, "outputs": [], @@ -111,14 +111,14 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "7c24a0d7-40c2-4da8-9e7f-66ce78386ec3", "metadata": {}, "outputs": [], "source": [ "model_root = pathlib.Path(file_config.saved_models_dir).parent\n", "model_rel_zip = pathlib.Path(model_root, 'KPI_EXTRACTION.zip')\n", - "s3c.download_file_from_s3(model_rel_zip, \"corpdata/saved_models\", \"KPI_EXTRACTION.zip\")\n", + "s3c.download_file_from_s3(model_rel_zip, config.CHECKPOINT_S3_PREFIX, \"KPI_EXTRACTION.zip\")\n", "with zipfile.ZipFile(pathlib.Path(model_root, 'KPI_EXTRACTION.zip'), 'r') as z:\n", " z.extractall(model_root)" ] @@ -142,17 +142,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "38a70174-65b3-4365-ba6f-d020457de4b7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/opt/app-root/src/aicoe-osc-demo/models/KPI_EXTRACTION'" + "'/opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/models/KPI_EXTRACTION'" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "b840d5cf-3d42-4e9e-be2d-d95634c4ac96", "metadata": {}, "outputs": [ @@ -171,8 +171,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "11/09/2021 16:37:20 - WARNING - farm.modeling.prediction_head - Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {\"training\": false, \"num_labels\": 2, \"ph_output_type\": \"per_token_squad\", \"model_type\": \"span_classification\", \"label_tensor_name\": \"question_answering_label_ids\", \"label_list\": [\"start_token\", \"end_token\"], \"metric\": \"squad\", \"name\": \"QuestionAnsweringHead\"}\n", - "11/09/2021 16:37:23 - WARNING - farm.infer - QAInferencer always has task_type='question_answering' even if another value is provided to Inferencer.load() or QAInferencer()\n" + "03/18/2022 16:00:30 - WARNING - farm.modeling.prediction_head - Some unused parameters are passed to the QuestionAnsweringHead. Might not be a problem. Params: {\"training\": false, \"num_labels\": 2, \"ph_output_type\": \"per_token_squad\", \"model_type\": \"span_classification\", \"label_tensor_name\": \"question_answering_label_ids\", \"label_list\": [\"start_token\", \"end_token\"], \"metric\": \"squad\", \"name\": \"QuestionAnsweringHead\"}\n", + "03/18/2022 16:00:34 - WARNING - farm.infer - QAInferencer always has task_type='question_answering' even if another value is provided to Inferencer.load() or QAInferencer()\n" ] } ], @@ -190,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "d506a8b3-f013-4eef-98fc-d1cdd1525a14", "metadata": {}, "outputs": [], @@ -209,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "id": "e92e6380-5440-499b-b4e8-6871dfc0ec13", "metadata": {}, "outputs": [ @@ -217,7 +217,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 4.79 Batches/s]" + "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 19.54 Batches/s]" ] }, { @@ -235,7 +235,7 @@ " 'offset_context_end': 414,\n", " 'offset_context_start': 314,\n", " 'probability': None,\n", - " 'score': 7.129119873046875},\n", + " 'score': 7.129114151000977},\n", " {'answer': 'no_answer',\n", " 'context': '',\n", " 'document_id': '0-0',\n", @@ -244,10 +244,10 @@ " 'offset_context_end': 0,\n", " 'offset_context_start': 0,\n", " 'probability': None,\n", - " 'score': -20.135550498962402}],\n", + " 'score': -20.135552406311035}],\n", " 'ground_truth': [],\n", " 'id': '0-0',\n", - " 'no_ans_gap': 27.264670372009277,\n", + " 'no_ans_gap': 27.26466655731201,\n", " 'question': 'What is the target year for climate '\n", " 'commitment?'}],\n", " 'task': 'qa'}\n" @@ -282,14 +282,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "274ed119-f30a-43c6-9fae-a49a59167648", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'score': 7.129119873046875,\n", + "{'score': 7.129114151000977,\n", " 'probability': None,\n", " 'answer': '2021',\n", " 'offset_answer_start': 362,\n", @@ -300,7 +300,7 @@ " 'document_id': '0-0'}" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -313,14 +313,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "id": "06544a27-29a1-43bc-a9b4-3983f9f8478c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'score': -20.135550498962402,\n", + "{'score': -20.135552406311035,\n", " 'probability': None,\n", " 'answer': 'no_answer',\n", " 'offset_answer_start': 0,\n", @@ -331,7 +331,7 @@ " 'document_id': '0-0'}" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -351,17 +351,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "id": "9ddef850-c2bc-412a-ba61-7c473cded67c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'Text': '/opt/app-root/src/aicoe-osc-demo/data/infer_relevance'}" + "{'Text': '/opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/data/infer_relevance'}" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -372,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "715e0399-beee-43ac-8ef3-50e97be6cbf6", "metadata": {}, "outputs": [ @@ -380,14 +380,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "11/09/2021 16:37:24 - INFO - src.models.text_kpi_infer - #################### Starting KPI Inference for the following relevance CSV files found in /opt/app-root/src/aicoe-osc-demo/data/infer_kpi:\n", - "['sustainability-report-2019_predictions_relevant.csv'] \n", - "11/09/2021 16:37:24 - INFO - src.models.text_kpi_infer - #################### 1/1\n", - "11/09/2021 16:37:24 - INFO - src.models.text_kpi_infer - Starting KPI Extraction for sustainability-report-2019\n", - "Inferencing Samples: 100%|██████████| 45/45 [01:08<00:00, 1.53s/ Batches]\n", - "11/09/2021 16:38:35 - ERROR - farm.modeling.predictions - Both start and end offsets should be 0: \n", - "226, 226 with a no_answer. \n", - "11/09/2021 16:38:35 - INFO - src.models.text_kpi_infer - Save the result of KPI extraction to /opt/app-root/src/aicoe-osc-demo/data/infer_kpi/sustainability-report-2019_predictions_kpi.csv\n" + "03/18/2022 16:00:38 - INFO - src.models.text_kpi_infer - #################### Starting KPI Inference for the following relevance CSV files found in /opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/data/infer_kpi:\n", + "['75506106_BOA_2016-12-31_predictions_relevant.csv', 'sustainability-report-2019_predictions_relevant.csv', '90044053_Fisher & Paykel Hl_2017-11-07_predictions_relevant.csv', '88094292_Carriage Svcs Inc_2019-07-23_predictions_relevant.csv'] \n", + "03/18/2022 16:00:38 - INFO - src.models.text_kpi_infer - #################### 1/4\n", + "03/18/2022 16:00:38 - INFO - src.models.text_kpi_infer - Starting KPI Extraction for 75506106_BOA_2016-12-31\n", + "Inferencing Samples: 100%|██████████| 5/5 [00:01<00:00, 4.29 Batches/s]\n", + "03/18/2022 16:00:39 - INFO - src.models.text_kpi_infer - Save the result of KPI extraction to /opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/data/infer_kpi/75506106_BOA_2016-12-31_predictions_kpi.csv\n", + "03/18/2022 16:00:39 - INFO - src.models.text_kpi_infer - #################### 2/4\n", + "03/18/2022 16:00:39 - INFO - src.models.text_kpi_infer - Starting KPI Extraction for sustainability-report-2019\n", + "Inferencing Samples: 100%|██████████| 35/35 [00:07<00:00, 4.42 Batches/s]\n", + "03/18/2022 16:00:49 - ERROR - farm.modeling.predictions - Both start and end offsets should be 0: \n", + "86, 86 with a no_answer. \n", + "03/18/2022 16:00:49 - INFO - src.models.text_kpi_infer - Save the result of KPI extraction to /opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/data/infer_kpi/sustainability-report-2019_predictions_kpi.csv\n", + "03/18/2022 16:00:49 - INFO - src.models.text_kpi_infer - #################### 3/4\n", + "03/18/2022 16:00:49 - INFO - src.models.text_kpi_infer - Starting KPI Extraction for 90044053_Fisher & Paykel Hl_2017-11-07\n", + "Inferencing Samples: 100%|██████████| 9/9 [00:02<00:00, 4.47 Batches/s]\n", + "03/18/2022 16:00:51 - INFO - src.models.text_kpi_infer - Save the result of KPI extraction to /opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/data/infer_kpi/90044053_Fisher & Paykel Hl_2017-11-07_predictions_kpi.csv\n", + "03/18/2022 16:00:51 - INFO - src.models.text_kpi_infer - #################### 4/4\n", + "03/18/2022 16:00:51 - INFO - src.models.text_kpi_infer - Starting KPI Extraction for 88094292_Carriage Svcs Inc_2019-07-23\n", + "Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 28.57 Batches/s]\n", + "03/18/2022 16:00:52 - INFO - src.models.text_kpi_infer - Save the result of KPI extraction to /opt/app-root/src/aicoe-osc-demo-2022-02-28-14-32/data/infer_kpi/88094292_Carriage Svcs Inc_2019-07-23_predictions_kpi.csv\n" ] }, { @@ -427,91 +439,233 @@ " \n", " \n", " 0\n", - " sustainability-report-2019\n", - " What is the company name?\n", + " 75506106_BOA_2016-12-31\n", + " In which year was the annual report or the sus...\n", " NaN\n", - " Equinor ASA\n", - " 32.0\n", - " Equinor ASA Box 8500 NO-4035 Stavanger Norway ...\n", + " 2015-2016\n", + " 24.0\n", + " Nombre de projets ayant atteint le closing fin...\n", " Text\n", - " 16.310862\n", - " -10.405624\n", - " -25.405624\n", + " 11.181186\n", + " -9.148028\n", + " -24.148028\n", " NaN\n", " \n", " \n", " 1\n", - " sustainability-report-2019\n", - " What is the company name?\n", + " 75506106_BOA_2016-12-31\n", + " In which year was the annual report or the sus...\n", " NaN\n", - " Equinor\n", - " 27.0\n", - " To show our commitment to equal and inclusive ...\n", + " 2016\n", + " 30.0\n", + " L’Atelier Finance Climat pour l’Afrique Franc...\n", " Text\n", - " 16.253649\n", - " -7.369206\n", - " -22.369206\n", + " 11.143324\n", + " -9.074394\n", + " -24.074394\n", " NaN\n", " \n", " \n", " 2\n", - " sustainability-report-2019\n", - " What is the company name?\n", + " 75506106_BOA_2016-12-31\n", + " In which year was the annual report or the sus...\n", " NaN\n", - " Equinor\n", - " 29.0\n", - " Payments made directly by Equinor to governmen...\n", + " 2016\n", + " 48.0\n", + " Au cours de l’anne 2016, EFE-Maroc a form 4 74...\n", " Text\n", - " 16.137005\n", - " -8.840895\n", - " -23.840895\n", + " 11.117598\n", + " -9.312975\n", + " -24.312975\n", " NaN\n", " \n", " \n", " 3\n", - " sustainability-report-2019\n", + " 75506106_BOA_2016-12-31\n", + " In which year was the annual report or the sus...\n", + " NaN\n", + " 2016\n", + " 30.0\n", + " Business Climate Summit, 28-29 juin 2016, Lond...\n", + " Text\n", + " 11.020465\n", + " -8.655083\n", + " -23.655083\n", + " NaN\n", + " \n", + " \n", + " 4\n", + " 75506106_BOA_2016-12-31\n", + " What is the base year for carbon reduction com...\n", + " NaN\n", + " 2016\n", + " 30.0\n", + " Business Climate Summit, 28-29 juin 2016, Lond...\n", + " Text\n", + " -8.263815\n", + " 6.064293\n", + " -8.935707\n", + " NaN\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 51\n", + " 90044053_Fisher & Paykel Hl_2017-11-07\n", + " What is the volume of estimated proven hydroca...\n", + " NaN\n", + " no_answer\n", + " NaN\n", + " NaN\n", + " Text\n", + " 2.527988\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 52\n", + " 90044053_Fisher & Paykel Hl_2017-11-07\n", + " What is the volume of estimated proven hydroca...\n", + " NaN\n", + " 3\n", + " 26.0\n", + " Within the first month, we were able to increas...\n", + " Text\n", + " -9.606719\n", + " 17.527988\n", + " 2.527988\n", + " NaN\n", + " \n", + " \n", + " 53\n", + " 90044053_Fisher & Paykel Hl_2017-11-07\n", + " What is the volume of estimated proven hydroca...\n", + " NaN\n", + " 0.33 tCO2e/ NZ$M\n", + " 25.0\n", + " This financial year we measured waste outputs i...\n", + " Text\n", + " -9.643618\n", + " 17.502722\n", + " 2.502722\n", + " NaN\n", + " \n", + " \n", + " 0\n", + " 88094292_Carriage Svcs Inc_2019-07-23\n", + " In which year was the annual report or the sus...\n", + " NaN\n", + " February 15, 2017\n", + " 0.0\n", + " CARRIAGE SERVICES, INC. (the Company) CORPORAT...\n", + " Text\n", + " 8.340643\n", + " -9.459160\n", + " -24.459160\n", + " NaN\n", + " \n", + " \n", + " 1\n", + " 88094292_Carriage Svcs Inc_2019-07-23\n", " What is the company name?\n", " NaN\n", - " Equinor Sustainability\n", - " 20.0\n", - " Equinor Sustainability report 2019 Always safe...\n", + " STOCKHOLDER\n", + " 5.0\n", + " STOCKHOLDER COMMUNICATIONS WITH DIRECTORS\n", " Text\n", - " 15.984804\n", - " -5.345298\n", - " -20.345298\n", + " 11.089083\n", + " -5.213624\n", + " -20.213624\n", " NaN\n", " \n", " \n", "\n", + "

189 rows × 11 columns

\n", "" ], "text/plain": [ - " pdf_name kpi kpi_id \\\n", - "0 sustainability-report-2019 What is the company name? NaN \n", - "1 sustainability-report-2019 What is the company name? NaN \n", - "2 sustainability-report-2019 What is the company name? NaN \n", - "3 sustainability-report-2019 What is the company name? NaN \n", + " pdf_name \\\n", + "0 75506106_BOA_2016-12-31 \n", + "1 75506106_BOA_2016-12-31 \n", + "2 75506106_BOA_2016-12-31 \n", + "3 75506106_BOA_2016-12-31 \n", + "4 75506106_BOA_2016-12-31 \n", + ".. ... \n", + "51 90044053_Fisher & Paykel Hl_2017-11-07 \n", + "52 90044053_Fisher & Paykel Hl_2017-11-07 \n", + "53 90044053_Fisher & Paykel Hl_2017-11-07 \n", + "0 88094292_Carriage Svcs Inc_2019-07-23 \n", + "1 88094292_Carriage Svcs Inc_2019-07-23 \n", "\n", - " answer page \\\n", - "0 Equinor ASA 32.0 \n", - "1 Equinor 27.0 \n", - "2 Equinor 29.0 \n", - "3 Equinor Sustainability 20.0 \n", + " kpi kpi_id \\\n", + "0 In which year was the annual report or the sus... NaN \n", + "1 In which year was the annual report or the sus... NaN \n", + "2 In which year was the annual report or the sus... NaN \n", + "3 In which year was the annual report or the sus... NaN \n", + "4 What is the base year for carbon reduction com... NaN \n", + ".. ... ... \n", + "51 What is the volume of estimated proven hydroca... NaN \n", + "52 What is the volume of estimated proven hydroca... NaN \n", + "53 What is the volume of estimated proven hydroca... NaN \n", + "0 In which year was the annual report or the sus... NaN \n", + "1 What is the company name? NaN \n", "\n", - " paragraph source score \\\n", - "0 Equinor ASA Box 8500 NO-4035 Stavanger Norway ... Text 16.310862 \n", - "1 To show our commitment to equal and inclusive ... Text 16.253649 \n", - "2 Payments made directly by Equinor to governmen... Text 16.137005 \n", - "3 Equinor Sustainability report 2019 Always safe... Text 15.984804 \n", + " answer page \\\n", + "0 2015-2016 24.0 \n", + "1 2016 30.0 \n", + "2 2016 48.0 \n", + "3 2016 30.0 \n", + "4 2016 30.0 \n", + ".. ... ... \n", + "51 no_answer NaN \n", + "52 3 26.0 \n", + "53 0.33 tCO2e/ NZ$M 25.0 \n", + "0 February 15, 2017 0.0 \n", + "1 STOCKHOLDER 5.0 \n", "\n", - " no_ans_score no_answer_score_plus_boost index \n", - "0 -10.405624 -25.405624 NaN \n", - "1 -7.369206 -22.369206 NaN \n", - "2 -8.840895 -23.840895 NaN \n", - "3 -5.345298 -20.345298 NaN " + " paragraph source score \\\n", + "0 Nombre de projets ayant atteint le closing fin... Text 11.181186 \n", + "1 L’Atelier Finance Climat pour l’Afrique Franc... Text 11.143324 \n", + "2 Au cours de l’anne 2016, EFE-Maroc a form 4 74... Text 11.117598 \n", + "3 Business Climate Summit, 28-29 juin 2016, Lond... Text 11.020465 \n", + "4 Business Climate Summit, 28-29 juin 2016, Lond... Text -8.263815 \n", + ".. ... ... ... \n", + "51 NaN Text 2.527988 \n", + "52 Within the first month, we were able to increas... Text -9.606719 \n", + "53 This financial year we measured waste outputs i... Text -9.643618 \n", + "0 CARRIAGE SERVICES, INC. (the Company) CORPORAT... Text 8.340643 \n", + "1 STOCKHOLDER COMMUNICATIONS WITH DIRECTORS Text 11.089083 \n", + "\n", + " no_ans_score no_answer_score_plus_boost index \n", + "0 -9.148028 -24.148028 NaN \n", + "1 -9.074394 -24.074394 NaN \n", + "2 -9.312975 -24.312975 NaN \n", + "3 -8.655083 -23.655083 NaN \n", + "4 6.064293 -8.935707 NaN \n", + ".. ... ... ... \n", + "51 NaN NaN NaN \n", + "52 17.527988 2.527988 NaN \n", + "53 17.502722 2.502722 NaN \n", + "0 -9.459160 -24.459160 NaN \n", + "1 -5.213624 -20.213624 NaN \n", + "\n", + "[189 rows x 11 columns]" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -522,7 +676,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "ebfbd2f9-1bd8-4dc4-a581-67b86b0df92f", "metadata": {}, "outputs": [], @@ -560,7 +714,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.8.8" } }, "nbformat": 4, diff --git a/notebooks/move_data.ipynb b/notebooks/move_data.ipynb new file mode 100644 index 0000000..ada30a9 --- /dev/null +++ b/notebooks/move_data.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8a1b9496-4187-40e1-85fa-b8111c971b8e", + "metadata": {}, + "source": [ + "# Move data \n", + "In this notebook, we take all the relevant data for the inference notebooks and pipeline stored in `ocp-odh-os-demo-s3` bucket and move it to the `redhat-osc-physical-landing` bucket." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a73004a-934e-4cbe-bf5d-301dac96e8f8", + "metadata": {}, + "outputs": [], + "source": [ + "from src.data.s3_communication import S3Communication\n", + "import os\n", + "import pathlib\n", + "from dotenv import load_dotenv\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bfc74b4-47e7-46ca-986e-7c34f73e6ea4", + "metadata": {}, + "outputs": [], + "source": [ + "dotenv_dir = os.environ.get(\n", + " \"CREDENTIAL_DOTENV_DIR\", os.environ.get(\"PWD\", \"/opt/app-root/src\")\n", + ")\n", + "dotenv_path = pathlib.Path(dotenv_dir) / \"credentials.env\"\n", + "if os.path.exists(dotenv_path):\n", + " load_dotenv(dotenv_path=dotenv_path, override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "8e1438eb-2efa-4b1a-a90f-34a39a68adc9", + "metadata": {}, + "source": [ + "### Red Hat Physical landing bucket (`redhat-osc-physical-landing`) s3 connector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4944bf4d-4863-4e8d-b4a8-4ade481217a7", + "metadata": {}, + "outputs": [], + "source": [ + "# init s3 connector\n", + "s3c = S3Communication(\n", + " s3_endpoint_url=os.getenv(\"S3_ENDPOINT\"),\n", + " aws_access_key_id=os.getenv(\"AWS_ACCESS_KEY_ID\"),\n", + " aws_secret_access_key=os.getenv(\"AWS_SECRET_ACCESS_KEY\"),\n", + " s3_bucket=os.getenv(\"S3_BUCKET\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "794b283e-310a-4995-8504-db3a2c9c9372", + "metadata": {}, + "source": [ + "### Trino bucket (`ocp-odh-os-demo-s3`) s3 connector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e931a663-7ebe-4855-bc02-33e793a94fd8", + "metadata": {}, + "outputs": [], + "source": [ + "s3c_trino = S3Communication(\n", + " s3_endpoint_url=os.getenv(\"Trino_S3_ENDPOINT\"),\n", + " aws_access_key_id=os.getenv(\"Trino_ACCESS_KEY\"),\n", + " aws_secret_access_key=os.getenv(\"Trino_SECRET_KEY\"),\n", + " s3_bucket=os.getenv(\"Trino_S3_BUCKET\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b26887a-bd17-4545-8f20-c03ea831135d", + "metadata": {}, + "source": [ + "### Download relevant files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06ba910f-7503-4753-8299-a57851d7f17a", + "metadata": {}, + "outputs": [], + "source": [ + "# Download data pdfs\n", + "s3c_trino.download_files_in_prefix_to_dir('corpdata/ESG/pipeline_run/samples_4/pdfs', \"./data/samples_4\")\n", + "s3c_trino.download_files_in_prefix_to_dir('corpdata/ESG/pipeline_run/samples_10/pdfs', \"./data/samples_10\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "529b0fde-da93-4be0-b588-4855ad885123", + "metadata": {}, + "outputs": [], + "source": [ + "# Download pretrained models\n", + "s3c_trino.download_files_in_prefix_to_dir('corpdata/saved_models', \"./models\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a66cdee-cc23-492e-a8af-259c343a1da9", + "metadata": {}, + "outputs": [], + "source": [ + "# Download kpi_mapping.csv\n", + "s3c_trino.download_file_from_s3(\"./kpi_mapping.csv\", \"corpdata/ESG/kpi_mapping\", 'kpi_mapping.csv')" + ] + }, + { + "cell_type": "markdown", + "id": "2e1269fb-62de-46ef-a0e0-9509486ca85d", + "metadata": {}, + "source": [ + "### Upload relevant files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e73ab83d-cdeb-4e0b-9476-4d70190f3b13", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload data pdfs\n", + "s3c.upload_files_in_dir_to_prefix(\"./data/samples_4\", \"aicoe-osc-demo/pipeline_run/samples_4/pdfs\")\n", + "s3c.upload_files_in_dir_to_prefix(\"./data/samples_10\", \"aicoe-osc-demo/pipeline_run/samples_10/pdfs\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "433594d3-c8bd-4395-bd39-439c765cf95b", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload pretrained models\n", + "s3c.upload_files_in_dir_to_prefix(\"./models\", \"aicoe-osc-demo/saved_models\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58fdf1c9-b810-437c-988c-8d7d85d90ad2", + "metadata": {}, + "outputs": [], + "source": [ + "# Upload kpi_mapping.csv\n", + "s3c.upload_file_to_s3(\"./kpi_mapping.csv\", \"aicoe-osc-demo/kpi_mapping\", \"kpi_mapping.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "708c091a-e507-4a40-acd4-29bbff8ea16d", + "metadata": {}, + "source": [ + "# Conclusion\n", + "The notebook uses `S3Communication` class to move data from one bucket to another." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/components/utils/kpi_mapping.py b/src/components/utils/kpi_mapping.py index f07e0d2..1181e60 100755 --- a/src/components/utils/kpi_mapping.py +++ b/src/components/utils/kpi_mapping.py @@ -22,7 +22,7 @@ # Read kpi mapping csv from s3 df = s3c.download_df_from_s3( - "corpdata/ESG/kpi_mapping", + "aicoe-osc-demo/kpi_mapping", "kpi_mapping.csv", filetype=S3FileType.CSV, header=0,