From bbfba524dc391d0c8d232a59589be293b33d6a01 Mon Sep 17 00:00:00 2001 From: Laura Porter Date: Mon, 18 Jul 2022 15:03:03 +0100 Subject: [PATCH 1/3] Add some logging to indicate when publishing a judgment fails for any reason If, when publishing a judgment, copying the assets from the private to public bucket fails for any reason, we need some logging to swhow it's happened and to help with debugging. --- judgments/views.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/judgments/views.py b/judgments/views.py index 443cddaa1..375367491 100644 --- a/judgments/views.py +++ b/judgments/views.py @@ -1,3 +1,4 @@ +import logging import math import re import time @@ -381,7 +382,12 @@ def publish_documents(uri: str) -> None: if not key.endswith("parser.log") and not key.endswith(".tar.gz"): source = {"Bucket": private_bucket, "Key": key} extra_args = {"ACL": "public-read"} - client.copy(source, public_bucket, key, extra_args) + try: + client.copy(source, public_bucket, key, extra_args) + except botocore.client.ClientError as e: + logging.warning( + f"Unable to copy file {key} to new location {public_bucket}, error: {e}" + ) def unpublish_documents(uri: str) -> None: From 660d938e29b10e4fdb3b8c63c220c836922649ad Mon Sep 17 00:00:00 2001 From: Laura Porter Date: Mon, 18 Jul 2022 15:04:44 +0100 Subject: [PATCH 2/3] Strip left slash from uri when listing objects in an S3 bucket The background: When moving a "failure" judgment from a failure URI to a real, NCN-generated URI, the judgment's assets (docx, pdf and images) were moving to a new prefix in the unpublished assets bucket. But when publishing the judgment, the assets were not being copied into the published assets bucket. At first we thought this was due to permissions or encryption on the new assets. Possinly caused by something to do with permissions differences between an object created with PutObject, vs an object created with CopyObject. After much digging, we discovered that the assets were not being published because the call to `client.list_objects` in `publish_documents()` was always returning an empty array. After even more digging, we discovered that `list_objects` doesn't recognise bucket prefixes that start with a leading slash `/`. Trailing slashes are fine but leading slashes are not. For now, trim the leading slash wherever we perform the `list_objects` action on a bucket. TODO: somehow remove the leading slash on a URI everywhere. --- judgments/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/judgments/views.py b/judgments/views.py index 375367491..02154f5dd 100644 --- a/judgments/views.py +++ b/judgments/views.py @@ -374,6 +374,7 @@ def publish_documents(uri: str) -> None: public_bucket = env("PUBLIC_ASSET_BUCKET") private_bucket = env("PRIVATE_ASSET_BUCKET") + uri = uri.lstrip('/') response = client.list_objects(Bucket=private_bucket, Prefix=uri) for result in response.get("Contents", []): @@ -391,6 +392,7 @@ def publish_documents(uri: str) -> None: def unpublish_documents(uri: str) -> None: + uri = uri.lstrip('/') delete_from_bucket(uri, env("PUBLIC_ASSET_BUCKET")) From 406772a63847521abebd0c2cf56724d1870d2d6f Mon Sep 17 00:00:00 2001 From: Laura Porter Date: Mon, 18 Jul 2022 16:01:44 +0100 Subject: [PATCH 3/3] Refactor multiple `uri.lstrip()` calls into a named method To make it more obvious that we are preparing a uri for s3, by stripping its leading slash. S3 does not like leading slashes on its bucket prefixes! --- judgments/utils.py | 8 ++++++-- judgments/views.py | 17 +++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/judgments/utils.py b/judgments/utils.py index ee93f8c1f..fb78948f4 100644 --- a/judgments/utils.py +++ b/judgments/utils.py @@ -102,8 +102,8 @@ def set_metadata(old_uri, new_uri): def copy_assets(old_uri, new_uri): client = create_s3_client() bucket = env("PRIVATE_ASSET_BUCKET") - old_uri = old_uri.lstrip("/") - new_uri = new_uri.lstrip("/") + old_uri = uri_for_s3(old_uri) + new_uri = uri_for_s3(new_uri) response = client.list_objects(Bucket=bucket, Prefix=old_uri) @@ -141,3 +141,7 @@ def create_s3_client(): region_name=env("PRIVATE_ASSET_BUCKET_REGION", default=None), config=botocore.client.Config(signature_version="s3v4"), ) + + +def uri_for_s3(uri: str): + return uri.lstrip("/") diff --git a/judgments/views.py b/judgments/views.py index 02154f5dd..b98fd529d 100644 --- a/judgments/views.py +++ b/judgments/views.py @@ -29,6 +29,7 @@ NeutralCitationToUriError, get_judgment_root, update_judgment_uri, + uri_for_s3, ) env = environ.Env() @@ -60,8 +61,8 @@ def get_metadata(self, uri: str, judgment: ET.Element) -> dict: xml_tools.get_neutral_citation_name_value(judgment) or "" ) meta["judgment_date"] = xml_tools.get_judgment_date_value(judgment) or "" - meta["docx_url"] = generate_docx_url(uri) - meta["pdf_url"] = generate_pdf_url(uri) + meta["docx_url"] = generate_docx_url(uri_for_s3(uri)) + meta["pdf_url"] = generate_pdf_url(uri_for_s3(uri)) meta["previous_versions"] = self.get_versions(uri) meta["consignment_reference"] = api_client.get_property( uri, "transfer-consignment-reference" @@ -112,9 +113,9 @@ def post(self, request, *args, **kwargs): api_client.set_anonymised(judgment_uri, anonymised) if published: - publish_documents(judgment_uri) + publish_documents(uri_for_s3(judgment_uri)) else: - unpublish_documents(judgment_uri) + unpublish_documents(uri_for_s3(judgment_uri)) # Set name new_name = request.POST["metadata_name"] @@ -180,8 +181,8 @@ def detail(request): judgment = multipart_data.parts[0].text context["judgment"] = judgment context["page_title"] = metadata_name - context["docx_url"] = generate_docx_url(judgment_uri) - context["pdf_url"] = generate_pdf_url(judgment_uri) + context["docx_url"] = generate_docx_url(uri_for_s3(judgment_uri)) + context["pdf_url"] = generate_pdf_url(uri_for_s3(judgment_uri)) if version_uri: try: @@ -374,7 +375,6 @@ def publish_documents(uri: str) -> None: public_bucket = env("PUBLIC_ASSET_BUCKET") private_bucket = env("PRIVATE_ASSET_BUCKET") - uri = uri.lstrip('/') response = client.list_objects(Bucket=private_bucket, Prefix=uri) for result in response.get("Contents", []): @@ -392,7 +392,6 @@ def publish_documents(uri: str) -> None: def unpublish_documents(uri: str) -> None: - uri = uri.lstrip('/') delete_from_bucket(uri, env("PUBLIC_ASSET_BUCKET")) @@ -434,7 +433,6 @@ def generate_docx_url(uri: str): return "" client = create_s3_client() - uri = uri.lstrip("/") key = f'{uri}/{uri.replace("/", "_")}.docx' @@ -449,7 +447,6 @@ def generate_pdf_url(uri: str): return "" client = create_s3_client() - uri = uri.lstrip("/") key = f'{uri}/{uri.replace("/", "_")}.pdf'