From 4e115ee7db17acbbf4bf642e1d7d55ff63b34e4e Mon Sep 17 00:00:00 2001 From: Nick Moreton Date: Fri, 17 Dec 2021 13:57:21 +0000 Subject: [PATCH 1/2] refactor block builder fetch_url I've also made some adjustments to remove configs that are written as functions. save --- .../block_builder_defaults.py | 393 +++++++++--------- 1 file changed, 201 insertions(+), 192 deletions(-) diff --git a/wagtail_wordpress_import/block_builder_defaults.py b/wagtail_wordpress_import/block_builder_defaults.py index b4157700..47b10049 100644 --- a/wagtail_wordpress_import/block_builder_defaults.py +++ b/wagtail_wordpress_import/block_builder_defaults.py @@ -10,52 +10,6 @@ ImportedImage = get_image_model() ImportedDocument = get_document_model() -"""StreamField blocks""" - - -def build_block_quote_block(tag): - block_dict = { - "type": "block_quote", - "value": {"quote": tag.text.strip(), "attribution": tag.cite}, - } - return block_dict - - -def build_form_block(tag): - block_dict = {"type": "raw_html", "value": str(tag)} - return block_dict - - -def build_heading_block(tag): - block_dict = { - "type": "heading", - "value": {"importance": tag.name, "text": tag.text}, - } - return block_dict - - -def build_iframe_block(tag): - block_dict = { - "type": "raw_html", - "value": '
{}
'.format( - str(tag) - ), - } - return block_dict - - -def build_image_block(tag): - def get_image_id(src): - return 1 - - block_dict = {"type": "image", "value": get_image_id(tag.src)} - return block_dict - - -def build_table_block(tag): - block_dict = {"type": "raw_html", "value": str(tag)} - return block_dict - def conf_html_tags_to_blocks(): return getattr( @@ -63,11 +17,6 @@ def conf_html_tags_to_blocks(): "WAGTAIL_WORDPRESS_IMPORTER_CONVERT_HTML_TAGS_TO_BLOCKS", { "h1": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", - # "h2": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", - # "h3": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", - # "h4": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", - # "h5": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", - # "h6": "wagtail_wordpress_import.block_builder_defaults.build_heading_block", "table": "wagtail_wordpress_import.block_builder_defaults.build_table_block", "iframe": "wagtail_wordpress_import.block_builder_defaults.build_iframe_block", "form": "wagtail_wordpress_import.block_builder_defaults.build_form_block", @@ -77,74 +26,45 @@ def conf_html_tags_to_blocks(): ) -"""Fall back StreamField block""" - - -def conf_fallback_block(): - return getattr( - settings, - "WAGTAIL_WORDPRESS_IMPORTER_FALLBACK_BLOCK", - "wagtail_wordpress_import.block_builder_defaults.build_richtext_block_content", - ) - - -def build_richtext_block_content(html, blocks): - """ - image_linker is called to link up and retrive the remote image - document_linker is called to link up and retrive the remote documents - filters are called to replace inline shortcodes - """ - html = image_linker(html) - html = document_linker(html) - for inline_shortcode_handler in getattr( - settings, "WAGTAIL_WORDPRESS_IMPORTER_INLINE_SHORTCODE_HANDLERS", [] - ): - function = import_string(inline_shortcode_handler).construct_html_tag - html = function(html) - blocks.append({"type": "rich_text", "value": html}) - html = "" - return html - - -"""Rich Text Functions""" - - -def conf_valid_image_content_types(): - return getattr( - settings, - "WAGTAIL_WORDPRESS_IMPORTER_VALID_IMAGE_CONTENT_TYPES", - [ - "image/gif", - "image/jpeg", - "image/png", - "image/webp", - "text/html", - ], - ) - - -def conf_valid_document_file_types(): - return getattr( - settings, - "", - [ - "pdf", - "ppt", - "docx", - ], - ) - - -def conf_valid_document_content_types(): - return getattr( - settings, - "", - [ - "application/pdf", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ], - ) +def fetch_url(src, allow_redirects=True): + """general purpose url fetcher with ability to pass in own config""" + try: + response = requests.get( + src, + **getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS", + { + "headers": {"User-Agent": "WagtailWordpressImporter"}, + "timeout": 5, + "stream": True, + "allow_redirects": allow_redirects, + }, + ), + ) + status = True if response.status_code == 200 else False + return response, status, response.headers.get("content-type") + except requests.ConnectionError: + print(f"ConnectionError: {src}") + return None, False, None + except requests.HTTPError: + print(f"HTTPError: {src}") + return None, False, None + except requests.RequestException: + print(f"RequestException: {src}") + return None, False, None + except requests.ReadTimeout: + print(f"ReadTimeout: {src}") + return None, False, None + except requests.Timeout: + print(f"Timeout: {src}") + return None, False, None + except requests.ConnectTimeout: + print(f"ConnectTimeout: {src}") + return None, False, None + + +## FUNCTIONS FOR IMAGES def image_linker(html): @@ -183,50 +103,24 @@ def image_linker(html): return str(soup) -def get_image_alt(img_tag): - return img_tag.attrs["alt"] if "alt" in img_tag.attrs else None - - -def get_image_file_name(src): - return src.split("/")[-1] if src else None # need the last part - - -def get_document_file_name(src): - return src.split("/")[-1] if src else None # need the last part - - -def image_exists(name): - try: - return ImportedImage.objects.get(title=name) - except ImportedImage.DoesNotExist: - pass - - -def document_exists(name): - try: - return ImportedDocument.objects.get(title=name) - except ImportedDocument.DoesNotExist: - pass - - -def conf_get_requests_settings(): - return getattr( - settings, - "WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS", - { - "headers": {"User-Agent": "WagtailWordpressImporter"}, - "timeout": 5, - "stream": False, - }, - ) - - def get_or_save_image(src): image_file_name = get_image_file_name(src) existing_image = image_exists(image_file_name) if not existing_image: response, valid, type = fetch_url(src) - if valid and (type in conf_valid_image_content_types()): + if valid and ( + type + in getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_VALID_IMAGE_CONTENT_TYPES", + [ + "image/gif", + "image/jpeg", + "image/png", + "image/webp", + ], + ) + ): temp_image = NamedTemporaryFile(delete=True) temp_image.name = image_file_name temp_image.write(response.content) @@ -242,38 +136,7 @@ def get_or_save_image(src): return existing_image -def fetch_url(src, r=None, status=False, content_type=None): - """general purpose url fetcher with ability to pass in own config""" - try: - response = requests.get(src, **conf_get_requests_settings()) - status = True if response.status_code == 200 else False - content_type = ( - response.headers["content-type"].lower() - if response.headers.get("content-type") - else "" - ) - return response, status, content_type - except Exception as e: - raise requests.ConnectionError(e) - - -def get_absolute_src(src, domain_prefix=None): - src = src.lstrip("/") - if not src.startswith("http") and domain_prefix: - return domain_prefix + "/" + src - return src - - -def get_alignment_class(image): - alignment = "fullwidth" - - if "class" in image.attrs: - if "align-left" in image.attrs["class"]: - alignment = "left" - elif "align-right" in image.attrs["class"]: - alignment = "right" - - return alignment +## FUNCTIONS FOR DOCUMENTS def document_linker(html): @@ -314,12 +177,31 @@ def document_linker(html): def get_or_save_document(href): file_type = href.split(".")[-1] - if file_type in conf_valid_document_file_types(): + if file_type in getattr( + settings, + "", + [ + "pdf", + "ppt", + "docx", + ], + ): document_file_name = get_document_file_name(href) existing_document = document_exists(document_file_name) if not existing_document: response, valid, type = fetch_url(href) - if valid and (type in conf_valid_document_content_types()): + if valid and ( + type + in getattr( + settings, + "", + [ + "application/pdf", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ], + ) + ): temp_document = NamedTemporaryFile(delete=True) temp_document.name = document_file_name temp_document.write(response.content) @@ -333,3 +215,130 @@ def get_or_save_document(href): else: print(f"RECEIVED INVALID DOCUMENT RESPONSE: {href}") return existing_document + + +## STREAMFIELD BLOCKS + + +def build_block_quote_block(tag): + block_dict = { + "type": "block_quote", + "value": {"quote": tag.text.strip(), "attribution": tag.cite}, + } + return block_dict + + +def build_form_block(tag): + block_dict = {"type": "raw_html", "value": str(tag)} + return block_dict + + +def build_heading_block(tag): + block_dict = { + "type": "heading", + "value": {"importance": tag.name, "text": tag.text}, + } + return block_dict + + +def build_iframe_block(tag): + block_dict = { + "type": "raw_html", + "value": '
{}
'.format( + str(tag) + ), + } + return block_dict + + +def build_image_block(tag): + def get_image_id(src): + return 1 + + block_dict = {"type": "image", "value": get_image_id(tag.src)} + return block_dict + + +def build_table_block(tag): + block_dict = {"type": "raw_html", "value": str(tag)} + return block_dict + + +## FALLBACK STREAMFIELD BLOCKS + + +def conf_fallback_block(): + return getattr( + settings, + "WAGTAIL_WORDPRESS_IMPORTER_FALLBACK_BLOCK", + "wagtail_wordpress_import.block_builder_defaults.build_richtext_block_content", + ) + + +def build_richtext_block_content(html, blocks): + """ + image_linker is called to link up and retrive the remote image + document_linker is called to link up and retrive the remote documents + filters are called to replace inline shortcodes + """ + html = image_linker(html) + html = document_linker(html) + for inline_shortcode_handler in getattr( + settings, "WAGTAIL_WORDPRESS_IMPORTER_INLINE_SHORTCODE_HANDLERS", [] + ): + function = import_string(inline_shortcode_handler).construct_html_tag + html = function(html) + blocks.append({"type": "rich_text", "value": html}) + html = "" + return html + + +## RICH TEXT FUNCTIONS + + +def get_image_alt(img_tag): + return img_tag.attrs["alt"] if "alt" in img_tag.attrs else None + + +def get_image_file_name(src): + return src.split("/")[-1] if src else None # need the last part + + +def get_document_file_name(src): + return src.split("/")[-1] if src else None # need the last part + + +def image_exists(name): + try: + return ImportedImage.objects.get(title=name) + except ImportedImage.DoesNotExist: + pass + + +def document_exists(name): + try: + return ImportedDocument.objects.get(title=name) + except ImportedDocument.DoesNotExist: + pass + + +## GENERAL FUNCTIONS + + +def get_absolute_src(src, domain_prefix=None): + src = src.lstrip("/") + if not src.startswith("http") and domain_prefix: + return domain_prefix + "/" + src + return src + + +def get_alignment_class(image): + alignment = "fullwidth" + + if "class" in image.attrs: + if "align-left" in image.attrs["class"]: + alignment = "left" + elif "align-right" in image.attrs["class"]: + alignment = "right" + + return alignment From 2b9db1fb5514439886a5b29c985f0be96e383f04 Mon Sep 17 00:00:00 2001 From: Nick Moreton Date: Fri, 17 Dec 2021 15:46:53 +0000 Subject: [PATCH 2/2] add tests for requests errors --- .../test/tests/test_block_builder.py | 84 ++++++++++++++++--- 1 file changed, 73 insertions(+), 11 deletions(-) diff --git a/wagtail_wordpress_import/test/tests/test_block_builder.py b/wagtail_wordpress_import/test/tests/test_block_builder.py index fa813654..27f12c1d 100644 --- a/wagtail_wordpress_import/test/tests/test_block_builder.py +++ b/wagtail_wordpress_import/test/tests/test_block_builder.py @@ -364,30 +364,92 @@ def test_get_alignment_class_not_present(self): class TestBlockBuilderFetchUrlRequests(TestCase): + def setUp(self): + self.image_url = "http://example.com/no-image.jpg" + self.page_url = "http://example.com/no-page.html" + @responses.activate def test_fetch_url_success(self): - url_to_fetch = "https://www.example.com/images/bruno-4-runner.jpg" responses.add( responses.GET, - "https://www.example.com/images/bruno-4-runner.jpg", + self.image_url, body=mock_image().read(), status=200, - content_type="image/jpeg", + content_type="image/jpegs", ) - response, status, content_type = fetch_url(url_to_fetch) + response, status, content_type = fetch_url(self.image_url) self.assertTrue(response.content.startswith(b"\xff\xd8")) self.assertTrue(status) self.assertTrue("image/jpeg" in content_type) @responses.activate def test_fetch_url_raises_connection_error(self): - # test a valid url with an domain/image that doesn't exist - url_to_fetch = "https://www.example.com/images/connection_error.jpg" responses.add( responses.GET, - "https://www.example.com/images/connection_error.jpg", - body=Exception("Connection error"), + self.page_url, + body=requests.ConnectionError(), + ) + self.assertTrue( + fetch_url(self.page_url), + (None, True, None), + ) + + @responses.activate + def test_fetch_url_raises_http_error(self): + responses.add( + responses.GET, + self.page_url, + body=requests.HTTPError(), + ) + self.assertTrue( + fetch_url(self.page_url), + (None, True, None), + ) + + @responses.activate + def test_fetch_url_raises_request_exception(self): + responses.add( + responses.GET, + self.page_url, + body=requests.RequestException(), + ) + self.assertTrue( + fetch_url(self.page_url), + (None, True, None), + ) + + @responses.activate + def test_fetch_url_raises_read_timeout(self): + responses.add( + responses.GET, + self.page_url, + body=requests.ReadTimeout(), + ) + self.assertTrue( + fetch_url(self.page_url), + (None, True, None), + ) + + @responses.activate + def test_fetch_url_raises_timeout(self): + responses.add( + responses.GET, + self.page_url, + body=requests.Timeout(), + ) + self.assertTrue( + fetch_url(self.page_url), + (None, True, None), + ) + + @responses.activate + def test_fetch_url_raises_connect_timeout(self): + responses.add( + responses.GET, + self.page_url, + body=requests.ConnectTimeout(), + ) + self.assertTrue( + fetch_url(self.page_url), + (None, True, None), ) - with self.assertRaises(requests.ConnectionError) as ctx: - fetch_url(url_to_fetch) - self.assertTrue("Connection error" in str(ctx.exception))