From 4e115ee7db17acbbf4bf642e1d7d55ff63b34e4e Mon Sep 17 00:00:00 2001
From: Nick Moreton <nick.moreton@torchbox.com>
Date: Fri, 17 Dec 2021 13:57:21 +0000
Subject: [PATCH 1/2] refactor block builder fetch_url

I've also made some adjustments to remove configs that are written as functions.

save
---
 .../block_builder_defaults.py                 | 393 +++++++++---------
 1 file changed, 201 insertions(+), 192 deletions(-)
diff --git a/wagtail_wordpress_import/block_builder_defaults.py b/wagtail_wordpress_import/block_builder_defaults.py
index b4157700..47b10049 100644
--- a/wagtail_wordpress_import/block_builder_defaults.py
+++ b/wagtail_wordpress_import/block_builder_defaults.py
@@ -10,52 +10,6 @@
 ImportedImage = get_image_model()
 ImportedDocument = get_document_model()
 
-"""StreamField blocks"""
-
-
-def build_block_quote_block(tag):
-    block_dict = {
-        "type": "block_quote",
-        "value": {"quote": tag.text.strip(), "attribution": tag.cite},
-    }
-    return block_dict
-
-
-def build_form_block(tag):
-    block_dict = {"type": "raw_html", "value": str(tag)}
-    return block_dict
-
-
-def build_heading_block(tag):
-    block_dict = {
-        "type": "heading",
-        "value": {"importance": tag.name, "text": tag.text},
-    }
-    return block_dict
-
-
-def build_iframe_block(tag):
-    block_dict = {
-        "type": "raw_html",
-        "value": '<div class="core-custom"><div class="responsive-iframe">{}</div></div>'.format(
-            str(tag)
-        ),
-    }
-    return block_dict
-
-
-def build_image_block(tag):
-    def get_image_id(src):
-        return 1
-
-    block_dict = {"type": "image", "value": get_image_id(tag.src)}
-    return block_dict
-
-
-def build_table_block(tag):
-    block_dict = {"type": "raw_html", "value": str(tag)}
-    return block_dict
-
 
 def conf_html_tags_to_blocks():
     return getattr(
@@ -63,11 +17,6 @@ def conf_html_tags_to_blocks():
         "WAGTAIL_WORDPRESS_IMPORTER_CONVERT_HTML_TAGS_TO_BLOCKS",
         {
             "h1": "wagtail_wordpress_import.block_builder_defaults.build_heading_block",
-            # "h2": "wagtail_wordpress_import.block_builder_defaults.build_heading_block",
-            # "h3": "wagtail_wordpress_import.block_builder_defaults.build_heading_block",
-            # "h4": "wagtail_wordpress_import.block_builder_defaults.build_heading_block",
-            # "h5": "wagtail_wordpress_import.block_builder_defaults.build_heading_block",
-            # "h6": "wagtail_wordpress_import.block_builder_defaults.build_heading_block",
             "table": "wagtail_wordpress_import.block_builder_defaults.build_table_block",
             "iframe": "wagtail_wordpress_import.block_builder_defaults.build_iframe_block",
             "form": "wagtail_wordpress_import.block_builder_defaults.build_form_block",
@@ -77,74 +26,45 @@ def conf_html_tags_to_blocks():
     )
 
 
-"""Fall back StreamField block"""
-
-
-def conf_fallback_block():
-    return getattr(
-        settings,
-        "WAGTAIL_WORDPRESS_IMPORTER_FALLBACK_BLOCK",
-        "wagtail_wordpress_import.block_builder_defaults.build_richtext_block_content",
-    )
-
-
-def build_richtext_block_content(html, blocks):
-    """
-    image_linker is called to link up and retrive the remote image
-    document_linker is called to link up and retrive the remote documents
-    filters are called to replace inline shortcodes
-    """
-    html = image_linker(html)
-    html = document_linker(html)
-    for inline_shortcode_handler in getattr(
-        settings, "WAGTAIL_WORDPRESS_IMPORTER_INLINE_SHORTCODE_HANDLERS", []
-    ):
-        function = import_string(inline_shortcode_handler).construct_html_tag
-        html = function(html)
-    blocks.append({"type": "rich_text", "value": html})
-    html = ""
-    return html
-
-
-"""Rich Text Functions"""
-
-
-def conf_valid_image_content_types():
-    return getattr(
-        settings,
-        "WAGTAIL_WORDPRESS_IMPORTER_VALID_IMAGE_CONTENT_TYPES",
-        [
-            "image/gif",
-            "image/jpeg",
-            "image/png",
-            "image/webp",
-            "text/html",
-        ],
-    )
-
-
-def conf_valid_document_file_types():
-    return getattr(
-        settings,
-        "",
-        [
-            "pdf",
-            "ppt",
-            "docx",
-        ],
-    )
-
-
-def conf_valid_document_content_types():
-    return getattr(
-        settings,
-        "",
-        [
-            "application/pdf",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        ],
-    )
+def fetch_url(src, allow_redirects=True):
+    """general purpose url fetcher with ability to pass in own config"""
+    try:
+        response = requests.get(
+            src,
+            **getattr(
+                settings,
+                "WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS",
+                {
+                    "headers": {"User-Agent": "WagtailWordpressImporter"},
+                    "timeout": 5,
+                    "stream": True,
+                    "allow_redirects": allow_redirects,
+                },
+            ),
+        )
+        status = True if response.status_code == 200 else False
+        return response, status, response.headers.get("content-type")
+    except requests.ConnectionError:
+        print(f"ConnectionError: {src}")
+        return None, False, None
+    except requests.HTTPError:
+        print(f"HTTPError: {src}")
+        return None, False, None
+    except requests.RequestException:
+        print(f"RequestException: {src}")
+        return None, False, None
+    except requests.ReadTimeout:
+        print(f"ReadTimeout: {src}")
+        return None, False, None
+    except requests.Timeout:
+        print(f"Timeout: {src}")
+        return None, False, None
+    except requests.ConnectTimeout:
+        print(f"ConnectTimeout: {src}")
+        return None, False, None
+
+
+## FUNCTIONS FOR IMAGES
 
 
 def image_linker(html):
@@ -183,50 +103,24 @@ def image_linker(html):
     return str(soup)
 
 
-def get_image_alt(img_tag):
-    return img_tag.attrs["alt"] if "alt" in img_tag.attrs else None
-
-
-def get_image_file_name(src):
-    return src.split("/")[-1] if src else None  # need the last part
-
-
-def get_document_file_name(src):
-    return src.split("/")[-1] if src else None  # need the last part
-
-
-def image_exists(name):
-    try:
-        return ImportedImage.objects.get(title=name)
-    except ImportedImage.DoesNotExist:
-        pass
-
-
-def document_exists(name):
-    try:
-        return ImportedDocument.objects.get(title=name)
-    except ImportedDocument.DoesNotExist:
-        pass
-
-
-def conf_get_requests_settings():
-    return getattr(
-        settings,
-        "WAGTAIL_WORDPRESS_IMPORTER_REQUESTS_SETTINGS",
-        {
-            "headers": {"User-Agent": "WagtailWordpressImporter"},
-            "timeout": 5,
-            "stream": False,
-        },
-    )
-
-
 def get_or_save_image(src):
     image_file_name = get_image_file_name(src)
     existing_image = image_exists(image_file_name)
     if not existing_image:
         response, valid, type = fetch_url(src)
-        if valid and (type in conf_valid_image_content_types()):
+        if valid and (
+            type
+            in getattr(
+                settings,
+                "WAGTAIL_WORDPRESS_IMPORTER_VALID_IMAGE_CONTENT_TYPES",
+                [
+                    "image/gif",
+                    "image/jpeg",
+                    "image/png",
+                    "image/webp",
+                ],
+            )
+        ):
             temp_image = NamedTemporaryFile(delete=True)
             temp_image.name = image_file_name
             temp_image.write(response.content)
@@ -242,38 +136,7 @@ def get_or_save_image(src):
     return existing_image
 
 
-def fetch_url(src, r=None, status=False, content_type=None):
-    """general purpose url fetcher with ability to pass in own config"""
-    try:
-        response = requests.get(src, **conf_get_requests_settings())
-        status = True if response.status_code == 200 else False
-        content_type = (
-            response.headers["content-type"].lower()
-            if response.headers.get("content-type")
-            else ""
-        )
-        return response, status, content_type
-    except Exception as e:
-        raise requests.ConnectionError(e)
-
-
-def get_absolute_src(src, domain_prefix=None):
-    src = src.lstrip("/")
-    if not src.startswith("http") and domain_prefix:
-        return domain_prefix + "/" + src
-    return src
-
-
-def get_alignment_class(image):
-    alignment = "fullwidth"
-
-    if "class" in image.attrs:
-        if "align-left" in image.attrs["class"]:
-            alignment = "left"
-        elif "align-right" in image.attrs["class"]:
-            alignment = "right"
-
-    return alignment
+## FUNCTIONS FOR DOCUMENTS
 
 
 def document_linker(html):
@@ -314,12 +177,31 @@ def document_linker(html):
 
 def get_or_save_document(href):
     file_type = href.split(".")[-1]
-    if file_type in conf_valid_document_file_types():
+    if file_type in getattr(
+        settings,
+        "",
+        [
+            "pdf",
+            "ppt",
+            "docx",
+        ],
+    ):
         document_file_name = get_document_file_name(href)
         existing_document = document_exists(document_file_name)
         if not existing_document:
             response, valid, type = fetch_url(href)
-            if valid and (type in conf_valid_document_content_types()):
+            if valid and (
+                type
+                in getattr(
+                    settings,
+                    "",
+                    [
+                        "application/pdf",
+                        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    ],
+                )
+            ):
                 temp_document = NamedTemporaryFile(delete=True)
                 temp_document.name = document_file_name
                 temp_document.write(response.content)
@@ -333,3 +215,130 @@ def get_or_save_document(href):
             else:
                 print(f"RECEIVED INVALID DOCUMENT RESPONSE: {href}")
         return existing_document
+
+
+## STREAMFIELD BLOCKS
+
+
+def build_block_quote_block(tag):
+    block_dict = {
+        "type": "block_quote",
+        "value": {"quote": tag.text.strip(), "attribution": tag.cite},
+    }
+    return block_dict
+
+
+def build_form_block(tag):
+    block_dict = {"type": "raw_html", "value": str(tag)}
+    return block_dict
+
+
+def build_heading_block(tag):
+    block_dict = {
+        "type": "heading",
+        "value": {"importance": tag.name, "text": tag.text},
+    }
+    return block_dict
+
+
+def build_iframe_block(tag):
+    block_dict = {
+        "type": "raw_html",
+        "value": '<div class="core-custom"><div class="responsive-iframe">{}</div></div>'.format(
+            str(tag)
+        ),
+    }
+    return block_dict
+
+
+def build_image_block(tag):
+    def get_image_id(src):
+        return 1
+
+    block_dict = {"type": "image", "value": get_image_id(tag.src)}
+    return block_dict
+
+
+def build_table_block(tag):
+    block_dict = {"type": "raw_html", "value": str(tag)}
+    return block_dict
+
+
+## FALLBACK STREAMFIELD BLOCKS
+
+
+def conf_fallback_block():
+    return getattr(
+        settings,
+        "WAGTAIL_WORDPRESS_IMPORTER_FALLBACK_BLOCK",
+        "wagtail_wordpress_import.block_builder_defaults.build_richtext_block_content",
+    )
+
+
+def build_richtext_block_content(html, blocks):
+    """
+    image_linker is called to link up and retrive the remote image
+    document_linker is called to link up and retrive the remote documents
+    filters are called to replace inline shortcodes
+    """
+    html = image_linker(html)
+    html = document_linker(html)
+    for inline_shortcode_handler in getattr(
+        settings, "WAGTAIL_WORDPRESS_IMPORTER_INLINE_SHORTCODE_HANDLERS", []
+    ):
+        function = import_string(inline_shortcode_handler).construct_html_tag
+        html = function(html)
+    blocks.append({"type": "rich_text", "value": html})
+    html = ""
+    return html
+
+
+## RICH TEXT FUNCTIONS
+
+
+def get_image_alt(img_tag):
+    return img_tag.attrs["alt"] if "alt" in img_tag.attrs else None
+
+
+def get_image_file_name(src):
+    return src.split("/")[-1] if src else None  # need the last part
+
+
+def get_document_file_name(src):
+    return src.split("/")[-1] if src else None  # need the last part
+
+
+def image_exists(name):
+    try:
+        return ImportedImage.objects.get(title=name)
+    except ImportedImage.DoesNotExist:
+        pass
+
+
+def document_exists(name):
+    try:
+        return ImportedDocument.objects.get(title=name)
+    except ImportedDocument.DoesNotExist:
+        pass
+
+
+## GENERAL FUNCTIONS
+
+
+def get_absolute_src(src, domain_prefix=None):
+    src = src.lstrip("/")
+    if not src.startswith("http") and domain_prefix:
+        return domain_prefix + "/" + src
+    return src
+
+
+def get_alignment_class(image):
+    alignment = "fullwidth"
+
+    if "class" in image.attrs:
+        if "align-left" in image.attrs["class"]:
+            alignment = "left"
+        elif "align-right" in image.attrs["class"]:
+            alignment = "right"
+
+    return alignment

From 2b9db1fb5514439886a5b29c985f0be96e383f04 Mon Sep 17 00:00:00 2001
From: Nick Moreton <nick.moreton@torchbox.com>
Date: Fri, 17 Dec 2021 15:46:53 +0000
Subject: [PATCH 2/2] add tests for requests errors

---
 .../test/tests/test_block_builder.py          | 84 ++++++++++++++++---
 1 file changed, 73 insertions(+), 11 deletions(-)

diff --git a/wagtail_wordpress_import/test/tests/test_block_builder.py b/wagtail_wordpress_import/test/tests/test_block_builder.py
index fa813654..27f12c1d 100644
--- a/wagtail_wordpress_import/test/tests/test_block_builder.py
+++ b/wagtail_wordpress_import/test/tests/test_block_builder.py
@@ -364,30 +364,92 @@ def test_get_alignment_class_not_present(self):
 
 
 class TestBlockBuilderFetchUrlRequests(TestCase):
+    def setUp(self):
+        self.image_url = "http://example.com/no-image.jpg"
+        self.page_url = "http://example.com/no-page.html"
+
     @responses.activate
     def test_fetch_url_success(self):
-        url_to_fetch = "https://www.example.com/images/bruno-4-runner.jpg"
         responses.add(
             responses.GET,
-            "https://www.example.com/images/bruno-4-runner.jpg",
+            self.image_url,
             body=mock_image().read(),
             status=200,
-            content_type="image/jpeg",
+            content_type="image/jpegs",
         )
-        response, status, content_type = fetch_url(url_to_fetch)
+        response, status, content_type = fetch_url(self.image_url)
         self.assertTrue(response.content.startswith(b"\xff\xd8"))
         self.assertTrue(status)
         self.assertTrue("image/jpeg" in content_type)
 
     @responses.activate
     def test_fetch_url_raises_connection_error(self):
-        # test a valid url with an domain/image that doesn't exist
-        url_to_fetch = "https://www.example.com/images/connection_error.jpg"
         responses.add(
             responses.GET,
-            "https://www.example.com/images/connection_error.jpg",
-            body=Exception("Connection error"),
+            self.page_url,
+            body=requests.ConnectionError(),
+        )
+        self.assertTrue(
+            fetch_url(self.page_url),
+            (None, True, None),
+        )
+
+    @responses.activate
+    def test_fetch_url_raises_http_error(self):
+        responses.add(
+            responses.GET,
+            self.page_url,
+            body=requests.HTTPError(),
+        )
+        self.assertTrue(
+            fetch_url(self.page_url),
+            (None, True, None),
+        )
+
+    @responses.activate
+    def test_fetch_url_raises_request_exception(self):
+        responses.add(
+            responses.GET,
+            self.page_url,
+            body=requests.RequestException(),
+        )
+        self.assertTrue(
+            fetch_url(self.page_url),
+            (None, True, None),
+        )
+
+    @responses.activate
+    def test_fetch_url_raises_read_timeout(self):
+        responses.add(
+            responses.GET,
+            self.page_url,
+            body=requests.ReadTimeout(),
+        )
+        self.assertTrue(
+            fetch_url(self.page_url),
+            (None, True, None),
+        )
+
+    @responses.activate
+    def test_fetch_url_raises_timeout(self):
+        responses.add(
+            responses.GET,
+            self.page_url,
+            body=requests.Timeout(),
+        )
+        self.assertTrue(
+            fetch_url(self.page_url),
+            (None, True, None),
+        )
+
+    @responses.activate
+    def test_fetch_url_raises_connect_timeout(self):
+        responses.add(
+            responses.GET,
+            self.page_url,
+            body=requests.ConnectTimeout(),
+        )
+        self.assertTrue(
+            fetch_url(self.page_url),
+            (None, True, None),
         )
-        with self.assertRaises(requests.ConnectionError) as ctx:
-            fetch_url(url_to_fetch)
-        self.assertTrue("Connection error" in str(ctx.exception))