Merge branch 'develop' into get-api-data-without-errors

nationalarchives · Apr 16, 2024 · 34dd3a0 · 34dd3a0
2 parents 792fe32 + 645b8fc
commit 34dd3a0
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 16 deletions.
diff --git a/.github/workflows/remove-untagged.yml b/.github/workflows/remove-untagged.yml
@@ -2,8 +2,8 @@ name: Remove untagged container images
 
 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 3 * * 1"
+  # schedule:
+  #   - cron: "0 3 * * 1"
 
 jobs:
   remove-untagged:
@@ -23,7 +23,7 @@ jobs:
                 if (version.metadata.container.tags.length == 0 && version.name !== "latest" && version.name !== "preview") {
                     console.log("Delete " + version.id)
 
-                    const deleteResponse = await github.request("DELETE /orgs/${{ github.repository_owner }}/packages/container/${{ vars.DOCKER_IMAGE_NAME }}/versions/" + version.id, { });
-                    console.log("status " + deleteResponse.status)
+                    // const deleteResponse = await github.request("DELETE /orgs/${{ github.repository_owner }}/packages/container/${{ vars.DOCKER_IMAGE_NAME }}/versions/" + version.id, { });
+                    // console.log("status " + deleteResponse.status)
                 }
             }
diff --git a/.platform/services.yaml b/.platform/services.yaml
@@ -1,6 +1,6 @@
 db:
     type: postgresql:12
-    disk: 256
+    disk: 512
 
 redis:
     type: redis:6.0
diff --git a/config/settings/base.py b/config/settings/base.py
@@ -290,6 +290,10 @@
     "WAGTAILADMIN_BASE_URL", "https://nationalarchives.gov.uk"
 )
 
+CSRF_TRUSTED_ORIGINS = [
+    os.getenv("CSRF_TRUSTED_ORIGIN", "https://nationalarchives.gov.uk")
+]
+
 # For search results within Wagtail itself
 WAGTAILSEARCH_BACKENDS = {
     "default": {

diff --git a/etna/ciim/tests/test_utils.py b/etna/ciim/tests/test_utils.py
@@ -10,6 +10,7 @@
     find_all,
     format_description_markup,
     pluck,
+    strip_html,
 )
 
 
@@ -370,3 +371,45 @@ def test_index_is_zero_for_non_int_sort_key(self):
         index = convert_sort_key_to_index(sort)
 
         self.assertEqual(index, 0)
+
+
+class TestStripHtml(SimpleTestCase):
+
+    def test_ensure_spaces_preserve_marks(self):
+
+        test_data = (
+            (
+                "test for span tag",
+                "This is a<span>test example</span>",
+                "This is a test example",
+            ),
+            (
+                "test for p tag",
+                "This is a<p>test example</p>",
+                "This is a test example",
+            ),
+            (
+                "test for unknown tag",
+                "This is a<unknown>test example</unknown>",
+                "This is atest example",
+            ),
+            (
+                "D7376859",
+                '<span class="wrapper"><span altrender="doctype" class="emph"></span><span class="persname"><span altrender="surname" class="emph">Patman</span><span altrender="forenames" class="emph">Clifford Douglas</span></span><span altrender="rank" class="emph">Armament Quarter Master Serjeant</span><span altrender="regno" class="emph">1865334</span><span class="corpname">Royal Army Ordnance Corps, 8 Hussars now Royal Electrical and Mechanical Engineers</span><span class="geogname">Escape and Evasion</span><span altrender="award" class="emph">Mentions in Despatches</span></span>',
+                "Patman Clifford Douglas Armament Quarter Master Serjeant 1865334 Royal Army Ordnance Corps, 8 Hussars now Royal Electrical and Mechanical Engineers Escape and Evasion Mentions in Despatches",
+            ),
+        )
+
+        for label, value, expected in test_data:
+            with self.subTest(label):
+                result = strip_html(value, preserve_marks=True, ensure_spaces=True)
+                self.assertEqual(result, expected)
+
+    def test_allow_tags(self):
+        value = """<a href="http://test.com">this is a test</a>"""
+        expected = (
+            """<a href="http://test.com" rel="noopener noreferrer">this is a test</a>"""
+        )
+        allow_tags = {"a", "br", "p"}
+        result = strip_html(value, allow_tags=allow_tags)
+        self.assertEqual(result, expected)
diff --git a/etna/ciim/utils.py b/etna/ciim/utils.py
@@ -257,20 +257,39 @@ def format_link(link_html: str) -> Dict[str, str]:
     return {"href": href, "id": id, "text": document.text()}
 
 
-def strip_html(value: str, *, preserve_marks, ensure_spaces):
+def strip_html(
+    value: str,
+    *,
+    preserve_marks: bool = False,
+    ensure_spaces: bool = False,
+    allow_tags: Optional[set] = None,
+) -> str:
     """
     Temporary HTML sanitiser to remove unwanted tags from data.
-    K-int will eventually sanitise this at API level.
-    preserve_marks=True will keep <mark> tags in the output, otherwise they are removed.
-
-    Replacing <span> and <p> tags is necessary to prevent "bunched" data,
-    "This is a<span>test</span>example" will return as "This is atestexample"
-    without the placement of the space.
+    TODO: this will eventually be sanitised at API level.
+
+    value:
+        the value to be sanitised
+    preserve_marks:
+        keep <mark> tags in the output
+    ensure_spaces:
+        allow <span> and <p> tags, then replace them with whitespace
+    allow_tags:
+        sets the tags that are allowed
     """
     clean_tags = {"span", "p"} if ensure_spaces else set()
-    clean_html = nh3.clean(
-        value, tags={*clean_tags, "mark"} if preserve_marks else clean_tags
-    )
+
+    if allow_tags is None:
+        allow_tags = set()
+
+    tags = set()
+    if preserve_marks:
+        tags.add("mark")
+    tags.update(clean_tags)
+    tags.update(allow_tags)
+
+    clean_html = nh3.clean(value, tags=tags)
+
     for tag in clean_tags:
         opening_regex = rf"<{tag}[^>]*>"
         closing_regex = rf"</{tag}>"

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "Etna"
-version = "24.04.03.31"
+version = "24.04.16.32"
 description = ""
 authors = ["James Biggs <[email protected]>"]