Merge pull request #53 from alphagov/keywords

Update metadata and unstructured content extraction
alphagov · Oct 24, 2023 · d012157 · d012157
2 parents b28231d + 06a817f
commit d012157
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 91 deletions.
diff --git a/lib/document_sync_worker/document/publish.rb b/lib/document_sync_worker/document/publish.rb
@@ -1,25 +1,32 @@
 module DocumentSyncWorker
   module Document
     class Publish < Base
-      # All the possible keys in the message hash that can contain unstructured content that we want
-      # to index, represented as JsonPath path strings.
+      # All the possible keys in the message hash that can contain the primary unstructured document
+      # content that we want to index, represented as JsonPath path strings.
       INDEXABLE_CONTENT_VALUES_JSON_PATHS = %w[
-        $.details.body
-        $.details.contact_groups[*].title
         $.details.description
-        $.details.hidden_search_terms
         $.details.introduction
         $.details.introductory_paragraph
-        $.details.metadata.hidden_indexable_content
-        $.details.metadata.project_code
-        $.details.more_information
+        $.details.contact_groups[*].title
+        $.details.title
+        $.details.summary
+        $.details.body
         $.details.need_to_know
+        $.details.more_information
         $.details.parts[*]['title','body']
-        $.details.summary
-        $.details.title
       ].map { JsonPath.new(_1) }.freeze
       INDEXABLE_CONTENT_SEPARATOR = "\n".freeze
 
+      # All the possible keys in the message hash that can contain additional keywords or other text
+      # that should be searchable but doesn't form part of the primary document content, represented
+      # as JsonPath path strings.
+      ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS = %w[
+        $.details.hidden_search_terms
+        $.details.metadata.hidden_indexable_content
+        $.details.metadata.project_code
+      ].map { JsonPath.new(_1) }.freeze
+      ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR = "\n".freeze
+
       # Synchronize the document to the given repository (i.e. put it in the repository).
       def synchronize_to(repository)
         repository.put(content_id, metadata, content:, payload_version:)
@@ -29,13 +36,16 @@ def synchronize_to(repository)
       def metadata
         {
           content_id: document_hash["content_id"],
-          document_type: document_hash["document_type"],
           title: document_hash["title"],
           description: document_hash["description"],
+          additional_searchable_text:,
           link:,
           url:,
           public_timestamp:,
-          public_timestamp_int:,
+          document_type: document_hash["document_type"],
+          content_purpose_supergroup: document_hash["content_purpose_supergroup"],
+          part_of_taxonomy_tree: document_hash.dig("links", "taxons") || [],
+          locale: document_hash["locale"],
         }
       end
 
@@ -61,14 +71,17 @@ def url
         Plek.website_root + link
       end
 
-      def public_timestamp
-        document_hash["public_updated_at"]
+      def additional_searchable_text
+        values = ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS.map { _1.on(document_hash) }
+        values.flatten.join(ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR)
       end
 
-      def public_timestamp_int
-        return nil unless public_timestamp
+      def public_timestamp
+        return nil unless document_hash["public_updated_at"]
 
-        Time.parse(public_timestamp).to_i # rubocop:disable Rails/TimeZone (string contains TZ info)
+        # rubocop:disable Rails/TimeZone (string already contains timezone info which would be lost)
+        Time.parse(document_hash["public_updated_at"]).to_i
+        # rubocop:enable Rails/TimeZone
       end
     end
   end

diff --git a/spec/lib/document_sync_worker/document/publish_spec.rb b/spec/lib/document_sync_worker/document/publish_spec.rb
@@ -35,62 +35,19 @@
       let(:document_hash) do
         {
           "details" => {
-            "body" => "a",
-            "description" => "b",
-            "hidden_search_terms" => "c",
-            "introduction" => "d",
-            "introductory_paragraph" => "e",
-            "more_information" => "f",
+            "description" => "a",
+            "introduction" => "b",
+            "introductory_paragraph" => "c",
+            "title" => "d",
+            "summary" => "e",
+            "body" => "f",
             "need_to_know" => "g",
-            "summary" => "h",
-            "title" => "i",
+            "more_information" => "h",
           },
         }
       end
 
-      it { is_expected.to eq("a\nb\nc\nd\ne\nf\ng\nh\ni") }
-    end
-
-    describe "with hidden indexable content as an array" do
-      let(:document_hash) do
-        {
-          "details" => {
-            "metadata" => {
-              "hidden_indexable_content" => %w[x y z],
-            },
-          },
-        }
-      end
-
-      it { is_expected.to eq("x\ny\nz") }
-    end
-
-    describe "with hidden indexable content as a string" do
-      let(:document_hash) do
-        {
-          "details" => {
-            "metadata" => {
-              "hidden_indexable_content" => "x y z",
-            },
-          },
-        }
-      end
-
-      it { is_expected.to eq("x y z") }
-    end
-
-    describe "with a project code" do
-      let(:document_hash) do
-        {
-          "details" => {
-            "metadata" => {
-              "project_code" => "PRINCE2",
-            },
-          },
-        }
-      end
-
-      it { is_expected.to eq("PRINCE2") }
+      it { is_expected.to eq("a\nb\nc\nd\ne\nf\ng\nh") }
     end
 
     describe "with contact groups" do
@@ -145,14 +102,6 @@
       it { is_expected.to eq("000-000-000") }
     end
 
-    describe "document_type" do
-      subject(:extracted_document_type) { document.metadata[:document_type] }
-
-      let(:document_hash) { { "document_type" => "foo_bar" } }
-
-      it { is_expected.to eq("foo_bar") }
-    end
-
     describe "title" do
       subject(:extracted_title) { document.metadata[:title] }
 
@@ -169,6 +118,64 @@
       it { is_expected.to eq("Lorem ipsum dolor sit amet.") }
     end
 
+    describe "additional_searchable_text" do
+      subject(:additional_searchable_text) { document.metadata[:additional_searchable_text] }
+
+      describe "with hidden search terms" do
+        let(:document_hash) do
+          {
+            "details" => {
+              "hidden_search_terms" => "a b c",
+            },
+          }
+        end
+
+        it { is_expected.to eq("a b c") }
+      end
+
+      describe "with hidden indexable content as an array" do
+        let(:document_hash) do
+          {
+            "details" => {
+              "metadata" => {
+                "hidden_indexable_content" => %w[x y z],
+              },
+            },
+          }
+        end
+
+        it { is_expected.to eq("x\ny\nz") }
+      end
+
+      describe "with hidden indexable content as a string" do
+        let(:document_hash) do
+          {
+            "details" => {
+              "metadata" => {
+                "hidden_indexable_content" => "x y z",
+              },
+            },
+          }
+        end
+
+        it { is_expected.to eq("x y z") }
+      end
+
+      describe "with a project code" do
+        let(:document_hash) do
+          {
+            "details" => {
+              "metadata" => {
+                "project_code" => "PRINCE2",
+              },
+            },
+          }
+        end
+
+        it { is_expected.to eq("PRINCE2") }
+      end
+    end
+
     describe "link" do
       subject(:extracted_link) { document.metadata[:link] }
 
@@ -240,14 +247,6 @@
 
       let(:document_hash) { { "public_updated_at" => "2012-02-01T00:00:00Z" } }
 
-      it { is_expected.to eq("2012-02-01T00:00:00Z") }
-    end
-
-    describe "public_timestamp_int" do
-      subject(:extracted_public_timestamp_int) { document.metadata[:public_timestamp_int] }
-
-      let(:document_hash) { { "public_updated_at" => "2012-02-01T00:00:00Z" } }
-
       it { is_expected.to eq(1_328_054_400) }
 
       context "without a public_timestamp" do
@@ -256,6 +255,46 @@
         it { is_expected.to be_nil }
       end
     end
+
+    describe "document_type" do
+      subject(:extracted_document_type) { document.metadata[:document_type] }
+
+      let(:document_hash) { { "document_type" => "foo_bar" } }
+
+      it { is_expected.to eq("foo_bar") }
+    end
+
+    describe "content_purpose_supergroup" do
+      subject(:extracted_content_purpose_supergroup) { document.metadata[:content_purpose_supergroup] }
+
+      let(:document_hash) { { "content_purpose_supergroup" => "foo_bar" } }
+
+      it { is_expected.to eq("foo_bar") }
+    end
+
+    describe "part_of_taxonomy_tree" do
+      subject(:extracted_part_of_taxonomy_tree) { document.metadata[:part_of_taxonomy_tree] }
+
+      context "with a set of taxon links" do
+        let(:document_hash) { { "links" => { "taxons" => %w[0000 ffff] } } }
+
+        it { is_expected.to eq(%w[0000 ffff]) }
+      end
+
+      context "without taxon links" do
+        let(:document_hash) { { "links": {} } }
+
+        it { is_expected.to be_empty }
+      end
+    end
+
+    describe "locale" do
+      subject(:extracted_locale) { document.metadata[:locale] }
+
+      let(:document_hash) { { "locale" => "en" } }
+
+      it { is_expected.to eq("en") }
+    end
   end
 
   describe "#synchronize_to" do

diff --git a/spec/lib/document_sync_worker_integration_spec.rb b/spec/lib/document_sync_worker_integration_spec.rb
@@ -16,13 +16,18 @@
       result = repository.get("f75d26a3-25a4-4c31-beea-a77cada4ce12")
       expect(result[:metadata]).to eq(
         content_id: "f75d26a3-25a4-4c31-beea-a77cada4ce12",
-        document_type: "press_release",
         title: "Ebola medal for over 3000 heroes",
         description: "A new medal has been created to recognise the bravery and hard work of people who have helped to stop the spread of Ebola.",
+        additional_searchable_text: "",
         link: "/government/news/ebola-medal-for-over-3000-heroes",
         url: "http://www.dev.gov.uk/government/news/ebola-medal-for-over-3000-heroes",
-        public_timestamp: "2015-06-11T11:14:00Z",
-        public_timestamp_int: 1_434_021_240,
+        public_timestamp: 1_434_021_240,
+        document_type: "press_release",
+        content_purpose_supergroup: "news_and_communications",
+        part_of_taxonomy_tree: %w[
+          668cd623-c7a8-4159-9575-90caac36d4b4 c31256e8-f328-462b-993f-dce50b7892e9
+        ],
+        locale: "en",
       )
       expect(result[:content]).to start_with("<div class=\"govspeak\"><p>The government has")
       expect(result[:content]).to end_with("response to Ebola</a>.</p>\n</div>\n\n</div>")
@@ -38,15 +43,18 @@
       result = repository.get("526d5caf-221b-4c7b-9e74-b3e0b189fc8d")
       expect(result[:metadata]).to eq(
         content_id: "526d5caf-221b-4c7b-9e74-b3e0b189fc8d",
-        document_type: "external_content",
         title: "Brighton & Hove City Council",
         description: "Website of Brighton & Hove City Council",
+        additional_searchable_text: "Brighton & Hove City Council",
         link: "https://www.brighton-hove.gov.uk",
         url: "https://www.brighton-hove.gov.uk",
-        public_timestamp: "2023-09-28T14:56:19Z",
-        public_timestamp_int: 1_695_912_979,
+        public_timestamp: 1_695_912_979,
+        document_type: "external_content",
+        content_purpose_supergroup: "other",
+        part_of_taxonomy_tree: [],
+        locale: "en",
       )
-      expect(result[:content]).to eq("Brighton & Hove City Council")
+      expect(result[:content]).to be_blank
     end
   end