Skip to content

Commit

Permalink
Merge pull request #53 from alphagov/keywords
Browse files Browse the repository at this point in the history
Update metadata and unstructured content extraction
  • Loading branch information
csutter authored Oct 24, 2023
2 parents b28231d + 06a817f commit d012157
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 91 deletions.
47 changes: 30 additions & 17 deletions lib/document_sync_worker/document/publish.rb
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
module DocumentSyncWorker
module Document
class Publish < Base
# All the possible keys in the message hash that can contain unstructured content that we want
# to index, represented as JsonPath path strings.
# All the possible keys in the message hash that can contain the primary unstructured document
# content that we want to index, represented as JsonPath path strings.
INDEXABLE_CONTENT_VALUES_JSON_PATHS = %w[
$.details.body
$.details.contact_groups[*].title
$.details.description
$.details.hidden_search_terms
$.details.introduction
$.details.introductory_paragraph
$.details.metadata.hidden_indexable_content
$.details.metadata.project_code
$.details.more_information
$.details.contact_groups[*].title
$.details.title
$.details.summary
$.details.body
$.details.need_to_know
$.details.more_information
$.details.parts[*]['title','body']
$.details.summary
$.details.title
].map { JsonPath.new(_1) }.freeze
INDEXABLE_CONTENT_SEPARATOR = "\n".freeze

# JsonPath expressions locating supplementary text in the message hash: keywords and similar
# values that should be searchable without being part of the primary document content.
ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS = %w[
  $.details.hidden_search_terms
  $.details.metadata.hidden_indexable_content
  $.details.metadata.project_code
].map { |path| JsonPath.new(path) }.freeze
# Separator placed between the individual values when they are joined into one string.
ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR = "\n".freeze

# Synchronize the document to the given repository (i.e. put it in the repository).
def synchronize_to(repository)
repository.put(content_id, metadata, content:, payload_version:)
Expand All @@ -29,13 +36,16 @@ def synchronize_to(repository)
# Builds the metadata hash that is passed to the repository together with the document
# content. Plain fields are read straight from document_hash; the shorthand entries
# (e.g. `link:`, `url:`) resolve to the same-named methods of this class.
# NOTE(review): document_type is listed twice and both public_timestamp and
# public_timestamp_int appear - this looks like removed and added diff lines shown
# together; confirm the final key set against the committed file.
def metadata
{
content_id: document_hash["content_id"],
document_type: document_hash["document_type"],
title: document_hash["title"],
description: document_hash["description"],
additional_searchable_text:,
link:,
url:,
public_timestamp:,
public_timestamp_int:,
document_type: document_hash["document_type"],
content_purpose_supergroup: document_hash["content_purpose_supergroup"],
# Falls back to an empty list when the document has no "links" -> "taxons" entry.
part_of_taxonomy_tree: document_hash.dig("links", "taxons") || [],
locale: document_hash["locale"],
}
end

Expand All @@ -61,14 +71,17 @@ def url
Plek.website_root + link
end

def public_timestamp
document_hash["public_updated_at"]
# Collects every value matched by the additional-searchable-text JsonPaths in document_hash
# and joins them into a single string. Array values are flattened so that each element is
# joined individually.
def additional_searchable_text
  ADDITIONAL_SEARCHABLE_TEXT_VALUES_JSON_PATHS
    .flat_map { |json_path| json_path.on(document_hash) }
    .flatten
    .join(ADDITIONAL_SEARCHABLE_TEXT_VALUES_SEPARATOR)
end

def public_timestamp_int
return nil unless public_timestamp
def public_timestamp
return nil unless document_hash["public_updated_at"]

Time.parse(public_timestamp).to_i # rubocop:disable Rails/TimeZone (string contains TZ info)
# rubocop:disable Rails/TimeZone (string already contains timezone info which would be lost)
Time.parse(document_hash["public_updated_at"]).to_i
# rubocop:enable Rails/TimeZone
end
end
end
Expand Down
173 changes: 106 additions & 67 deletions spec/lib/document_sync_worker/document/publish_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,62 +35,19 @@
let(:document_hash) do
{
"details" => {
"body" => "a",
"description" => "b",
"hidden_search_terms" => "c",
"introduction" => "d",
"introductory_paragraph" => "e",
"more_information" => "f",
"description" => "a",
"introduction" => "b",
"introductory_paragraph" => "c",
"title" => "d",
"summary" => "e",
"body" => "f",
"need_to_know" => "g",
"summary" => "h",
"title" => "i",
"more_information" => "h",
},
}
end

it { is_expected.to eq("a\nb\nc\nd\ne\nf\ng\nh\ni") }
end

describe "with hidden indexable content as an array" do
let(:document_hash) do
{
"details" => {
"metadata" => {
"hidden_indexable_content" => %w[x y z],
},
},
}
end

it { is_expected.to eq("x\ny\nz") }
end

describe "with hidden indexable content as a string" do
let(:document_hash) do
{
"details" => {
"metadata" => {
"hidden_indexable_content" => "x y z",
},
},
}
end

it { is_expected.to eq("x y z") }
end

describe "with a project code" do
let(:document_hash) do
{
"details" => {
"metadata" => {
"project_code" => "PRINCE2",
},
},
}
end

it { is_expected.to eq("PRINCE2") }
it { is_expected.to eq("a\nb\nc\nd\ne\nf\ng\nh") }
end

describe "with contact groups" do
Expand Down Expand Up @@ -145,14 +102,6 @@
it { is_expected.to eq("000-000-000") }
end

describe "document_type" do
subject(:extracted_document_type) { document.metadata[:document_type] }

let(:document_hash) { { "document_type" => "foo_bar" } }

it { is_expected.to eq("foo_bar") }
end

describe "title" do
subject(:extracted_title) { document.metadata[:title] }

Expand All @@ -169,6 +118,64 @@
it { is_expected.to eq("Lorem ipsum dolor sit amet.") }
end

describe "additional_searchable_text" do
  subject(:additional_searchable_text) { document.metadata[:additional_searchable_text] }

  # Each context populates exactly one of the source fields so its contribution to the
  # extracted text can be checked in isolation.
  context "with hidden search terms" do
    let(:document_hash) { { "details" => { "hidden_search_terms" => "a b c" } } }

    it { is_expected.to eq("a b c") }
  end

  context "with hidden indexable content as an array" do
    let(:document_hash) { { "details" => { "metadata" => { "hidden_indexable_content" => %w[x y z] } } } }

    # Array elements are expected to come out newline-separated.
    it { is_expected.to eq("x\ny\nz") }
  end

  context "with hidden indexable content as a string" do
    let(:document_hash) { { "details" => { "metadata" => { "hidden_indexable_content" => "x y z" } } } }

    it { is_expected.to eq("x y z") }
  end

  context "with a project code" do
    let(:document_hash) { { "details" => { "metadata" => { "project_code" => "PRINCE2" } } } }

    it { is_expected.to eq("PRINCE2") }
  end
end

describe "link" do
subject(:extracted_link) { document.metadata[:link] }

Expand Down Expand Up @@ -240,14 +247,6 @@

let(:document_hash) { { "public_updated_at" => "2012-02-01T00:00:00Z" } }

it { is_expected.to eq("2012-02-01T00:00:00Z") }
end

describe "public_timestamp_int" do
subject(:extracted_public_timestamp_int) { document.metadata[:public_timestamp_int] }

let(:document_hash) { { "public_updated_at" => "2012-02-01T00:00:00Z" } }

it { is_expected.to eq(1_328_054_400) }

context "without a public_timestamp" do
Expand All @@ -256,6 +255,46 @@
it { is_expected.to be_nil }
end
end

describe "document_type" do
subject(:extracted_document_type) { document.metadata[:document_type] }

let(:document_hash) { { "document_type" => "foo_bar" } }

it { is_expected.to eq("foo_bar") }
end

describe "content_purpose_supergroup" do
subject(:extracted_content_purpose_supergroup) { document.metadata[:content_purpose_supergroup] }

let(:document_hash) { { "content_purpose_supergroup" => "foo_bar" } }

it { is_expected.to eq("foo_bar") }
end

# Covers the :part_of_taxonomy_tree metadata entry: the taxon list is passed through when
# present, and an empty collection is returned when the document has links but no taxons.
describe "part_of_taxonomy_tree" do
  subject(:extracted_part_of_taxonomy_tree) { document.metadata[:part_of_taxonomy_tree] }

  context "with a set of taxon links" do
    let(:document_hash) { { "links" => { "taxons" => %w[0000 ffff] } } }

    it { is_expected.to eq(%w[0000 ffff]) }
  end

  context "without taxon links" do
    # The key must be the String "links" (hash-rocket syntax). Writing `"links": {}` would
    # create the Symbol key :links, so the fixture would not contain a "links" entry at all
    # and this context would not exercise the "links present, taxons missing" case.
    let(:document_hash) { { "links" => {} } }

    it { is_expected.to be_empty }
  end
end

describe "locale" do
subject(:extracted_locale) { document.metadata[:locale] }

let(:document_hash) { { "locale" => "en" } }

it { is_expected.to eq("en") }
end
end

describe "#synchronize_to" do
Expand Down
22 changes: 15 additions & 7 deletions spec/lib/document_sync_worker_integration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,18 @@
result = repository.get("f75d26a3-25a4-4c31-beea-a77cada4ce12")
expect(result[:metadata]).to eq(
content_id: "f75d26a3-25a4-4c31-beea-a77cada4ce12",
document_type: "press_release",
title: "Ebola medal for over 3000 heroes",
description: "A new medal has been created to recognise the bravery and hard work of people who have helped to stop the spread of Ebola.",
additional_searchable_text: "",
link: "/government/news/ebola-medal-for-over-3000-heroes",
url: "http://www.dev.gov.uk/government/news/ebola-medal-for-over-3000-heroes",
public_timestamp: "2015-06-11T11:14:00Z",
public_timestamp_int: 1_434_021_240,
public_timestamp: 1_434_021_240,
document_type: "press_release",
content_purpose_supergroup: "news_and_communications",
part_of_taxonomy_tree: %w[
668cd623-c7a8-4159-9575-90caac36d4b4 c31256e8-f328-462b-993f-dce50b7892e9
],
locale: "en",
)
expect(result[:content]).to start_with("<div class=\"govspeak\"><p>The government has")
expect(result[:content]).to end_with("response to Ebola</a>.</p>\n</div>\n\n</div>")
Expand All @@ -38,15 +43,18 @@
result = repository.get("526d5caf-221b-4c7b-9e74-b3e0b189fc8d")
expect(result[:metadata]).to eq(
content_id: "526d5caf-221b-4c7b-9e74-b3e0b189fc8d",
document_type: "external_content",
title: "Brighton & Hove City Council",
description: "Website of Brighton & Hove City Council",
additional_searchable_text: "Brighton & Hove City Council",
link: "https://www.brighton-hove.gov.uk",
url: "https://www.brighton-hove.gov.uk",
public_timestamp: "2023-09-28T14:56:19Z",
public_timestamp_int: 1_695_912_979,
public_timestamp: 1_695_912_979,
document_type: "external_content",
content_purpose_supergroup: "other",
part_of_taxonomy_tree: [],
locale: "en",
)
expect(result[:content]).to eq("Brighton & Hove City Council")
expect(result[:content]).to be_blank
end
end

Expand Down

0 comments on commit d012157

Please sign in to comment.