Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add publication place to sentence context #262

Open
wants to merge 2 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion experiments/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,10 @@ def linking_experiments(self) -> None:
self.mylinker.linking_resources["mentions_to_wikidata"],
)
if self.mylinker.rel_params["with_publication"]:
# If "publ", add an artificial publication entry:
# If "publ", add publication info to context and as new entry:
article_dataset = rel_utils.add_publication_in_context(
article_dataset
)
article_dataset = rel_utils.add_publication(article_dataset)
predicted = linking_model.predict(article_dataset)
if self.mylinker.rel_params["with_publication"]:
Expand Down
63 changes: 44 additions & 19 deletions geoparser/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,15 @@ def run_sentence(
mentions_dataset = dict()
mentions_dataset["linking"] = []
for m in mentions:
prediction = self.format_prediction(m, sentence, wk_cands=wk_cands, context=context, sent_idx=sent_idx, place=place, place_wqid=place_wqid)
prediction = self.format_prediction(
m,
sentence,
wk_cands=wk_cands,
context=context,
sent_idx=sent_idx,
place=place,
place_wqid=place_wqid,
)
mentions_dataset["linking"].append(prediction)

# If the linking method is "reldisamb", rank and format candidates,
Expand All @@ -274,6 +282,12 @@ def run_sentence(
place_wqid = self.mylinker.rel_params["default_publwqid"]
place = self.mylinker.rel_params["default_publname"]

# If "publ", add the place of publication to the context:
mentions_dataset = rel_utils.add_publication_in_context(
mentions_dataset,
place,
)

# If "publ", add an artificial publication entry:
mentions_dataset = rel_utils.add_publication(
mentions_dataset,
Expand Down Expand Up @@ -508,11 +522,7 @@ def run_text(

return document_dataset


def run_sentence_recognition(
self,
sentence
) -> List[dict]:
def run_sentence_recognition(self, sentence) -> List[dict]:
# Get predictions:
predictions = self.myner.ner_predict(sentence)

Expand All @@ -525,15 +535,16 @@ def run_sentence_recognition(
# Aggregate mentions:
mentions = ner.aggregate_mentions(procpreds, "pred")
return mentions


def format_prediction(self, mention,
sentence: str,
wk_cands: Optional[dict] = None,
context: Optional[Tuple[str, str]] = ("", ""),
sent_idx: Optional[int] = 0,
place: Optional[str] = "",
place_wqid: Optional[str] = ""

def format_prediction(
self,
mention,
sentence: str,
wk_cands: Optional[dict] = None,
context: Optional[Tuple[str, str]] = ("", ""),
sent_idx: Optional[int] = 0,
place: Optional[str] = "",
place_wqid: Optional[str] = "",
) -> dict:
prediction = dict()
prediction["mention"] = mention["mention"]
Expand All @@ -551,12 +562,12 @@ def format_prediction(self, mention,
prediction["place"] = place
prediction["place_wqid"] = place_wqid
if wk_cands:
prediction["string_match_candidates"] = wk_cands.get(mention["mention"], dict())
prediction["string_match_candidates"] = wk_cands.get(
mention["mention"], dict()
)
prediction["candidates"] = wk_cands.get(mention["mention"], dict())
return prediction



def run_text_recognition(
self,
text: str,
Expand Down Expand Up @@ -627,7 +638,15 @@ def run_text_recognition(

mentions_dataset = []
for m in mentions:
prediction = self.format_prediction(m, sentence, wk_cands=None, context=context, sent_idx=idx, place=place, place_wqid=place_wqid)
prediction = self.format_prediction(
m,
sentence,
wk_cands=None,
context=context,
sent_idx=idx,
place=place,
place_wqid=place_wqid,
)
# mentions_dataset["linking"].append(prediction)
if not len(m["mention"]) == 1 and not m["mention"].islower():
mentions_dataset.append(prediction)
Expand Down Expand Up @@ -777,6 +796,12 @@ def run_disambiguation(
place_wqid = self.mylinker.rel_params["default_publwqid"]
place = self.mylinker.rel_params["default_publname"]

# If "publ", add the place of publication to the context:
mentions_dataset = rel_utils.add_publication_in_context(
mentions_dataset,
place,
)

# If "publ", add an artificial publication entry:
mentions_dataset = rel_utils.add_publication(
mentions_dataset,
Expand Down
69 changes: 23 additions & 46 deletions resources/publication_metadata.json
Original file line number Diff line number Diff line change
@@ -1,162 +1,139 @@
{
"sn83030483": {
"publication_title": "Gazette of the United-States",
"publication_place": "New York",
"publication_ctxt": "New York",
"publication_place": "New York, New York",
"publication_dates": "1789-1793",
"wikidata_qid": "Q60"
},
"sn84026272": {
"publication_title": "Gazette of the United-States",
"publication_place": "Philadelphia",
"publication_ctxt": "Pennsylvania",
"publication_place": "Philadelphia, Pennsylvania",
"publication_dates": "1800-1801",
"wikidata_qid": "Q1345"
},
"sn82014385": {
"publication_title": "The Delaware gazette",
"publication_place": "Wilmington",
"publication_ctxt": "Delaware",
"publication_place": "Wilmington, Delaware",
"publication_dates": "1809-1810",
"wikidata_qid": "Q174224"
},
"sn83026170": {
"publication_title": "Alexandria Gazette",
"publication_place": "Alexandria",
"publication_ctxt": "Virginia",
"publication_place": "Alexandria, Virginia",
"publication_dates": "1817-1822",
"wikidata_qid": "Q88"
},
"sn83020874": {
"publication_title": "Cherokee Phoenix, and Indian's advocate",
"publication_place": "Echota",
"publication_ctxt": "Georgia",
"publication_place": "Echota, Georgia",
"publication_dates": "1829-1834",
"wikidata_qid": "Q7007061"
},
"sn84020750": {
"publication_title": "The North Carolinian",
"publication_place": "Fayetteville",
"publication_ctxt": "North Carolina",
"publication_place": "Fayetteville, North Carolina",
"publication_dates": "1839-1861",
"wikidata_qid": "Q331104"
},
"sn85042404": {
"publication_title": "Jamestown Alert",
"publication_place": "Jamestown",
"publication_ctxt": "North Dakota",
"publication_place": "Jamestown, North Dakota",
"publication_dates": "1878-1882",
"wikidata_qid": "Q1052658"
},
"sn88068010": {
"publication_title": "Chariton Courier",
"publication_place": "Keytesville",
"publication_ctxt": "Missouri",
"publication_place": "Keytesville, Missouri",
"publication_dates": "1878-current",
"wikidata_qid": "Q957297"
},
"sn86063397": {
"publication_title": "The Elk Mountain pilot",
"publication_place": "Irwin",
"publication_ctxt": "Colorado",
"publication_place": "Irwin, Colorado",
"publication_dates": "1880-19??",
"wikidata_qid": "Q592729"
},
"sn88085488": {
"publication_title": "Pullman Herald",
"publication_place": "Pullman",
"publication_ctxt": "Washington",
"publication_place": "Pullman, Washington",
"publication_dates": "1888-1989",
"wikidata_qid": "Q983540"
},
"sn89058133": {
"publication_title": "Putnam County Herald",
"publication_place": "Cookeville",
"publication_ctxt": "Tennessee",
"publication_place": "Cookeville, Tennessee",
"publication_dates": "1903-1922",
"wikidata_qid": "Q2456192"
},
"sn83025812": {
"publication_title": "The Independent",
"publication_place": "Elizabeth City",
"publication_ctxt": "North Carolina",
"publication_place": "Elizabeth City, North Carolina",
"publication_dates": "1908-1936",
"wikidata_qid": "Q1018467"
},
"sn92063852": {
"publication_title": "The Detroit Tribune",
"publication_place": "Detroit",
"publication_ctxt": "Michigan",
"publication_place": "Detroit, Michigan",
"publication_dates": "1935-1966",
"wikidata_qid": "Q12439"
},
"sn91068761": {
"publication_title": "Tabor City Tribune",
"publication_place": "Tabor City",
"publication_ctxt": "North Carolina",
"publication_place": "Tabor City, North Carolina",
"publication_dates": "1946-1991",
"wikidata_qid": "Q586130"
},
"0000408": {
"publication_title": "Dorset County Chronicle",
"publication_place": "Dorchester",
"publication_ctxt": "Dorset",
"publication_place": "Dorchester, Dorset",
"publication_dates": "1824-1884",
"wikidata_qid": "Q503331"
},
"0000206": {
"publication_title": "Manchester Courier and Lancashire General Advertiser.",
"publication_place": "Manchester",
"publication_ctxt": "Lancashire",
"publication_place": "Manchester, Lancashire",
"publication_dates": "1825-1916",
"wikidata_qid": "Q18125"
},
"0000968": {
"publication_title": "The Ashton Weekly Reporter, and Stalybridge and Dukinfield Chronicle",
"publication_place": "Ashton-under-Lyne",
"publication_ctxt": "Lancashire",
"publication_place": "Ashton-under-Lyne, Lancashire",
"publication_dates": "1855-",
"wikidata_qid": "Q659803"
},
"0000200": {
"publication_title": "The Manchester Mercury",
"publication_place": "Manchester",
"publication_ctxt": "Lancashire",
"publication_place": "Manchester, Lancashire",
"publication_dates": "1752-1830",
"wikidata_qid": "Q18125"
},
"0000201": {
"publication_title": "The Manchester Mercury",
"publication_place": "Manchester",
"publication_ctxt": "Lancashire",
"publication_place": "Manchester, Lancashire",
"publication_dates": "1752-1830",
"wikidata_qid": "Q18125"
},
"0000239": {
"publication_title": "The Manchester Mercury",
"publication_place": "Manchester",
"publication_ctxt": "Lancashire",
"publication_place": "Manchester, Lancashire",
"publication_dates": "1752-1830",
"wikidata_qid": "Q18125"
},
"0000240": {
"publication_title": "The Manchester Mercury",
"publication_place": "Manchester",
"publication_ctxt": "Lancashire",
"publication_place": "Manchester, Lancashire",
"publication_dates": "1752-1830",
"wikidata_qid": "Q18125"
},
"0000967": {
"publication_title": "Ashton and Stalybridge Reporter",
"publication_place": "Ashton-under-Lyne",
"publication_ctxt": "Lancashire",
"publication_place": "Ashton-under-Lyne, Lancashire",
"publication_dates": "1855-",
"wikidata_qid": "Q659803"
},
"0002325": {
"publication_title": "The Poole and South-Western Herald",
"publication_place": "Poole",
"publication_ctxt": "Dorset",
"publication_place": "Poole, Dorset",
"publication_dates": "1852-1889",
"wikidata_qid": "Q203349"
}
Expand Down
30 changes: 30 additions & 0 deletions utils/rel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,32 @@ def add_publication(
return new_json


def add_publication_in_context(rel_json: dict, publname: Optional[str] = "") -> dict:
    """
    Append the place of publication to the right-hand context of each mention.

    For every key in ``rel_json`` the place name is chosen as follows:

    * for the special key ``"linking"``, ``publname`` is always used;
    * for any other key, the ``"place"`` value of the first mention in
      that article is used, falling back to ``publname`` when the key is
      missing or the article has no mentions.

    The chosen place is appended (preceded by a single space) to the
    second element of each mention's ``"context"`` sequence.

    The input is not modified: each mention dictionary is shallow-copied
    and receives a freshly built ``"context"`` of the same container type
    (tuple stays tuple, anything else becomes a list).

    Arguments:
        rel_json (dict): Mapping from article identifier (or
            ``"linking"``) to a list of mention dictionaries, each having
            a two-element ``"context"`` sequence.
        publname (str, optional): The name of the publication place.
            Defaults to an empty string.

    Returns:
        dict: A new dictionary with the same keys, whose mentions carry
        the extended context.
    """
    new_json = rel_json.copy()
    for article, mentions in rel_json.items():
        place = publname
        # Only look at the first mention when there is one: an article
        # without mentions must not raise IndexError.
        if article != "linking" and mentions:
            place = mentions[0].get("place", publname)
        # A missing (None) place is treated like an empty name so that the
        # string concatenation below cannot fail.
        suffix = " " + (place or "")

        new_article = []
        for art_mention in mentions:
            original_context = art_mention["context"]
            # Work on a list copy: tuples do not support item assignment,
            # and lists must not be mutated in place (that would alter the
            # caller's data and duplicate the place on repeated calls).
            context = list(original_context)
            context[1] = context[1] + suffix

            new_mention = dict(art_mention)
            if isinstance(original_context, tuple):
                new_mention["context"] = tuple(context)
            else:
                new_mention["context"] = context
            new_article.append(new_mention)
        new_json[article] = new_article
    return new_json


def prepare_rel_trainset(
df: pd.DataFrame,
rel_params,
Expand Down Expand Up @@ -333,6 +359,10 @@ def prepare_rel_trainset(
# If "publ" is taken into account for the disambiguation, add the place
# of publication as an additional already disambiguated entity per row:
if rel_params["with_publication"] == True:
rel_json = add_publication_in_context(
rel_json,
rel_params["default_publname"],
)
rel_json = add_publication(
rel_json,
rel_params["default_publname"],
Expand Down