Benchmarked Resiliparse & added flag to evaluate parsers individually #25

Open · wants to merge 2 commits into master
37 changes: 30 additions & 7 deletions evaluate.py
```diff
@@ -7,6 +7,7 @@
 import re
 import statistics
 from typing import Any, Dict, Tuple, List
+import importlib
 
 
 def main():
@@ -19,19 +20,41 @@ def main():
     parser.add_argument('--bootstrap-differences', action='store_true',
                         help='run bootstrap for differences')
     parser.add_argument('--output', type=Path, help='output results as json')
+    parser.add_argument('--parser', type=str, help='Specify a parser name to evaluate only that parser')
 
     args = parser.parse_args()
     ground_truth = load_json(Path('ground-truth.json'))
     metrics_by_name = {}
-    for path in sorted(Path('output').glob('*.json')):
-        name = path.stem
+
+    if args.parser:
+        name = args.parser
+        path = Path('output') / f'{name}.json'
+        if not path.exists():
+            try:
+                extractor_module = importlib.import_module(f'extractors.run_{name}')
+                extractor_module.main()
+            except:
```
A Member commented on the `except:` line:

> I'd rather catch Exception here, e.g. see motivation in this (rejected) PEP https://peps.python.org/pep-0760/#motivation

Suggested change:

```diff
-            except:
+            except Exception:
```
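The PEP's motivation applies directly to this snippet: a bare `except:` also catches `KeyboardInterrupt` and `SystemExit`, so interrupting a slow extractor run with Ctrl-C would surface as "Parser not found" instead of stopping the program. A minimal sketch of the suggested fix in context (my illustration, not a commit in this PR):

```python
import importlib

# `except Exception` lets KeyboardInterrupt/SystemExit propagate, while
# still converting genuine import or extraction errors into the
# "parser not found" error that evaluate.py raises.
try:
    extractor_module = importlib.import_module(f'extractors.run_{name}')
    extractor_module.main()
except Exception:
    raise ValueError(f'Parser {name} not found')
```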

```diff
+                raise ValueError(f'Parser {name} not found')
+
         metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
         print('{name:<20} '
-              'precision={precision:.3f} ± {precision_std:.3f} '
-              'recall={recall:.3f} ± {recall_std:.3f} '
-              'F1={f1:.3f} ± {f1_std:.3f} '
-              'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
-              .format(name=name, **metrics))
+              'precision={precision:.3f} ± {precision_std:.3f} '
+              'recall={recall:.3f} ± {recall_std:.3f} '
+              'F1={f1:.3f} ± {f1_std:.3f} '
+              'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
+              .format(name=name, **metrics))
         metrics_by_name[name] = metrics
+    else:
```
A Member commented on the `else:` branch:

> I think it would be best to refactor the code in a way which does not lead to having to repeat the reporting. For example, we could pass args.parser to the evaluate function.
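One way to act on this (my sketch, not code from the PR; the up-front `paths` list is an assumption about how the refactor could look): select the result files first, then keep a single evaluate-and-report loop so the print block exists only once. The PR's auto-run fallback via `importlib` could live inside the first branch.

```python
# Sketch: narrow the set of result files up front, then report in one place.
if args.parser:
    # The extractor could be triggered here if the JSON is missing,
    # as the PR does with importlib.import_module.
    paths = [Path('output') / f'{args.parser}.json']
else:
    paths = sorted(Path('output').glob('*.json'))

for path in paths:
    name = path.stem
    metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
    print('{name:<20} '
          'precision={precision:.3f} ± {precision_std:.3f} '
          'recall={recall:.3f} ± {recall_std:.3f} '
          'F1={f1:.3f} ± {f1_std:.3f} '
          'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
          .format(name=name, **metrics))
    metrics_by_name[name] = metrics
```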

```diff
+        for path in sorted(Path('output').glob('*.json')):
+            name = path.stem
+            metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
+            print('{name:<20} '
+                  'precision={precision:.3f} ± {precision_std:.3f} '
+                  'recall={recall:.3f} ± {recall_std:.3f} '
+                  'F1={f1:.3f} ± {f1_std:.3f} '
+                  'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
+                  .format(name=name, **metrics))
+            metrics_by_name[name] = metrics
 
     if args.bootstrap_differences:
         # check differences with bootstrap
```
22 changes: 22 additions & 0 deletions extractors/run_resiliparse.py
```python
#!/usr/bin/env python3
import gzip
import json
from pathlib import Path

from resiliparse.extract.html2text import extract_plain_text


def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        output[item_id] = {'articleBody': extract_plain_text(html, main_content=True)}
    (Path('output') / 'resiliparse.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')


if __name__ == '__main__':
    main()
```
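For context on the extractor itself: `extract_plain_text(..., main_content=True)` enables Resiliparse's main-content heuristics, which drop boilerplate such as navigation, headers, and footers. A minimal standalone call (toy HTML of my own, not from the benchmark set):

```python
from resiliparse.extract.html2text import extract_plain_text

html = ('<html><body><nav>Menu</nav>'
        '<main><p>Article body text.</p></main></body></html>')
# main_content=True keeps only the main article region; with the default
# (False) the navigation text would be included as well.
print(extract_plain_text(html, main_content=True))
```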