Benchmarked Resiliparse & added flag to evaluate parsers individually #25

Open · wants to merge 2 commits into master
37 changes: 30 additions & 7 deletions evaluate.py
```diff
@@ -7,6 +7,7 @@
 import re
 import statistics
 from typing import Any, Dict, Tuple, List
+import importlib
 
 
 def main():
@@ -19,19 +20,41 @@ def main():
     parser.add_argument('--bootstrap-differences', action='store_true',
                         help='run bootstrap for differences')
     parser.add_argument('--output', type=Path, help='output results as json')
+    parser.add_argument('--parser', type=str, help='Specify a parser name to evaluate only that parser')
 
     args = parser.parse_args()
     ground_truth = load_json(Path('ground-truth.json'))
     metrics_by_name = {}
-    for path in sorted(Path('output').glob('*.json')):
-        name = path.stem
+
+    if args.parser:
+        name = args.parser
+        path = Path('output') / f'{name}.json'
+        if not path.exists():
+            try:
+                extractor_module = importlib.import_module(f'extractors.run_{name}')
+                extractor_module.main()
+            except:
```
A Member commented on the `except:` line:

> I'd rather catch Exception here, e.g. see motivation in this (rejected) PEP https://peps.python.org/pep-0760/#motivation

Suggested change:

```diff
-            except:
+            except Exception:
```
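The PEP's motivation applies directly to this snippet: a bare `except:` also catches `KeyboardInterrupt` and `SystemExit`, so interrupting a slow extractor run with Ctrl-C would surface as "Parser not found" instead of stopping the program. A minimal sketch of the suggested fix in context (my illustration, not a commit in this PR):

```python
import importlib

# `except Exception` lets KeyboardInterrupt/SystemExit propagate, while
# still converting genuine import or extraction errors into the
# "parser not found" error that evaluate.py raises.
try:
    extractor_module = importlib.import_module(f'extractors.run_{name}')
    extractor_module.main()
except Exception:
    raise ValueError(f'Parser {name} not found')
```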

```diff
+                raise ValueError(f'Parser {name} not found')
+
         metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
         print('{name:<20} '
-              'precision={precision:.3f} ± {precision_std:.3f} '
-              'recall={recall:.3f} ± {recall_std:.3f} '
-              'F1={f1:.3f} ± {f1_std:.3f} '
-              'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
-              .format(name=name, **metrics))
+              'precision={precision:.3f} ± {precision_std:.3f} '
+              'recall={recall:.3f} ± {recall_std:.3f} '
+              'F1={f1:.3f} ± {f1_std:.3f} '
+              'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
+              .format(name=name, **metrics))
         metrics_by_name[name] = metrics
+    else:
```
A Member commented on the `else:` branch:

> I think it would be best to refactor the code in a way which does not lead to having to repeat the reporting. For example, we could pass args.parser to the evaluate function.
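One way to act on this (my sketch, not code from the PR; the up-front `paths` list is an assumption about how the refactor could look): select the result files first, then keep a single evaluate-and-report loop so the print block exists only once. The PR's auto-run fallback via `importlib` could live inside the first branch.

```python
# Sketch: narrow the set of result files up front, then report in one place.
if args.parser:
    # The extractor could be triggered here if the JSON is missing,
    # as the PR does with importlib.import_module.
    paths = [Path('output') / f'{args.parser}.json']
else:
    paths = sorted(Path('output').glob('*.json'))

for path in paths:
    name = path.stem
    metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
    print('{name:<20} '
          'precision={precision:.3f} ± {precision_std:.3f} '
          'recall={recall:.3f} ± {recall_std:.3f} '
          'F1={f1:.3f} ± {f1_std:.3f} '
          'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
          .format(name=name, **metrics))
    metrics_by_name[name] = metrics
```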

```diff
+        for path in sorted(Path('output').glob('*.json')):
+            name = path.stem
+            metrics = evaluate(ground_truth, load_json(path), args.n_bootstrap)
+            print('{name:<20} '
+                  'precision={precision:.3f} ± {precision_std:.3f} '
+                  'recall={recall:.3f} ± {recall_std:.3f} '
+                  'F1={f1:.3f} ± {f1_std:.3f} '
+                  'accuracy={accuracy:.3f} ± {accuracy_std:.3f} '
+                  .format(name=name, **metrics))
+            metrics_by_name[name] = metrics
 
     if args.bootstrap_differences:
         # check differences with bootstrap
```
22 changes: 22 additions & 0 deletions extractors/run_resiliparse.py
```python
#!/usr/bin/env python3
import gzip
import json
from pathlib import Path

from resiliparse.extract.html2text import extract_plain_text


def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        output[item_id] = {'articleBody': extract_plain_text(html, main_content=True)}
    (Path('output') / 'resiliparse.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')


if __name__ == '__main__':
    main()
```
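For context on the extractor itself: `extract_plain_text(..., main_content=True)` enables Resiliparse's main-content heuristics, which drop boilerplate such as navigation, headers, and footers. A minimal standalone call (toy HTML of my own, not from the benchmark set):

```python
from resiliparse.extract.html2text import extract_plain_text

html = ('<html><body><nav>Menu</nav>'
        '<main><p>Article body text.</p></main></body></html>')
# main_content=True keeps only the main article region; with the default
# (False) the navigation text would be included as well.
print(extract_plain_text(html, main_content=True))
```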