Add Legal Contract Summarization scenario (#3131)

Co-authored-by: Mikio Takeuchi <[email protected]> Co-authored-by: Ryo Kawahara <[email protected]>
stanford-crfm · Nov 12, 2024 · 6bb3662 · 6bb3662
1 parent 070d36a
commit 6bb3662
Show file tree

Hide file tree

Showing 3 changed files with 190 additions and 0 deletions.
diff --git a/src/helm/benchmark/run_specs/enterprise_run_specs.py b/src/helm/benchmark/run_specs/enterprise_run_specs.py
@@ -4,6 +4,7 @@
     get_generation_adapter_spec,
 )
 from helm.benchmark.metrics.common_metric_specs import (
+    get_basic_metric_specs,
     get_classification_metric_specs,
     get_exact_match_metric_specs,
 )
@@ -34,3 +35,27 @@ def get_news_headline_spec(category: str) -> RunSpec:
         metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
         groups=["gold_commodity_news"],
     )
+
+
+@run_spec_function("legal_contract_summarization")
+def get_legal_contract_spec() -> RunSpec:
+    """Build the run spec for plain-English legal contract summarization, scored with ROUGE-1/2/L."""
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.legal_contract_summarization_scenario.LegalContractSummarizationScenario",
+        args={},
+    )
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Summarize the legal document in plain English.",
+        input_noun="Document",
+        output_noun="Summary",
+        max_tokens=100,
+        stop_sequences=["\n\n"],
+    )
+    # rouge_l is requested because schema_enterprise.yaml declares it next to rouge_1 and rouge_2.
+    return RunSpec(
+        name="legal_contract_summarization",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_basic_metric_specs(["rouge_1", "rouge_2", "rouge_l"]),
+        groups=["legal_contract_summarization"],
+    )
diff --git a/src/helm/benchmark/scenarios/legal_contract_summarization_scenario.py b/src/helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -0,0 +1,129 @@
+import os
+import pandas as pd
+import json
+import re
+
+from typing import List
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Output,
+)
+
+
+class LegalContractSummarizationScenario(Scenario):
+    """Legal Contract Summarization
+
+    A legal contract summarization benchmark based on the paper
+    Plain English Summarization of Contracts (Manor & Li, NAACL 2019),
+    which presented a dataset of legal text snippets paired with summaries
+    written in plain English.
+
+    @inproceedings{manor-li-2019-plain,
+        title = "Plain {E}nglish Summarization of Contracts",
+        author = "Manor, Laura  and
+        Li, Junyi Jessy",
+        editor = "Aletras, Nikolaos  and
+        Ash, Elliott  and
+        Barrett, Leslie  and
+        Chen, Daniel  and
+        Meyers, Adam  and
+        Preotiuc-Pietro, Daniel  and
+        Rosenberg, David  and
+        Stent, Amanda",
+        booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2019",
+        month = jun,
+        year = "2019",
+        address = "Minneapolis, Minnesota",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/W19-2201",
+        doi = "10.18653/v1/W19-2201",
+        pages = "1--11",
+        abstract = "Unilateral legal contracts, such as terms of service, play a substantial role in modern digital life. However, few read these documents before accepting the terms within, as they are too long and the language too complicated. We propose the task of summarizing such legal documents in plain English, which would enable users to have a better understanding of the terms they are accepting. We propose an initial dataset of legal text snippets paired with summaries written in plain English. We verify the quality of these summaries manually, and show that they involve heavy abstraction, compression, and simplification. Initial experiments show that unsupervised extractive summarization methods do not perform well on this task due to the level of abstraction and style differences. We conclude with a call for resource and technique development for simplification and style transfer for legal language.",
+    }
+    """  # noqa: E501
+
+    TRAIN_RATIO: float = 0.2  # fraction of rows sampled into TRAIN_SPLIT; the remainder becomes TEST_SPLIT
+    ARTICLE_COLUMN_NAME = "original_text"
+    SUMMARY_COLUMN_NAME = "reference_summary"
+    ID_COLUMN_NAME = "uid"
+
+    name = "legal_contract_summarization"
+    description = (
+        "Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf)."
+    )
+    tags = ["summarization", "legal"]
+
+    def __init__(self):
+        """Initializes the scenario."""
+        super().__init__()
+
+    @staticmethod
+    def _clean(text: str) -> str:
+        """Collapse every run of whitespace (newlines included) in `text` into a single space."""
+        return re.sub(r"\s+", " ", text)
+
+    def _load_dataset(self, output_path: str):
+        """Fetch all_v1.json under `output_path`/data and return a {split name: DataFrame} dict.
+
+        Rows with a missing article, summary or id are dropped before the random train/test split.
+        """
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        source_url = "https://raw.githubusercontent.com/lauramanor/legal_summarization/master/all_v1.json"
+        target_path = os.path.join(data_dir, os.path.basename(source_url))
+        ensure_file_downloaded(
+            source_url=source_url,
+            target_path=target_path,
+        )
+
+        # Read as UTF-8 explicitly so parsing does not depend on the platform's default encoding.
+        with open(target_path, encoding="utf-8") as f:
+            json_data = json.load(f)
+
+        # Only the values of the top-level JSON object are used as rows; its keys are discarded.
+        target_df = pd.DataFrame.from_records(list(json_data.values()))
+        target_df = target_df.dropna(
+            subset=[
+                LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME,
+                LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME,
+                LegalContractSummarizationScenario.ID_COLUMN_NAME,
+            ]
+        )
+        # Split randomly (works better than split by order)
+        train_df = target_df.sample(frac=LegalContractSummarizationScenario.TRAIN_RATIO, random_state=0)
+        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+
+        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Return one Instance per row of each split; the cleaned summary is its only correct reference."""
+        dataset = self._load_dataset(output_path)
+
+        instances: List[Instance] = []
+        for split, split_data in dataset.items():
+            for example in split_data.itertuples():
+                instance_id = getattr(example, LegalContractSummarizationScenario.ID_COLUMN_NAME)
+                article = LegalContractSummarizationScenario._clean(
+                    getattr(example, LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME)
+                )
+                summary = LegalContractSummarizationScenario._clean(
+                    getattr(example, LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME)
+                )
+                instances.append(
+                    Instance(
+                        id=instance_id,
+                        input=Input(text=article),
+                        references=[Reference(output=Output(text=summary), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
diff --git a/src/helm/benchmark/static/schema_enterprise.yaml b/src/helm/benchmark/static/schema_enterprise.yaml
@@ -51,6 +51,18 @@ metrics:
     short_display_name: EM
     description: Fraction of instances that the predicted output matches a correct reference up to light processing.
     lower_is_better: false
+  - name: rouge_1
+    display_name: ROUGE-1
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 1-gram overlap.
+    lower_is_better: false
+  - name: rouge_2
+    display_name: ROUGE-2
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on 2-gram overlap.
+    lower_is_better: false
+  - name: rouge_l
+    display_name: ROUGE-L
+    description: Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/) based on longest common subsequence overlap.
+    lower_is_better: false
 
 ############################################################
 perturbations: []
@@ -109,3 +121,27 @@ run_groups:
       who: financial journalists
       when: 2000-2019
       language: English
+
+  - name: legal_scenarios
+    display_name: Legal Scenarios
+    description: Scenarios for the legal domain
+    category: All scenarios
+    subgroups:
+      - legal_contract_summarization
+
+  - name: legal_contract_summarization
+    display_name: Legal Contract Summarization
+    description: Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: rouge_2
+      main_split: test
+    taxonomy:
+      task: summarization
+      what: legal contracts (e.g. terms of service, license agreements)
+      who: lawyers
+      when: before 2019
+      language: English