From 75fa706f49190268ef855d66e3232730ced0bed7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 20 Sep 2023 12:03:12 +0200 Subject: [PATCH 1/3] Reverse control flow, ditch kwargs for generate_prompts(). --- spacy_llm/tasks/builtin_task.py | 24 ++++++++++++++++++++---- spacy_llm/tasks/rel/task.py | 21 +++++++++++---------- spacy_llm/tasks/sentiment/task.py | 3 --- spacy_llm/tasks/span/task.py | 21 ++++++++++----------- spacy_llm/tasks/summarization/task.py | 10 +++++++--- spacy_llm/tasks/textcat/task.py | 16 ++++++++-------- spacy_llm/ty.py | 2 +- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/spacy_llm/tasks/builtin_task.py b/spacy_llm/tasks/builtin_task.py index fa565a97..9af03f5c 100644 --- a/spacy_llm/tasks/builtin_task.py +++ b/spacy_llm/tasks/builtin_task.py @@ -4,7 +4,7 @@ import jinja2 import srsly -from spacy import Language, util, Errors +from spacy import Errors, Language, util from spacy.tokens import Doc from spacy.training import Example @@ -45,19 +45,35 @@ def __init__( self._template = template self._prompt_example_type = prompt_example_type - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[Any]: + def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[Any]: """Generate prompts from docs. docs (Iterable[Doc]): Docs to generate prompts from. RETURNS (Iterable[Any]): Iterable with one prompt per doc. """ environment = jinja2.Environment() _template = environment.from_string(self._template) - for doc in docs: + for doc in self._preprocess_docs_for_prompt(docs): prompt = _template.render( - text=doc.text, prompt_examples=self._prompt_examples, **kwargs + text=doc.text, + prompt_examples=self._prompt_examples, + **self._prompt_data, ) yield prompt + @property + def _prompt_data(self) -> Dict[str, Any]: + """Returns data injected into prompt template. No-op if not overridden by inheriting task class. + RETURNS (Dict[str, Any]): Data injected into prompt template. + """ + return {} + + def _preprocess_docs_for_prompt(self, docs: Iterable[Doc]) -> Iterable[Doc]: + """Preprocesses docs before injection into prompt template. No-op if not overridden by inheriting task class. + docs (Iterable[Doc]): Docs to generate prompts from. + RETURNS (Iterable[Doc]): Preprocessed docs. 
+ """ + return docs + @abc.abstractmethod def parse_responses( self, docs: Iterable[Doc], responses: Iterable[Any] diff --git a/spacy_llm/tasks/rel/task.py b/spacy_llm/tasks/rel/task.py index fe9f6e4e..98ff30a9 100644 --- a/spacy_llm/tasks/rel/task.py +++ b/spacy_llm/tasks/rel/task.py @@ -1,4 +1,4 @@ -from typing import Callable, Dict, Iterable, List, Optional, Type, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union from spacy.language import Language from spacy.tokens import Doc @@ -52,15 +52,16 @@ def __init__( self._verbose = verbose self._field = "rel" - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[str]: - return super().generate_prompts( - docs=[ - Doc(doc.vocab, words=RELTask._preannotate(doc).split()) for doc in docs - ], - labels=list(self._label_dict.values()), - label_definitions=self._label_definitions, - preannotate=RELTask._preannotate, - ) + def _preprocess_docs_for_prompt(self, docs: Iterable[Doc]) -> Iterable[Doc]: + return [Doc(doc.vocab, words=RELTask._preannotate(doc).split()) for doc in docs] + + @property + def _prompt_data(self) -> Dict[str, Any]: + return { + "labels": list(self._label_dict.values()), + "label_definitions": self._label_definitions, + "preannotate": RELTask._preannotate, + } @staticmethod def _preannotate(doc: Union[Doc, RELExample]) -> str: diff --git a/spacy_llm/tasks/sentiment/task.py b/spacy_llm/tasks/sentiment/task.py index 54c82572..ffde2368 100644 --- a/spacy_llm/tasks/sentiment/task.py +++ b/spacy_llm/tasks/sentiment/task.py @@ -61,9 +61,6 @@ def initialize( get_examples=get_examples, nlp=nlp, n_prompt_examples=n_prompt_examples ) - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[str]: - return super().generate_prompts(docs=docs) - def parse_responses( self, docs: Iterable[Doc], responses: Iterable[str] ) -> Iterable[Doc]: diff --git a/spacy_llm/tasks/span/task.py b/spacy_llm/tasks/span/task.py index a7495c19..4ba69018 100644 --- a/spacy_llm/tasks/span/task.py +++ b/spacy_llm/tasks/span/task.py @@ -1,6 +1,6 @@ import abc import typing -from typing import Callable, Dict, Iterable, List, Optional, Type, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Type, Union from spacy.tokens import Doc, Span @@ -64,16 +64,15 @@ def __init__( if self._prompt_examples: self._prompt_examples = list(self._check_label_consistency(self)) - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[str]: - return super().generate_prompts( - docs=docs, - description=self._description, - labels=list(self._label_dict.values()), - label_definitions=self._label_definitions, - examples=self._prompt_examples, - allow_overlap=self._allow_overlap, - **kwargs, - ) + @property + def _prompt_data(self) -> Dict[str, Any]: + return { + "description": self._description, + "labels": list(self._label_dict.values()), + "label_definitions": self._label_definitions, + "examples": self._prompt_examples, + "allow_overlap": self._allow_overlap, + } @staticmethod def _validate_alignment(alignment_mode: str): diff --git a/spacy_llm/tasks/summarization/task.py b/spacy_llm/tasks/summarization/task.py index d951e59e..dc1f4f1f 100644 --- a/spacy_llm/tasks/summarization/task.py +++ b/spacy_llm/tasks/summarization/task.py @@ -1,5 +1,5 @@ import warnings -from typing import Callable, Iterable, List, Optional, Type +from typing import Any, Callable, Dict, Iterable, List, Optional, Type from spacy.language import Language from spacy.tokens import Doc @@ -78,12 +78,16 @@ def 
_check_prompt_example_summary_len(self) -> None: f"LLM will likely produce responses that are too long." ) - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[str]: + @property + def _prompt_data(self) -> Dict[str, Any]: + """Returns data injected into prompt template. No-op if not overridden by inheriting task class. + RETURNS (Dict[str, Any]): Data injected into prompt template. + """ if self._check_example_summaries: self._check_prompt_example_summary_len() self._check_example_summaries = False - return super().generate_prompts(docs=docs, max_n_words=self._max_n_words) + return {"max_n_words": self._max_n_words} def parse_responses( self, docs: Iterable[Doc], responses: Iterable[str] diff --git a/spacy_llm/tasks/textcat/task.py b/spacy_llm/tasks/textcat/task.py index b638db17..04e3b54e 100644 --- a/spacy_llm/tasks/textcat/task.py +++ b/spacy_llm/tasks/textcat/task.py @@ -83,14 +83,14 @@ def __init__( ) self._exclusive_classes = True - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[str]: - return super().generate_prompts( - docs=docs, - labels=list(self._label_dict.values()), - label_definitions=self._label_definitions, - exclusive_classes=self._exclusive_classes, - allow_none=self._allow_none, - ) + @property + def _prompt_data(self) -> Dict[str, Any]: + return { + "labels": list(self._label_dict.values()), + "label_definitions": self._label_definitions, + "exclusive_classes": self._exclusive_classes, + "allow_none": self._allow_none, + } def parse_responses( self, docs: Iterable[Doc], responses: Iterable[str] diff --git a/spacy_llm/ty.py b/spacy_llm/ty.py index 0673b1d0..e36fdccd 100644 --- a/spacy_llm/ty.py +++ b/spacy_llm/ty.py @@ -94,7 +94,7 @@ def __call__(self, examples: Iterable[Example], **kwargs) -> Dict[str, Any]: @runtime_checkable class LLMTask(Protocol): - def generate_prompts(self, docs: Iterable[Doc], **kwargs) -> Iterable[_PromptType]: + def generate_prompts(self, docs: Iterable[Doc]) -> Iterable[_PromptType]: """Generate prompts from docs. docs (Iterable[Doc]): Docs to generate prompts from. RETURNS (Iterable[_PromptType]): Iterable with one prompt per doc. From 04675a764c2a92d16bc96bb9266d3dd2dcc59bb9 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 20 Sep 2023 18:04:45 +0200 Subject: [PATCH 2/3] Revert examples to kwargs structure. --- spacy_llm/tasks/textcat/util.py | 8 +++++--- spacy_llm/ty.py | 6 ++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/spacy_llm/tasks/textcat/util.py b/spacy_llm/tasks/textcat/util.py index 0b7d8689..f4bad72b 100644 --- a/spacy_llm/tasks/textcat/util.py +++ b/spacy_llm/tasks/textcat/util.py @@ -11,11 +11,13 @@ class TextCatExample(BaseModel): answer: str @classmethod - def generate(cls, example: Example, **kwargs) -> Self: - if kwargs["use_binary"]: + def generate( + cls, example: Example, use_binary: bool, label_dict: Dict[str, str], **kwargs + ) -> Self: + if use_binary: answer = ( "POS" - if example.reference.cats[list(kwargs["label_dict"].values())[0]] == 1.0 + if example.reference.cats[list(label_dict.values())[0]] == 1.0 else "NEG" ) else: diff --git a/spacy_llm/ty.py b/spacy_llm/ty.py index e36fdccd..d7af596d 100644 --- a/spacy_llm/ty.py +++ b/spacy_llm/ty.py @@ -58,6 +58,12 @@ def from_disk( class FewshotExample(abc.ABC, BaseModel): + """Base fewshot-example. 
+ From Python 3.7 onwards it's possible to make Pydantic models generic, which allows for a clean solution (see + https://github.com/pydantic/pydantic/issues/4171) using the controller pattern and Pydantic's GenericModel + (BaseModel in Pydantic v2). Until then passing **kwargs seems like the sanest option. + """ + @classmethod @abc.abstractmethod def generate(cls, example: Example, **kwargs) -> Self: From 52e712aea2e20a7c9924474b2ecd7760224a089e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Thu, 21 Sep 2023 10:08:50 +0200 Subject: [PATCH 3/3] Fix TextCatExample. --- spacy_llm/tasks/textcat/util.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/spacy_llm/tasks/textcat/util.py b/spacy_llm/tasks/textcat/util.py index f4bad72b..bd88c8aa 100644 --- a/spacy_llm/tasks/textcat/util.py +++ b/spacy_llm/tasks/textcat/util.py @@ -3,21 +3,20 @@ from spacy.scorer import Scorer from spacy.training import Example -from ...compat import BaseModel, Self +from ...compat import Self +from ...ty import FewshotExample -class TextCatExample(BaseModel): +class TextCatExample(FewshotExample): text: str answer: str @classmethod - def generate( - cls, example: Example, use_binary: bool, label_dict: Dict[str, str], **kwargs - ) -> Self: - if use_binary: + def generate(cls, example: Example, **kwargs) -> Self: + if kwargs["use_binary"]: answer = ( "POS" - if example.reference.cats[list(label_dict.values())[0]] == 1.0 + if example.reference.cats[list(kwargs["label_dict"].values())[0]] == 1.0 else "NEG" ) else: @@ -29,7 +28,7 @@ def generate( ] ) - return TextCatExample( + return cls( text=example.reference.text, answer=answer, )
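Illustration (not part of the patch series above): a minimal, self-contained sketch of the control flow that PATCH 1/3 establishes. The base class keeps sole ownership of generate_prompts() and pulls extra template variables and doc preprocessing from two overridable hooks, _prompt_data and _preprocess_docs_for_prompt(), instead of letting each subclass override generate_prompts() and forward **kwargs to super(). Only those hook names mirror the diff; the class names, the toy template, and the use of plain strings in place of spacy Doc objects are assumptions made so the sketch runs on its own.

from typing import Any, Dict, Iterable

import jinja2


class BaseTaskSketch:
    """Stands in for BuiltinTask: owns the Jinja rendering loop."""

    def __init__(self, template: str):
        self._template = template

    def generate_prompts(self, docs: Iterable[str]) -> Iterable[str]:
        # Render one prompt per (preprocessed) doc, injecting whatever the
        # subclass exposes via _prompt_data -- no **kwargs involved.
        _template = jinja2.Environment().from_string(self._template)
        for doc in self._preprocess_docs_for_prompt(docs):
            yield _template.render(text=doc, **self._prompt_data)

    @property
    def _prompt_data(self) -> Dict[str, Any]:
        # No-op unless the inheriting task overrides it.
        return {}

    def _preprocess_docs_for_prompt(self, docs: Iterable[str]) -> Iterable[str]:
        # No-op unless the inheriting task overrides it.
        return docs


class TextCatSketch(BaseTaskSketch):
    """Mirrors how e.g. TextCatTask now only declares its template data."""

    @property
    def _prompt_data(self) -> Dict[str, Any]:
        return {"labels": ["POSITIVE", "NEGATIVE"]}

    def _preprocess_docs_for_prompt(self, docs: Iterable[str]) -> Iterable[str]:
        return [doc.strip() for doc in docs]


task = TextCatSketch("Classify '{{ text }}' into {{ labels|join(', ') }}.")
print(list(task.generate_prompts(["  spaCy is great.  "])))
# ["Classify 'spaCy is great.' into POSITIVE, NEGATIVE."]

The upshot of the reversed control flow: subclasses only declare data, while the rendering loop lives in exactly one place. That is also why the sentiment task's generate_prompts() override and the **kwargs parameter on the LLMTask protocol in ty.py could be dropped.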
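A second hedged sketch, for the few-shot example side: PATCH 2/3 briefly moved TextCatExample.generate() to explicit keyword parameters, and PATCH 3/3 rebases TextCatExample onto FewshotExample and restores the **kwargs form that the new FewshotExample docstring argues for. The call below is an assumed usage, not taken from the patches; it presumes a spacy-llm checkout with this series applied, and the Example construction, the label_dict contents, and use_binary=True are illustrative values only.

import spacy
from spacy.training import Example

from spacy_llm.tasks.textcat.util import TextCatExample

nlp = spacy.blank("en")
doc = nlp("The popcorn was stale.")
# Reference annotation: a single binary category marked as positive.
example = Example.from_dict(doc, {"cats": {"Review": 1.0}})

few_shot = TextCatExample.generate(
    example,
    use_binary=True,                  # binary task -> answer becomes "POS"/"NEG"
    label_dict={"review": "Review"},  # illustrative normalized-name -> label mapping
)
print(few_shot.text, few_shot.answer)  # The popcorn was stale. POS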