diff --git a/.gitignore b/.gitignore
index 8d3eea734..33d85946f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@
 /ScoNe/
 testing/outputs/
 testing/playbook.ipynb
+testing/outputs/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py
new file mode 100644
index 000000000..4c45bbf46
--- /dev/null
+++ b/dsp/utils/settings.py
@@ -0,0 +1,142 @@
+import copy
+import threading
+from contextlib import contextmanager
+from dsp.utils.utils import dotdict
+
+DEFAULT_CONFIG = dotdict(
+    lm=None,
+    adapter=None,
+    rm=None,
+    branch_idx=0,
+    reranker=None,
+    compiled_lm=None,
+    force_reuse_cached_compilation=False,
+    compiling=False,
+    skip_logprobs=False,
+    trace=[],
+    release=0,
+    bypass_assert=False,
+    bypass_suggest=False,
+    assert_failures=0,
+    suggest_failures=0,
+    langchain_history=[],
+    experimental=False,
+    backoff_time=10,
+    callbacks=[],
+    async_max_workers=8,
+    request_cache=None,
+    send_stream=None,
+)
+
+# Global base configuration
+main_thread_config = copy.deepcopy(DEFAULT_CONFIG)
+
+
+class ThreadLocalOverrides(threading.local):
+    def __init__(self):
+        self.overrides = dotdict()  # Initialize thread-local overrides
+
+
+# Create the thread-local storage
+thread_local_overrides = ThreadLocalOverrides()
+
+
+class Settings:
+    """
+    A singleton class for DSPy configuration settings.
+
+    This is thread-safe. User threads are supported both through ParallelExecutor and native threading.
+    - If native threading is used, the thread inherits the initial config from the main thread.
+    - If ParallelExecutor is used, the thread inherits the initial config from its parent thread.
+    """
+
+    _instance = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance.lock = threading.Lock()  # maintained here for DSPy assertions.py
+        return cls._instance
+
+    def __getattr__(self, name):
+        overrides = getattr(thread_local_overrides, 'overrides', dotdict())
+        if name in overrides:
+            return overrides[name]
+        elif name in main_thread_config:
+            return main_thread_config[name]
+        else:
+            raise AttributeError(f"'Settings' object has no attribute '{name}'")
+
+    def __setattr__(self, name, value):
+        if name in ('_instance',):
+            super().__setattr__(name, value)
+        else:
+            self.configure(**{name: value})
+
+    # Dictionary-like access
+
+    def __getitem__(self, key):
+        return self.__getattr__(key)
+
+    def __setitem__(self, key, value):
+        self.__setattr__(key, value)
+
+    def __contains__(self, key):
+        overrides = getattr(thread_local_overrides, 'overrides', dotdict())
+        return key in overrides or key in main_thread_config
+
+    def get(self, key, default=None):
+        try:
+            return self[key]
+        except AttributeError:
+            return default
+
+    def copy(self):
+        overrides = getattr(thread_local_overrides, 'overrides', dotdict())
+        return dotdict({**main_thread_config, **overrides})
+
+    @property
+    def config(self):
+        config = self.copy()
+        if 'lock' in config:
+            del config['lock']
+        return config
+
+    # Configuration methods
+
+    def configure(self, **kwargs):
+        global main_thread_config
+
+        # Get or initialize thread-local overrides
+        overrides = getattr(thread_local_overrides, 'overrides', dotdict())
+        thread_local_overrides.overrides = dotdict(
+            {**copy.deepcopy(DEFAULT_CONFIG), **main_thread_config, **overrides, **kwargs}
+        )
+
+        # Update main_thread_config, in the main thread only
+        if threading.current_thread() is threading.main_thread():
+            main_thread_config = thread_local_overrides.overrides
+
+    @contextmanager
+    def context(self, **kwargs):
+        """Context manager for temporary configuration changes."""
+        global main_thread_config
+        original_overrides = getattr(thread_local_overrides, 'overrides', dotdict()).copy()
+        original_main_thread_config = main_thread_config.copy()
+
+        self.configure(**kwargs)
+        try:
+            yield
+        finally:
+            thread_local_overrides.overrides = original_overrides
+
+            if threading.current_thread() is threading.main_thread():
+                main_thread_config = original_main_thread_config
+
+    def __repr__(self):
+        overrides = getattr(thread_local_overrides, 'overrides', dotdict())
+        combined_config = {**main_thread_config, **overrides}
+        return repr(combined_config)
+
+
+settings = Settings()
diff --git a/dspy/__init__.py b/dspy/__init__.py
index ce92241b9..abe30cbd8 100644
--- a/dspy/__init__.py
+++ b/dspy/__init__.py
@@ -5,7 +5,6 @@
 from dspy.teleprompt import *
 
 import dspy.retrievers
-import dspy.teleprompt
 
 from dspy.evaluate import Evaluate  # isort: skip
 from dspy.clients import *  # isort: skip
@@ -25,6 +24,7 @@
 configure = settings.configure
 context = settings.context
 
+import dspy.teleprompt
 
 LabeledFewShot = dspy.teleprompt.LabeledFewShot
 BootstrapFewShot = dspy.teleprompt.BootstrapFewShot
@@ -36,4 +36,5 @@
 BetterTogether = dspy.teleprompt.BetterTogether
 COPRO = dspy.teleprompt.COPRO
 MIPROv2 = dspy.teleprompt.MIPROv2
+MIPROv2KNN = dspy.teleprompt.MIPROv2KNN
 Ensemble = dspy.teleprompt.Ensemble
diff --git a/dspy/clients/__init__.py b/dspy/clients/__init__.py
index fbeec22e2..1ebf61912 100644
--- a/dspy/clients/__init__.py
+++ b/dspy/clients/__init__.py
@@ -1,11 +1,13 @@
-from dspy.clients.lm import LM
-from dspy.clients.provider import Provider, TrainingJob
-from dspy.clients.base_lm import BaseLM, inspect_history
-from dspy.clients.embedding import Embedder
-import litellm
-import os
 from pathlib import Path
+import os
+
 from litellm.caching import Cache
+import litellm
+
+from dspy.clients.base_lm import BaseLM, inspect_history
+from dspy.clients.embedding import Embedder
+from dspy.clients.lm import LM
+from dspy.clients.provider import Provider, TrainingJob
 
 DISK_CACHE_DIR = os.environ.get("DSPY_CACHEDIR") or os.path.join(Path.home(), ".dspy_cache")
 DISK_CACHE_LIMIT = int(os.environ.get("DSPY_CACHE_LIMIT", 3e10))  # 30 GB default
diff --git a/dspy/clients/embedding.py b/dspy/clients/embedding.py
index ec7c1174e..f8e8922d2 100644
--- a/dspy/clients/embedding.py
+++ b/dspy/clients/embedding.py
@@ -1,6 +1,9 @@
+from typing import Callable, List, Optional, Union
 import litellm
 import numpy as np
 
+from .lm import request_cache
+
 
 class Embedder:
     """DSPy embedding class.
@@ -56,13 +59,28 @@ def my_embedder(texts):
         ```
     """

-    def __init__(self, model, batch_size=200, caching=True, **kwargs):
+    def __init__(self, model: Union[str, Callable], batch_size=200, **kwargs):
+        if not isinstance(model, str) and not callable(model):
+            raise ValueError(f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(model)}.")
+
         self.model = model
         self.batch_size = batch_size
-        self.caching = caching
         self.default_kwargs = kwargs

-    def __call__(self, inputs, batch_size=None, caching=None, **kwargs):
+    def _embed(self, inputs: List[str], cache: bool, **kwargs):
+        if callable(self.model):
+            return self.model(inputs, **kwargs)
+
+        response = litellm_embedding({"model": self.model, "input": inputs, **kwargs}).data
+        return [data["embedding"] for data in response]
+
+    def __call__(
+        self,
+        inputs: Union[str, List[str]],
+        batch_size: Optional[int] = None,
+        cache: Optional[bool] = None,
+        **kwargs,
+    ) -> np.ndarray:
         """Compute embeddings for the given inputs.

         Args:
@@ -76,46 +94,29 @@ def __call__(self, inputs, batch_size=None, caching=None, **kwargs):

             If the input is a list of strings, returns a 2D numpy array of embeddings, one embedding per row.
         """
-        if isinstance(inputs, str):
-            is_single_input = True
+        multi_input = isinstance(inputs, list)
+        if not multi_input:
             inputs = [inputs]
-        else:
-            is_single_input = False

         assert all(isinstance(inp, str) for inp in inputs), "All inputs must be strings."

-        if batch_size is None:
-            batch_size = self.batch_size
-        if caching is None:
-            caching = self.caching
-
-        merged_kwargs = self.default_kwargs.copy()
-        merged_kwargs.update(kwargs)
-
-        embeddings_list = []
-
-        def chunk(inputs_list, size):
-            for i in range(0, len(inputs_list), size):
-                yield inputs_list[i : i + size]
-
-        for batch_inputs in chunk(inputs, batch_size):
-            if isinstance(self.model, str):
-                embedding_response = litellm.embedding(
-                    model=self.model, input=batch_inputs, caching=caching, **merged_kwargs
-                )
-                batch_embeddings = [data["embedding"] for data in embedding_response.data]
-            elif callable(self.model):
-                batch_embeddings = self.model(batch_inputs, **merged_kwargs)
-            else:
-                raise ValueError(
-                    f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(self.model)}."
-                )
-
-            embeddings_list.extend(batch_embeddings)
-
-        embeddings = np.array(embeddings_list, dtype=np.float32)
-
-        if is_single_input:
-            return embeddings[0]
-        else:
-            return embeddings
+        batch_size = batch_size or self.batch_size
+        kwargs = {**self.default_kwargs, **kwargs}
+
+        embeddings = flatten([self._embed(c, cache, **kwargs) for c in chunk(inputs, batch_size)])
+        embeddings = embeddings if multi_input else embeddings[0]
+        return np.array(embeddings, dtype=np.float32)
+
+
+def chunk(inputs_list, size):
+    for i in range(0, len(inputs_list), size):
+        yield inputs_list[i : i + size]
+
+
+def flatten(list_of_lists):
+    return [item for sublist in list_of_lists for item in sublist]
+
+
+@request_cache(maxsize=None)
+def litellm_embedding(request):
+    return litellm.embedding(**request, cache={"no-cache": False, "no-store": False})
diff --git a/dspy/evaluate/auto_evaluation.py b/dspy/evaluate/auto_evaluation.py
index 001a9043e..ff600dba0 100644
--- a/dspy/evaluate/auto_evaluation.py
+++ b/dspy/evaluate/auto_evaluation.py
@@ -45,13 +45,15 @@ def __init__(self, threshold=0.66, decompositional=False):
         self.module = dspy.ChainOfThought(SemanticRecallPrecision)

     def forward(self, example, pred, trace=None):
-        scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
+        ground_truth = example.response if hasattr(example, "response") else getattr(example, "answer", None)
+        system_response = pred.response if hasattr(pred, "response") else getattr(pred, "answer", None)
+
+        scores = self.module(question=example.question, ground_truth=ground_truth, system_response=system_response)
         score = f1_score(scores.precision, scores.recall)
         return score if trace is None else score >= self.threshold

-

 ###########
@@ -70,7 +72,6 @@ class AnswerCompleteness(dspy.Signature):
     completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")


-
 class AnswerGroundedness(dspy.Signature):
     """
     Estimate the groundedness of a system's responses, against real retrieved documents written by people.
@@ -81,9 +82,13 @@ class AnswerGroundedness(dspy.Signature):
     question: str = dspy.InputField()
     retrieved_context: str = dspy.InputField()
     system_response: str = dspy.InputField()
-    system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
+    system_response_claims: str = dspy.OutputField(
+        desc="enumeration of non-trivial or check-worthy claims in the system response"
+    )
     discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context")
-    groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")
+    groundedness: float = dspy.OutputField(
+        desc="fraction (out of 1.0) of system response supported by the retrieved context"
+    )


 class CompleteAndGrounded(dspy.Module):
@@ -93,8 +98,12 @@ def __init__(self, threshold=0.66):
         self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness)

     def forward(self, example, pred, trace=None):
-        completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
-        groundedness = self.groundedness_module(question=example.question, retrieved_context=pred.context, system_response=pred.response)
+        completeness = self.completeness_module(
+            question=example.question, ground_truth=example.response, system_response=pred.response
+        )
+        groundedness = self.groundedness_module(
+            question=example.question, retrieved_context=pred.context, system_response=pred.response
+        )
         score = f1_score(groundedness.groundedness, completeness.completeness)
         return score if trace is None else score >= self.threshold

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 9ef0bf733..af8e17999 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -38,9 +38,6 @@ def HTML(x: str) -> str:
 logger = logging.getLogger(__name__)


-logger = logging.getLogger(__name__)
-
-
 class Evaluate:
     def __init__(
         self,
diff --git a/dspy/evaluate/metrics.py b/dspy/evaluate/metrics.py
index 4cce6adfd..b3a50e592 100644
--- a/dspy/evaluate/metrics.py
+++ b/dspy/evaluate/metrics.py
@@ -5,6 +5,7 @@ def _passage_match(passages: list[str], answers: list[str]) -> bool:
     """Returns True if any of the passages contains the answer."""
     from dspy.dsp.utils import passage_has_answers
+
     return any(passage_has_answers(psg, answers) for psg in passages)


@@ -24,13 +25,34 @@ def answer_exact_match(example, pred, trace=None, frac=1.0):
         return _answer_match(pred.answer, [example.answer], frac=frac)
     elif isinstance(example.answer, list):
         return _answer_match(pred.answer, example.answer, frac=frac)
-
+
+    raise ValueError(f"Invalid answer type: {type(example.answer)}")

-def answer_passage_match(example, pred, trace=None):
+
+def answer_passage_match(example, pred, trace=None):
     if isinstance(example.answer, str):
         return _passage_match(pred.context, [example.answer])
     elif isinstance(example.answer, list):
         return _passage_match(pred.context, example.answer)
-
+
+    raise ValueError(f"Invalid answer type: {type(example.answer)}")
+
+
+def answer_exact_match_and_semantic(example, pred, trace=None, frac=1.0, threshold=0.95):
+    """
+    Combines exact match and semantic F1 score checks.
+    Returns True if either exact match succeeds or semantic F1 score is above threshold.
+    """
+    # Check exact match first
+    exact_match = answer_exact_match(example, pred, trace=trace, frac=frac)
+
+    if exact_match:
+        return True
+
+    # If no exact match, check semantic similarity
+    from dspy.evaluate import SemanticF1
+
+    semantic_f1 = SemanticF1(threshold=threshold)
+    semantic_score = semantic_f1(example, pred, trace=True)
+
+    return semantic_score
diff --git a/dspy/predict/knn.py b/dspy/predict/knn.py
index f59f4f474..58ab06017 100644
--- a/dspy/predict/knn.py
+++ b/dspy/predict/knn.py
@@ -1,8 +1,19 @@
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, TYPE_CHECKING
+
 import numpy as np

+if TYPE_CHECKING:
+    import dspy
+

 class KNN:
-    def __init__(self, k: int, trainset: list, vectorizer=None):
+    k: int
+    trainset: List["dspy.Example"]
+    trainset_vectors: np.ndarray
+    embedding: "dspy.Embedder"
+
+    def __init__(self, k: int, trainset: list, vectorizer=None, lazy=True):
         """
         A k-nearest neighbors retriever that finds similar examples from a training set.

@@ -23,15 +34,37 @@ def __init__(self, k: int, trainset: list, vectorizer=None):
         self.k = k
         self.trainset = trainset
         self.embedding = vectorizer or dspy.Embedder(dsp.SentenceTransformersVectorizer())
-        trainset_casted_to_vectorize = [
-            " | ".join([f"{key}: {value}" for key, value in example.items() if key in example._input_keys])
-            for example in self.trainset
-        ]
-        self.trainset_vectors = self.embedding(trainset_casted_to_vectorize).astype(np.float32)
+        if not lazy:
+            self.embed()
+
+    def _prepare_example(self, example: "dspy.Example"):
+        return " | ".join([f"{key}: {value}" for key, value in example.items() if key in example._input_keys])
+
+    def embed(self):
+        if not hasattr(self, "trainset_vectors"):
+            self.trainset_vectors = self.embedding(list(map(self._prepare_example, self.trainset))).astype(np.float32)
+
+    def add(self, *examples: "dspy.Example"):
+        self.trainset.extend(examples)
+        self.trainset_vectors = np.concatenate(
+            [self.trainset_vectors, self.embedding(list(map(self._prepare_example, examples))).astype(np.float32)]
+        )

     def __call__(self, **kwargs) -> list:
+        if not hasattr(self, "trainset_vectors"):
+            self.embed()
         input_example_vector = self.embedding([" | ".join([f"{key}: {val}" for key, val in kwargs.items()])])
         scores = np.dot(self.trainset_vectors, input_example_vector.T).squeeze()
         nearest_samples_idxs = scores.argsort()[-self.k :][::-1]
         train_sampled = [self.trainset[cur_idx] for cur_idx in nearest_samples_idxs]
         return train_sampled
+
+
+def load_knn_embeddings(program, num_threads=6):
+    knns = [p.retrieve_demos for p in program.predictors() if hasattr(p, "retrieve_demos")]
+
+    def do_hydrate(knn):
+        knn.embed()
+
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        list(executor.map(do_hydrate, knns))
diff --git a/dspy/predict/predict.py b/dspy/predict/predict.py
index 51917e291..cf07c41a1 100644
--- a/dspy/predict/predict.py
+++ b/dspy/predict/predict.py
@@ -1,7 +1,7 @@
 from functools import lru_cache
 import random
-
 import logging
+from typing import Optional

 from pydantic import BaseModel

@@ -139,7 +139,6 @@ def forward(self, **kwargs):
         # Extract the three privileged keyword arguments.
         assert "new_signature" not in kwargs, "new_signature is no longer a valid keyword argument."
         signature = ensure_signature(kwargs.pop("signature", self.signature))
-        demos = kwargs.pop("demos", self.demos)
         config = dict(**self.config, **kwargs.pop("config", {}))

         # Get the right LM to use.
@@ -164,9 +163,7 @@ def forward(self, **kwargs): import dspy - if hasattr(self, "retrieve_demos") and callable(self.retrieve_demos): - demos = demos[:] + self.retrieve_demos(**inputs) - random.Random(self.random_seed).shuffle(demos) + demos = self.demos_for(inputs, static=None) adapter = dspy.settings.adapter or dspy.ChatAdapter() completions = adapter(lm, lm_kwargs=config, signature=signature, demos=demos, inputs=kwargs) @@ -188,6 +185,13 @@ def get_config(self): def __repr__(self): return f"{self.__class__.__name__}({self.signature})" + def demos_for(self, inputs, static: Optional[list] = None): + demos = static or self.demos + if hasattr(self, "retrieve_demos") and callable(self.retrieve_demos): + demos = demos[:] + self.retrieve_demos(**inputs) + random.Random(self.random_seed).shuffle(demos) + return demos + # TODO: get some defaults during init from the context window? # # TODO: FIXME: Hmm, I guess expected behavior is that contexts can diff --git a/dspy/propose/grounded_proposer.py b/dspy/propose/grounded_proposer.py index 15ef13130..484922bf8 100644 --- a/dspy/propose/grounded_proposer.py +++ b/dspy/propose/grounded_proposer.py @@ -1,30 +1,39 @@ import random +import logging import dspy from dspy.propose.dataset_summary_generator import create_dataset_summary -from dspy.propose.utils import create_example_string, create_predictor_level_history_string, strip_prefix, get_dspy_source_code +from dspy.propose.utils import ( + create_example_string, + create_predictor_level_history_string, + strip_prefix, + get_dspy_source_code, +) from dspy.teleprompt.utils import get_signature, get_prompt_model +from dspy.utils.parallelizer import ParallelExecutor from dspy.propose.propose_base import Proposer +logger = logging.getLogger(__name__) + # Hardcoded variables (TODO: update) MAX_INSTRUCT_IN_HISTORY = 5 # 10 TIPS = { - "none": "", - "creative": "Don't be afraid to be creative when creating the new instruction!", - "simple": "Keep the instruction clear and concise.", - "description": "Make sure your instruction is very informative and descriptive.", - "high_stakes": "The instruction should include a high stakes scenario in which the LM must solve the task!", - "persona": 'Include a persona that is relevant to the task in the instruction (ie. "You are a ...")', - } + "none": "", + "creative": "Don't be afraid to be creative when creating the new instruction!", + "simple": "Keep the instruction clear and concise.", + "description": "Make sure your instruction is very informative and descriptive.", + "high_stakes": "The instruction should include a high stakes scenario in which the LM must solve the task!", + "persona": 'Include a persona that is relevant to the task in the instruction (ie. "You are a ...")', +} ### SIGNATURES USED TO HELP WITH INSTRUCTION GENERATION ### + class DescribeProgram(dspy.Signature): - ( - """Below is some pseudo-code for a pipeline that solves tasks with calls to language models. Please describe what type of task this program appears to be designed to solve, and how it appears to work.""" - ) + """Below is some pseudo-code for a pipeline that solves tasks with calls to language models. 
Please describe what type of task this program appears to be designed to solve, and how it appears to work.""" + program_code = dspy.InputField( format=str, desc="Pseudocode for a language model program designed to solve a particular task.", @@ -42,9 +51,8 @@ class DescribeProgram(dspy.Signature): class DescribeModule(dspy.Signature): - ( - """Below is some pseudo-code for a pipeline that solves tasks with calls to language models. Please describe the purpose of one of the specified module in this pipeline.""" - ) + """Below is some pseudo-code for a pipeline that solves tasks with calls to language models. Please describe the purpose of one of the specified module in this pipeline.""" + program_code = dspy.InputField( format=str, desc="Pseudocode for a language model program designed to solve a particular task.", @@ -60,7 +68,8 @@ class DescribeModule(dspy.Signature): prefix="SUMMARY OF PROGRAM ABOVE:", ) module = dspy.InputField( - desc="The module in the program that we want to describe.", prefix="MODULE:", + desc="The module in the program that we want to describe.", + prefix="MODULE:", ) module_description = dspy.OutputField( desc="Description of the module's role in the broader program.", @@ -76,9 +85,8 @@ def generate_instruction_class( use_tip=True, ): class GenerateSingleModuleInstruction(dspy.Signature): - ( - """Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.""" - ) + """Use the information below to learn about a task that we are trying to solve using calls to an LM, then generate a new instruction that will be used to prompt a Language Model to better solve the task.""" + if use_dataset_summary: dataset_description = dspy.InputField( desc="A description of the dataset that we are using.", @@ -95,7 +103,8 @@ class GenerateSingleModuleInstruction(dspy.Signature): prefix="PROGRAM DESCRIPTION:", ) module = dspy.InputField( - desc="The module to create an instruction for.", prefix="MODULE:", + desc="The module to create an instruction for.", + prefix="MODULE:", ) task_demos = dspy.InputField( format=str, @@ -109,7 +118,9 @@ class GenerateSingleModuleInstruction(dspy.Signature): prefix="PREVIOUS INSTRUCTIONS:", ) basic_instruction = dspy.InputField( - format=str, desc="Basic instruction.", prefix="BASIC INSTRUCTION:", + format=str, + desc="Basic instruction.", + prefix="BASIC INSTRUCTION:", ) if use_tip: tip = dspy.InputField( @@ -124,8 +135,10 @@ class GenerateSingleModuleInstruction(dspy.Signature): return dspy.Predict(GenerateSingleModuleInstruction) + ### CLASS RESPONSIBLE FOR GENERATING A NEW INSTRUCTION, USING THE HELPER SIGNATURES ABOVE ### + class GenerateModuleInstruction(dspy.Module): def __init__( self, @@ -168,21 +181,23 @@ def forward( tip=None, ): # Construct full program demo or single module demo depending on whether or not we're using the full program - task_demos = "" basic_instruction = get_signature(program.predictors()[pred_i]).instructions curr_demos_num = 0 - + if self.use_task_demos: + task_demos = [] for example in demo_candidates[pred_i][demo_set_i]: if "augmented" in example.keys(): fields_to_use = get_signature(program.predictors()[pred_i]).fields example_string = create_example_string(fields_to_use, example) - task_demos += f"{example_string}\n" + task_demos.append(example_string) curr_demos_num += 1 if curr_demos_num >= max_demos: break else: - task_demos = "No task demos provided." 
+ task_demos = ["No task demos provided."] + + task_demos = "\n".join(task_demos) # Summarize the program program_description = "Not available" @@ -192,24 +207,28 @@ def forward( try: program_description = strip_prefix( self.describe_program( - program_code=self.program_code_string, program_example=task_demos, + program_code=self.program_code_string, + program_example=task_demos, ).program_description, ) - if self.verbose: print(f"PROGRAM DESCRIPTION: {program_description}") + if self.verbose: + print(f"PROGRAM DESCRIPTION: {program_description}") inputs = [] outputs = [] for field_name, field in get_signature(program.predictors()[pred_i]).fields.items(): # Access the '__dspy_field_type' from the extra metadata - dspy_field_type = field.json_schema_extra.get('__dspy_field_type') - + dspy_field_type = field.json_schema_extra.get("__dspy_field_type") + # Based on the '__dspy_field_type', append to the respective list if dspy_field_type == "input": inputs.append(field_name) else: outputs.append(field_name) - module_code = f"{program.predictors()[pred_i].__class__.__name__}({', '.join(inputs)}) -> {', '.join(outputs)}" + module_code = ( + f"{program.predictors()[pred_i].__class__.__name__}({', '.join(inputs)}) -> {', '.join(outputs)}" + ) module_description = self.describe_module( program_code=self.program_code_string, @@ -219,11 +238,13 @@ def forward( max_depth=10, ).module_description except: - if self.verbose: print("Error getting program description. Running without program aware proposer.") + if self.verbose: + print("Error getting program description. Running without program aware proposer.") self.program_aware = False # Generate an instruction for our chosen module - if self.verbose: print(f"task_demos {task_demos}") + if self.verbose: + print(f"task_demos {task_demos}") instruct = self.generate_module_instruction( dataset_description=data_summary, program_code=self.program_code_string, @@ -237,13 +258,16 @@ def forward( ) if hasattr(instruct, "module_description"): module_description = strip_prefix(instruct.module_description) - if self.verbose: print(f"MODULE DESCRIPTION: {module_description}") + if self.verbose: + print(f"MODULE DESCRIPTION: {module_description}") proposed_instruction = strip_prefix(instruct.proposed_instruction) return dspy.Prediction(proposed_instruction=proposed_instruction) + ### CLASS USED TO GENERATE THE FULL SET OF INSTRUCTIONS GIVEN THE SPECIFIED CRITERIA ### + class GroundedProposer(Proposer): def __init__( self, @@ -259,7 +283,8 @@ def __init__( set_tip_randomly=True, set_history_randomly=True, verbose=False, - rng=None + rng=None, + num_threads=6, ): super().__init__() self.program_aware = program_aware @@ -267,10 +292,11 @@ def __init__( self.use_task_demos = use_task_demos self.use_instruct_history = use_instruct_history self.use_tip = use_tip - self.set_tip_randomly=set_tip_randomly - self.set_history_randomly=set_history_randomly + self.set_tip_randomly = set_tip_randomly + self.set_history_randomly = set_history_randomly self.verbose = verbose self.rng = rng or random + self.num_threads = num_threads self.prompt_model = get_prompt_model(prompt_model) @@ -278,22 +304,25 @@ def __init__( if self.program_aware: try: self.program_code_string = get_dspy_source_code(program) - if self.verbose: print("SOURCE CODE:",self.program_code_string) + if self.verbose: + logger.info("SOURCE CODE:", self.program_code_string) except Exception as e: - print(f"Error getting source code: {e}.\n\nRunning without program aware proposer.") + logger.error(f"Error getting 
source code: {e}.\n\nRunning without program aware proposer.") self.program_aware = False - self.data_summary = None + self.data_summary = None if self.use_dataset_summary: try: self.data_summary = create_dataset_summary( - trainset=trainset, view_data_batch_size=view_data_batch_size, prompt_model=prompt_model, + trainset=trainset, + view_data_batch_size=view_data_batch_size, + prompt_model=prompt_model, ) - if self.verbose: print(f"DATA SUMMARY: {self.data_summary}") + if self.verbose: + logger.info(f"DATA SUMMARY: {self.data_summary}") except Exception as e: - print(f"Error getting data summary: {e}.\n\nRunning without data aware proposer.") + logger.error(f"Error getting data summary: {e}.\n\nRunning without data aware proposer.\n") self.use_dataset_summary = False - print("") def propose_instructions_for_program( self, @@ -307,49 +336,56 @@ def propose_instructions_for_program( ): """This method is responsible for returning the full set of new instructions for our program, given the specified criteria.""" - proposed_instructions = {} + proposed_instructions = {} if self.set_history_randomly: # Randomly select whether or not we're using instruction history use_history = self.rng.random() < 0.5 self.use_instruct_history = use_history - if self.verbose: print(f"Use history T/F: {self.use_instruct_history}") + if self.verbose: + logging.info(f"Use history T/F: {self.use_instruct_history}") num_demos = max(len(demo_candidates[0]) if demo_candidates else N, 1) + num_predictors = len(program.predictors()) if not demo_candidates: - if self.verbose: print("No demo candidates provided. Running without task demos.") + if self.verbose: + logging.info("No demo candidates provided. Running without task demos.") self.use_task_demos = False - # Create an instruction for each predictor for pred_i, predictor in enumerate(program.predictors()): - for demo_set_i in range(num_demos): - if pred_i not in proposed_instructions: - proposed_instructions[pred_i] = [] + executor = ParallelExecutor( + num_threads=self.num_threads, + disable_progress_bar=False, + max_errors=num_predictors * num_demos // 5, + compare_results=False, + ) + + def generate_instructions_for_predictor(item): + demo_set_i, selected_tip = item if self.set_tip_randomly: - if self.verbose: print("Using a randomly generated configuration for our grounded proposer.") - # Randomly select the tip - selected_tip_key = self.rng.choice(list(TIPS.keys())) - selected_tip = TIPS[selected_tip_key] - self.use_tip = bool( - selected_tip, - ) - if self.verbose: print(f"Selected tip: {selected_tip_key}") - - proposed_instructions[pred_i].append( - self.propose_instruction_for_predictor( - program=program, - predictor=predictor, - pred_i=pred_i, - # prompt_model=prompt_model, - T=T, - demo_candidates=demo_candidates, - demo_set_i=demo_set_i, - trial_logs=trial_logs, - tip=selected_tip, - ), + if self.verbose: + print("Using a randomly generated configuration for our grounded proposer.") + if self.verbose: + print(f"Selected tip: {selected_tip}") + + return self.propose_instruction_for_predictor( + program=program, + predictor=predictor, + pred_i=pred_i, + # prompt_model=prompt_model, + T=T, + demo_candidates=demo_candidates, + demo_set_i=demo_set_i, + trial_logs=trial_logs, + tip=selected_tip, ) + logging.info(f"Generating instructions for predictor {pred_i} of {num_predictors}.") + tips = [TIPS[self.rng.choice(list(TIPS.keys()))] for _ in range(num_demos)] + items = list(zip(range(num_demos), tips)) + proposed_instructions[pred_i] = 
list(executor.execute(generate_instructions_for_predictor, items)) + return proposed_instructions def propose_instruction_for_predictor( @@ -368,7 +404,10 @@ def propose_instruction_for_predictor( # Create an instruction history string for our predictor instruction_history = create_predictor_level_history_string( - program, pred_i, trial_logs, MAX_INSTRUCT_IN_HISTORY, + program, + pred_i, + trial_logs, + MAX_INSTRUCT_IN_HISTORY, ) # Create our instruction generator class (given specific criteria for this round of proposal) @@ -379,7 +418,7 @@ def propose_instruction_for_predictor( use_task_demos=self.use_task_demos, use_instruct_history=self.use_instruct_history and instruction_history, use_tip=self.use_tip, - verbose=self.verbose + verbose=self.verbose, ) # Generate a new instruction for our predictor, using the temperature specified for this round @@ -399,7 +438,9 @@ def propose_instruction_for_predictor( self.prompt_model.kwargs["temperature"] = original_temp # Log the trace used to generate the new instruction, along with the new instruction itself - if self.verbose: self.prompt_model.inspect_history(n=1) - if self.verbose: print(f"PROPOSED INSTRUCTION: {proposed_instruction}") + if self.verbose: + self.prompt_model.inspect_history(n=1) + if self.verbose: + print(f"PROPOSED INSTRUCTION: {proposed_instruction}") return strip_prefix(proposed_instruction) diff --git a/dspy/teleprompt/__init__.py b/dspy/teleprompt/__init__.py index 81ed01144..821a62320 100644 --- a/dspy/teleprompt/__init__.py +++ b/dspy/teleprompt/__init__.py @@ -9,7 +9,7 @@ # from .mipro_optimizer import MIPRO from dspy.teleprompt.mipro_optimizer_v2 import MIPROv2 from dspy.teleprompt.random_search import BootstrapFewShotWithRandomSearch, BootstrapKNNWithRandomSearch - +from dspy.teleprompt.mipro_optimizer_v2_knn import MIPROv2KNN # from .signature_opt import SignatureOptimizer # from .signature_opt_bayesian import BayesianSignatureOptimizer @@ -27,6 +27,7 @@ "Ensemble", "KNNFewShot", "MIPROv2", + "MIPROv2KNN", "BootstrapFewShotWithRandomSearch", "BootstrapKNNWithRandomSearch", "BootstrapFewShotWithOptuna", diff --git a/dspy/teleprompt/bootstrap.py b/dspy/teleprompt/bootstrap.py index ada83bc0d..bf1f0c6e4 100644 --- a/dspy/teleprompt/bootstrap.py +++ b/dspy/teleprompt/bootstrap.py @@ -1,3 +1,5 @@ +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor, Future, as_completed import logging import random import threading @@ -44,6 +46,7 @@ def __init__( max_labeled_demos=16, max_rounds=1, max_errors=5, + num_threads=6, ): """ A Teleprompter class that composes a set of demos/examples to go into a predictor's prompt. 
@@ -75,14 +78,16 @@ def __init__( self.max_labeled_demos = max_labeled_demos self.max_rounds = max_rounds self.max_errors = max_errors + + self.num_threads = num_threads + self.error_count = 0 - self.error_lock = threading.Lock() + self._lock = threading.Lock() def compile(self, student, *, teacher=None, trainset): self.trainset = trainset self._prepare_student_and_teacher(student, teacher) - self._prepare_predictor_mappings() self._bootstrap() self.student = self._train() @@ -106,10 +111,9 @@ def _prepare_student_and_teacher(self, student, teacher): teleprompter = LabeledFewShot(k=self.max_labeled_demos) self.teacher = teleprompter.compile(self.teacher.reset_copy(), trainset=self.trainset) - def _prepare_predictor_mappings(self): - name2predictor, predictor2name = {}, {} - student, teacher = self.student, self.teacher + self._assert_student_teacher_compatibility(self.student, self.teacher) + def _assert_student_teacher_compatibility(self, student, teacher): assert len(student.predictors()) == len( teacher.predictors(), ), "Student and teacher must have the same number of predictors." @@ -131,58 +135,78 @@ def _prepare_predictor_mappings(self): ) assert id(predictor1) != id(predictor2), "Student and teacher must be different objects." - name2predictor[name1] = None # dict(student=predictor1, teacher=predictor2) - predictor2name[id(predictor1)] = name1 - - # FIXME(shangyint): This is an ugly hack to bind traces of - # retry.module to retry - # if isinstance(predictor1, Retry): - # predictor2name[id(predictor1.module)] = name1 + def _prepare_predictor_mappings(self, student, teacher): + predictor2name = {} + for (name1, predictor1), (name2, predictor2) in zip(student.named_predictors(), teacher.named_predictors()): + predictor2name[id(predictor1)] = name1 predictor2name[id(predictor2)] = name2 - self.name2predictor = name2predictor - self.predictor2name = predictor2name + return predictor2name def _bootstrap(self, *, max_bootstraps=None): max_bootstraps = max_bootstraps or self.max_bootstrapped_demos - bootstrap_attempts = 0 + rounds_attempted = 0 + bootstrapped = set() + self.name2traces = defaultdict(list) - bootstrapped = {} - self.name2traces = {name: [] for name in self.name2predictor} + for round_idx in range(self.max_rounds): + rounds_attempted += 1 - for example_idx, example in enumerate(tqdm.tqdm(self.trainset)): - if len(bootstrapped) >= max_bootstraps: - break + progress_bar = tqdm.tqdm(total=len(self.trainset)) - for round_idx in range(self.max_rounds): - bootstrap_attempts += 1 + futures: dict[Future, int] = {} - if self._bootstrap_one_example(example, round_idx): - bootstrapped[example_idx] = True - break + with ThreadPoolExecutor(max_workers=self.num_threads) as executor: + for example_idx, example in enumerate(self.trainset): + f = executor.submit(self._bootstrap_one_example, example, round_idx) + futures[f] = example_idx - print( - f"Bootstrapped {len(bootstrapped)} full traces after {example_idx} examples " - f"for up to {self.max_rounds} rounds, amounting to {bootstrap_attempts} attempts." 
- ) + for f in as_completed(futures.keys()): + progress_bar.update(1) + + if f.cancelled(): + continue + + success, name2traces = f.result() + if not success: + continue + + bootstrapped.add(futures[f]) + for name, traces in name2traces.items(): + self.name2traces[name].extend(traces) + + if len(bootstrapped) >= max_bootstraps: + for f in futures: + if not f.done(): + f.cancel() + break + + progress_bar.close() + + if len(bootstrapped) >= max_bootstraps: + print( + f"Bootstrapped {len(bootstrapped)} full traces after {progress_bar.n} examples after {rounds_attempted} rounds." + ) + break # Unbootstrapped training examples self.validation = [x for idx, x in enumerate(self.trainset) if idx not in bootstrapped] random.Random(0).shuffle(self.validation) - self.validation = self.validation - # NOTE: Can't yet use evaluate because we need to trace *per example* # evaluate = Evaluate(program=self.teacher, metric=self.metric, num_threads=12) # score = evaluate(self.metric, display_table=False, display_progress=True) def _bootstrap_one_example(self, example, round_idx=0): - name2traces = {} # self.name2traces - teacher = self.teacher # .deepcopy() + teacher = self.teacher.deepcopy() + predictor2name = self._prepare_predictor_mappings(self.student, teacher) + predictor_cache = {} + trace = [] + try: with dspy.settings.context(trace=[], **self.teacher_settings): lm = dspy.settings.lm @@ -195,7 +219,7 @@ def _bootstrap_one_example(self, example, round_idx=0): predictor.demos = [x for x in predictor.demos if x != example] prediction = teacher(**example.inputs()) - trace = dspy.settings.trace + trace = dspy.settings.trace[:] for name, predictor in teacher.named_predictors(): predictor.demos = predictor_cache[name] @@ -210,48 +234,40 @@ def _bootstrap_one_example(self, example, round_idx=0): success = True except Exception as e: success = False - with self.error_lock: + with self._lock: self.error_count += 1 current_error_count = self.error_count if current_error_count >= self.max_errors: raise e logger.error(f"Failed to run or to evaluate example {example} with {self.metric} due to {e}.") - if success: - for step in trace: - predictor, inputs, outputs = step - demo = dspy.Example(augmented=True, **inputs, **outputs).with_inputs(*list(inputs.keys())) - - try: - predictor_name = self.predictor2name[id(predictor)] - except KeyError: - continue # FIXME: ! - - # # TODO: Look closer into this. It's a bit tricky to reproduce. - # print(f"Failed to find predictor {predictor} in {self.predictor2name}.") - # print( - # "Are you doing this in a notebook (Jupyter)? This might be caused by redefining values by rerunning cells.", - # ) - # print("Try restarting the notebook, or open an issue.") - # raise KeyError( - # f"Failed to find predictor {id(predictor)} {predictor} in {self.predictor2name}.", - # ) from e - - name2traces[predictor_name] = name2traces.get(predictor_name, []) - name2traces[predictor_name].append(demo) - - # Update the traces - for name, demos in name2traces.items(): - from datasets.fingerprint import Hasher - - # If there are multiple traces for the same predictor in the sample example, - # sample 50/50 from the first N-1 traces or the last trace. - if len(demos) > 1: - rng = random.Random(Hasher.hash(tuple(demos))) - demos = [rng.choice(demos[:-1]) if rng.random() < 0.5 else demos[-1]] - self.name2traces[name].extend(demos) - - return success + if not success: + return False, {} + + assert trace, "No trace found." 
+ + name2traces = defaultdict(list) + for step in trace: + predictor, inputs, outputs = step + demo = dspy.Example(augmented=True, **inputs, **outputs).with_inputs(*list(inputs.keys())) + + predictor_name = predictor2name[id(predictor)] + name2traces[predictor_name].append(demo) + + # Update the traces + final_demos = {} + for name, demos in name2traces.items(): + from datasets.fingerprint import Hasher + + # If there are multiple traces for the same predictor in the sample example, + # sample 50/50 from the first N-1 traces or the last trace. + if len(demos) > 1: + rng = random.Random(Hasher.hash(tuple(demos))) + demos = [rng.choice(demos[:-1]) if rng.random() < 0.5 else demos[-1]] + + final_demos[name] = demos + + return True, final_demos def _train(self): rng = random.Random(0) @@ -278,12 +294,13 @@ def __init__( teacher_settings: Optional[Dict] = None, max_bootstrapped_demos=64, num_static_demos=0, + num_threads=6, max_labeled_demos=16, max_rounds=1, max_errors=10, random_seed=0, ): - assert num_static_demos < max_labeled_demos, "static demos must be less than max labeled demos." + assert num_static_demos < max_labeled_demos, "static demos must be < max labeled demos." super().__init__( metric=metric, @@ -293,9 +310,10 @@ def __init__( max_labeled_demos=max_labeled_demos, max_rounds=max_rounds, max_errors=max_errors, + num_threads=num_threads, ) - self.num_static_demos = num_static_demos self.embedder = embedder + self.num_static_demos = num_static_demos self.random_seed = random_seed def _train(self): @@ -304,13 +322,16 @@ def _train(self): for name, predictor in self.student.named_predictors(): predictor.random_seed = self.random_seed + predictor.augmented_demos = self.name2traces[name][: self.max_bootstrapped_demos] - augmented_demos = self.name2traces[name] - - static_demos = rng.sample(augmented_demos, k=self.num_static_demos) + static_demos = rng.sample(predictor.augmented_demos, k=self.num_static_demos) predictor.demos = static_demos - dynamic_demos = [x for x in augmented_demos if x not in static_demos] - predictor.retrieve_demos = dspy.KNN(k=k, trainset=dynamic_demos, vectorizer=self.embedder) + dynamic_demos = [demo for demo in predictor.augmented_demos if demo not in static_demos] + predictor.retrieve_demos = dspy.KNN( + k=k, + trainset=dynamic_demos, + vectorizer=self.embedder, + ) return self.student diff --git a/dspy/teleprompt/mipro_optimizer_v2.py b/dspy/teleprompt/mipro_optimizer_v2.py index bd8eb48ad..3f445e855 100644 --- a/dspy/teleprompt/mipro_optimizer_v2.py +++ b/dspy/teleprompt/mipro_optimizer_v2.py @@ -62,13 +62,12 @@ def __init__( track_stats: bool = True, log_dir: Optional[str] = None, metric_threshold: Optional[float] = None, + logger: logging.Logger = logger, ): # Validate 'auto' parameter allowed_modes = {None, "light", "medium", "heavy"} if auto not in allowed_modes: - raise ValueError( - f"Invalid value for auto: {auto}. Must be one of {allowed_modes}." - ) + raise ValueError(f"Invalid value for auto: {auto}. 
Must be one of {allowed_modes}.") self.auto = auto self.num_candidates = num_candidates @@ -89,6 +88,7 @@ def __init__( self.metric_threshold = metric_threshold self.seed = seed self.rng = None + self.logger = logger def compile( self, @@ -125,9 +125,7 @@ def compile( trainset, valset = self._set_and_validate_datasets(trainset, valset) # Set hyperparameters based on run mode (if set) - zeroshot_opt = (self.max_bootstrapped_demos == 0) and ( - self.max_labeled_demos == 0 - ) + zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0) num_trials, valset, minibatch = self._set_hyperparams_from_run_mode( student, num_trials, minibatch, zeroshot_opt, valset ) @@ -136,9 +134,7 @@ def compile( self._print_auto_run_settings(num_trials, minibatch, valset) if minibatch and minibatch_size > len(valset): - raise ValueError( - f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}." - ) + raise ValueError(f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}.") # Estimate LM calls and get user confirmation if requires_permission_to_run: @@ -151,7 +147,7 @@ def compile( valset, program_aware_proposer, ): - logger.info("Compilation aborted by the user.") + self.logger.info("Compilation aborted by the user.") return student # Return the original student program # Initialize program and evaluator @@ -199,10 +195,8 @@ def compile( ) return best_program - - def _set_random_seeds(self, - seed - ): + + def _set_random_seeds(self, seed): self.rng = random.Random(seed) np.random.seed(seed) @@ -225,9 +219,7 @@ def _set_hyperparams_from_run_mode( num_trials = auto_settings["num_trials"] valset = create_minibatch(valset, batch_size=auto_settings["val_size"], rng=self.rng) minibatch = len(valset) > MIN_MINIBATCH_SIZE - self.num_candidates = int( - np.round(np.min([num_trials * num_vars, (1.5 * num_trials) / num_vars])) - ) + self.num_candidates = int(round(min(num_trials * num_vars, 1.5 * num_trials / num_vars))) return num_trials, valset, minibatch @@ -237,9 +229,7 @@ def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]): if valset is None: if len(trainset) < 2: - raise ValueError( - "Trainset must have at least 2 examples if no valset specified." 
- ) + raise ValueError("Trainset must have at least 2 examples if no valset specified.") valset_size = min(1000, max(1, int(len(trainset) * 0.80))) cutoff = len(trainset) - valset_size valset = trainset[cutoff:] @@ -251,7 +241,7 @@ def _set_and_validate_datasets(self, trainset: List, valset: Optional[List]): return trainset, valset def _print_auto_run_settings(self, num_trials: int, minibatch: bool, valset: List): - logger.info( + self.logger.info( f"\nRUNNING WITH THE FOLLOWING {self.auto.upper()} AUTO RUN SETTINGS:" f"\nnum_trials: {num_trials}" f"\nminibatch: {minibatch}" @@ -275,9 +265,7 @@ def _estimate_lm_calls( estimated_prompt_model_calls = ( 10 # Data summarizer calls + self.num_candidates * num_predictors # Candidate generation - + ( - num_predictors + 1 if program_aware_proposer else 0 - ) # Program-aware proposer + + (num_predictors + 1 if program_aware_proposer else 0) # Program-aware proposer ) prompt_model_line = ( f"{YELLOW}- Prompt Generation: {BLUE}{BOLD}10{ENDC}{YELLOW} data summarizer calls + " @@ -297,9 +285,7 @@ def _estimate_lm_calls( ) else: full_eval_steps = num_trials // minibatch_full_eval_steps + 1 - estimated_task_model_calls = ( - minibatch_size * num_trials + len(valset) * full_eval_steps - ) + estimated_task_model_calls = minibatch_size * num_trials + len(valset) * full_eval_steps task_model_line = ( f"{YELLOW}- Program Evaluation: {BLUE}{BOLD}{minibatch_size}{ENDC}{YELLOW} examples in minibatch * " f"{BLUE}{BOLD}{num_trials}{ENDC}{YELLOW} batches + " @@ -341,7 +327,7 @@ def _get_user_confirmation( {YELLOW}{BOLD}Estimated Cost Calculation:{ENDC} - {YELLOW}Total Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) + {YELLOW}Total Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token) + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model Price per Output Token).{ENDC} For a preliminary estimate of potential costs, we recommend you perform your own calculations based on the task @@ -362,24 +348,21 @@ def _get_user_confirmation( """ ) - user_input = input( - f"{user_message}\n{user_confirmation_message}\n" - "Do you wish to continue? (y/n): " - ).strip().lower() + user_input = ( + input(f"{user_message}\n{user_confirmation_message}\n" "Do you wish to continue? 
(y/n): ").strip().lower() + ) return user_input == "y" - def _bootstrap_fewshot_examples( - self, program: Any, trainset: List, seed: int, teacher: Any - ) -> Optional[List]: - logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") + def _bootstrap_fewshot_examples(self, program: Any, trainset: List, seed: int, teacher: Any) -> Optional[List]: + self.logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") if self.max_bootstrapped_demos > 0: - logger.info( + self.logger.info( "These will be used as few-shot example candidates for our program and for creating instructions.\n" ) else: - logger.info("These will be used for informing instruction proposal.\n") + self.logger.info("These will be used for informing instruction proposal.\n") - logger.info(f"Bootstrapping N={self.num_candidates} sets of demonstrations...") + self.logger.info(f"Bootstrapping N={self.num_candidates} sets of demonstrations...") zeroshot = self.max_bootstrapped_demos == 0 and self.max_labeled_demos == 0 @@ -388,15 +371,9 @@ def _bootstrap_fewshot_examples( student=program, num_candidate_sets=self.num_candidates, trainset=trainset, - max_labeled_demos=( - LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT - if zeroshot - else self.max_labeled_demos - ), + max_labeled_demos=(LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_labeled_demos), max_bootstrapped_demos=( - BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT - if zeroshot - else self.max_bootstrapped_demos + BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_bootstrapped_demos ), metric=self.metric, max_errors=self.max_errors, @@ -405,10 +382,11 @@ def _bootstrap_fewshot_examples( seed=seed, metric_threshold=self.metric_threshold, rng=self.rng, + num_threads=self.num_threads, ) except Exception as e: - logger.info(f"Error generating few-shot examples: {e}") - logger.info("Running without few-shot examples.") + self.logger.info(f"Error generating few-shot examples: {e}") + self.logger.info("Running without few-shot examples.") demo_candidates = None return demo_candidates @@ -424,8 +402,8 @@ def _propose_instructions( tip_aware_proposer: bool, fewshot_aware_proposer: bool, ) -> Dict[int, List[str]]: - logger.info("\n==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==") - logger.info( + self.logger.info("\n==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==") + self.logger.info( "We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions." 
) @@ -442,10 +420,11 @@ def _propose_instructions( use_instruct_history=False, set_history_randomly=False, verbose=self.verbose, - rng=self.rng + rng=self.rng, + num_threads=self.num_threads, ) - logger.info("\nProposing instructions...\n") + self.logger.info("\nProposing instructions...\n") instruction_candidates = proposer.propose_instructions_for_program( trainset=trainset, program=program, @@ -456,11 +435,11 @@ def _propose_instructions( ) for i, pred in enumerate(program.predictors()): - logger.info(f"Proposed Instructions for Predictor {i}:\n") + self.logger.info(f"Proposed Instructions for Predictor {i}:\n") instruction_candidates[i][0] = get_signature(pred).instructions for j, instruction in enumerate(instruction_candidates[i]): - logger.info(f"{j}: {instruction}\n") - logger.info("\n") + self.logger.info(f"{j}: {instruction}\n") + self.logger.info("\n") return instruction_candidates @@ -477,15 +456,13 @@ def _optimize_prompt_parameters( minibatch_full_eval_steps: int, seed: int, ) -> Optional[Any]: - logger.info("Evaluating the default program...\n") + self.logger.info("Evaluating the default program...\n") default_score = eval_candidate_program(len(valset), valset, program, evaluate, self.rng) - logger.info(f"Default program score: {default_score}\n") + self.logger.info(f"Default program score: {default_score}\n") trial_logs = {} trial_logs[-1] = {} - trial_logs[-1]["full_eval_program_path"] = save_candidate_program( - program, self.log_dir, -1 - ) + trial_logs[-1]["full_eval_program_path"] = save_candidate_program(program, self.log_dir, -1) trial_logs[-1]["full_eval_score"] = default_score trial_logs[-1]["total_eval_calls_so_far"] = len(valset) trial_logs[-1]["full_eval_program"] = program.deepcopy() @@ -494,7 +471,7 @@ def _optimize_prompt_parameters( best_score = default_score best_program = program.deepcopy() total_eval_calls = len(valset) - score_data= [(best_score, program.deepcopy(), True)] + score_data = [(best_score, program.deepcopy(), True)] param_score_dict = defaultdict(list) fully_evaled_param_combos = {} @@ -504,9 +481,9 @@ def objective(trial): trial_num = trial.number + 1 if minibatch: - logger.info(f"== Minibatch Trial {trial_num} / {num_trials} ==") + self.logger.info(f"== Minibatch Trial {trial_num} / {num_trials} ==") else: - logger.info(f"===== Trial {trial_num} / {num_trials} =====") + self.logger.info(f"===== Trial {trial_num} / {num_trials} =====") trial_logs[trial_num] = {} @@ -525,24 +502,22 @@ def objective(trial): # Log assembled program if self.verbose: - logger.info("Evaluating the following candidate program...\n") + self.logger.info("Evaluating the following candidate program...\n") print_full_program(candidate_program) # Evaluate the candidate program (on minibatch if minibatch=True) batch_size = minibatch_size if minibatch else len(valset) - score = eval_candidate_program( - batch_size, valset, candidate_program, evaluate, self.rng - ) + score = eval_candidate_program(batch_size, valset, candidate_program, evaluate, self.rng) total_eval_calls += batch_size # Update best score and program if not minibatch and score > best_score: best_score = score best_program = candidate_program.deepcopy() - logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}") + self.logger.info(f"{GREEN}Best full score so far!{ENDC} Score: {score}") # Log evaluation results - score_data.append((score, candidate_program, batch_size >= len(valset))) # score, prog, full_eval + score_data.append((score, candidate_program, batch_size >= len(valset))) # score, prog, 
full_eval if minibatch: self._log_minibatch_eval( score, @@ -559,7 +534,18 @@ def objective(trial): ) else: self._log_normal_eval( - score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls + score, + best_score, + chosen_params, + score_data, + trial, + num_trials, + trial_logs, + trial_num, + valset, + batch_size, + candidate_program, + total_eval_calls, ) categorical_key = ",".join(map(str, chosen_params)) param_score_dict[categorical_key].append( @@ -567,10 +553,7 @@ def objective(trial): ) # If minibatch, perform full evaluation at intervals - if minibatch and ( - (trial_num % minibatch_full_eval_steps == 0) - or (trial_num == num_trials) - ): + if minibatch and ((trial_num % minibatch_full_eval_steps == 0) or (trial_num == num_trials)): best_score, best_program, total_eval_calls = self._perform_full_evaluation( trial_num, param_score_dict, @@ -588,8 +571,8 @@ def objective(trial): # Run optimization optuna.logging.set_verbosity(optuna.logging.WARNING) - logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") - logger.info( + self.logger.info("==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==") + self.logger.info( "We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n" ) @@ -605,11 +588,13 @@ def objective(trial): best_program.total_calls = self.total_calls sorted_candidate_programs = sorted(score_data, key=lambda x: x[0], reverse=True) # Attach all minibatch programs - best_program.mb_candidate_programs = [score_data for score_data in sorted_candidate_programs if not score_data[2]] + best_program.mb_candidate_programs = [ + score_data for score_data in sorted_candidate_programs if not score_data[2] + ] # Attach all programs that were evaluated on the full trainset, in descending order of score best_program.candidate_programs = [score_data for score_data in sorted_candidate_programs if score_data[2]] - logger.info(f"Returning best identified program with score {best_score}!") + self.logger.info(f"Returning best identified program with score {best_score}!") return best_program @@ -627,26 +612,32 @@ def _log_minibatch_eval( candidate_program, total_eval_calls, ): - trial_logs[trial_num]["mb_program_path"] = save_candidate_program( - candidate_program, self.log_dir, trial_num - ) + trial_logs[trial_num]["mb_program_path"] = save_candidate_program(candidate_program, self.log_dir, trial_num) trial_logs[trial_num]["mb_score"] = score trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls trial_logs[trial_num]["mb_program"] = candidate_program.deepcopy() - logger.info( - f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}." 
- ) - logger.info(f"Minibatch scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if not s[2]]) +']'}") + self.logger.info(f"Score: {score} on minibatch of size {batch_size} with parameters {chosen_params}.") + self.logger.info(f"Minibatch scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if not s[2]]) +']'}") trajectory = "[" + ", ".join([f"{s[0]}" for s in score_data if s[2]]) + "]" - logger.info(f"Full eval scores so far: {trajectory}") - logger.info(f"Best full score so far: {best_score}") - logger.info( - f'{"="*len(f"== Minibatch Trial {trial.number+1} / {num_trials} ==")}\n\n' - ) + self.logger.info(f"Full eval scores so far: {trajectory}") + self.logger.info(f"Best full score so far: {best_score}") + self.logger.info(f'{"="*len(f"== Minibatch Trial {trial.number+1} / {num_trials} ==")}\n\n') def _log_normal_eval( - self, score, best_score, chosen_params, score_data, trial, num_trials, trial_logs, trial_num, valset, batch_size, candidate_program, total_eval_calls + self, + score, + best_score, + chosen_params, + score_data, + trial, + num_trials, + trial_logs, + trial_num, + valset, + batch_size, + candidate_program, + total_eval_calls, ): trial_logs[trial_num]["full_eval_program_path"] = save_candidate_program( candidate_program, self.log_dir, trial_num @@ -655,10 +646,10 @@ def _log_normal_eval( trial_logs[trial_num]["total_eval_calls_so_far"] = total_eval_calls trial_logs[trial_num]["full_eval_program"] = candidate_program.deepcopy() - logger.info(f"Score: {score} with parameters {chosen_params}.") - logger.info(f"Scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if s[2]])+']'}") - logger.info(f"Best score so far: {best_score}") - logger.info(f'{"="*len(f"===== Trial {trial.number+1} / {num_trials} =====")}\n\n') + self.logger.info(f"Score: {score} with parameters {chosen_params}.") + self.logger.info(f"Scores so far: {'['+', '.join([f'{s[0]}' for s in score_data if s[2]])+']'}") + self.logger.info(f"Best score so far: {best_score}") + self.logger.info(f'{"="*len(f"===== Trial {trial.number+1} / {num_trials} =====")}\n\n') def _select_and_insert_instructions_and_demos( self, @@ -677,18 +668,14 @@ def _select_and_insert_instructions_and_demos( f"{i}_predictor_instruction", range(len(instruction_candidates[i])) ) selected_instruction = instruction_candidates[i][instruction_idx] - updated_signature = get_signature(predictor).with_instructions( - selected_instruction - ) + updated_signature = get_signature(predictor).with_instructions(selected_instruction) set_signature(predictor, updated_signature) trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}") # Select demos if available if demo_candidates: - demos_idx = trial.suggest_categorical( - f"{i}_predictor_demos", range(len(demo_candidates[i])) - ) + demos_idx = trial.suggest_categorical(f"{i}_predictor_demos", range(len(demo_candidates[i]))) predictor.demos = demo_candidates[i][demos_idx] trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}") @@ -708,20 +695,16 @@ def _perform_full_evaluation( best_score: float, best_program: Any, ): - logger.info(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") + self.logger.info(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") # Identify best program to evaluate fully - highest_mean_program, mean_score, combo_key = ( - get_program_with_highest_avg_score( - param_score_dict, 
fully_evaled_param_combos - ) + highest_mean_program, mean_score, combo_key = get_program_with_highest_avg_score( + param_score_dict, fully_evaled_param_combos ) - logger.info( + self.logger.info( f"Doing full eval on next top averaging program (Avg Score: {mean_score}) from minibatch trials..." ) - full_eval_score = eval_candidate_program( - len(valset), valset, highest_mean_program, evaluate, self.rng - ) + full_eval_score = eval_candidate_program(len(valset), valset, highest_mean_program, evaluate, self.rng) score_data.append((full_eval_score, highest_mean_program, True)) # Log full evaluation results @@ -742,13 +725,13 @@ def _perform_full_evaluation( # Update best score and program if necessary if full_eval_score > best_score: - logger.info(f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}") + self.logger.info(f"{GREEN}New best full eval score!{ENDC} Score: {full_eval_score}") best_score = full_eval_score best_program = highest_mean_program.deepcopy() trajectory = "[" + ", ".join([f"{s[0]}" for s in score_data if s[2]]) + "]" - logger.info(f"Full eval scores so far: {trajectory}") - logger.info(f"Best full score so far: {best_score}") - logger.info(len(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") * "=") - logger.info("\n") + self.logger.info(f"Full eval scores so far: {trajectory}") + self.logger.info(f"Best full score so far: {best_score}") + self.logger.info(len(f"===== Full Eval {len(fully_evaled_param_combos)+1} =====") * "=") + self.logger.info("\n") return best_score, best_program, total_eval_calls diff --git a/dspy/teleprompt/mipro_optimizer_v2_knn.py b/dspy/teleprompt/mipro_optimizer_v2_knn.py new file mode 100644 index 000000000..bd1df0857 --- /dev/null +++ b/dspy/teleprompt/mipro_optimizer_v2_knn.py @@ -0,0 +1,228 @@ +import logging +import random +from collections import defaultdict +from typing import Any, Callable, Dict, List, Optional, NamedTuple + +import optuna + +import dspy +from dspy.predict.knn import load_knn_embeddings +from dspy.teleprompt.bootstrap import BootstrapKNN +from dspy.teleprompt.utils import get_signature, set_signature + +from .mipro_optimizer_v2 import MIPROv2, BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT, LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT + +logger = logging.getLogger(__name__) + + +class DemoCandidate(NamedTuple): + demos: list[dspy.Example] + random_seed: int + retrieve_demos: Callable[[str], list[dspy.Example]] + augmented_demos: list[dspy.Example] + + +class MIPROv2KNN(MIPROv2): + def __init__( + self, + metric: Callable, + embedder: "dspy.Embedder", + prompt_model: Optional[Any] = None, + task_model: Optional[Any] = None, + teacher_settings: Dict = {}, + max_bootstrapped_demos: int = 16, + max_labeled_demos: int = 4, + auto: Optional[str] = None, + num_candidates: int = 10, + num_threads: int = 6, + max_errors: int = 10, + seed: int = 9, + init_temperature: float = 0.5, + verbose: bool = False, + track_stats: bool = True, + log_dir: Optional[str] = None, + metric_threshold: Optional[float] = None, + logger: logging.Logger = logger, + ): + super().__init__( + metric=metric, + prompt_model=prompt_model, + task_model=task_model, + teacher_settings=teacher_settings, + max_bootstrapped_demos=max_bootstrapped_demos, + max_labeled_demos=max_labeled_demos, + num_threads=num_threads, + max_errors=max_errors, + seed=seed, + init_temperature=init_temperature, + verbose=verbose, + track_stats=track_stats, + log_dir=log_dir, + metric_threshold=metric_threshold, + num_candidates=num_candidates, + auto=auto, + logger=logger, + 
) + self.embedder = embedder + + def _bootstrap_fewshot_examples( + self, program: Any, trainset: List, seed: int, teacher: Any + ) -> dict[int, DemoCandidate]: + self.logger.info("\n==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==") + if self.max_bootstrapped_demos > 0: + self.logger.info( + "These will be used as few-shot example candidates for our program and for creating instructions.\n" + ) + else: + self.logger.info("These will be used for informing instruction proposal.\n") + + self.logger.info(f"Bootstrapping N={self.num_candidates} sets of demonstrations...") + + zeroshot = self.max_bootstrapped_demos == 0 and self.max_labeled_demos == 0 + + try: + demo_candidates = bootstrap_knn_demos( + student=program, + embedder=self.embedder, + num_candidate_sets=self.num_candidates, + trainset=trainset, + max_labeled_demos=(LABELED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_labeled_demos), + max_bootstrapped_demos=( + BOOTSTRAPPED_FEWSHOT_EXAMPLES_IN_CONTEXT if zeroshot else self.max_bootstrapped_demos + ), + metric=self.metric, + max_errors=self.max_errors, + teacher=teacher, + teacher_settings=self.teacher_settings, + seed=seed, + metric_threshold=self.metric_threshold, + rng=self.rng, + logger=self.logger, + ) + except Exception as e: + self.logger.error(f"Error generating few-shot examples: {e}\nRunning without few-shot examples.") + demo_candidates = None + + return demo_candidates + + def _select_and_insert_instructions_and_demos( + self, + candidate_program: Any, + instruction_candidates: Dict[int, List[str]], + demo_candidates: dict[int, list[DemoCandidate]], + trial: optuna.trial.Trial, + trial_logs: Dict, + trial_num: int, + ) -> List[str]: + chosen_params = [] + + for i, predictor in enumerate(candidate_program.predictors()): + # Select instruction + instruction_idx = trial.suggest_categorical( + f"{i}_predictor_instruction", range(len(instruction_candidates[i])) + ) + selected_instruction = instruction_candidates[i][instruction_idx] + updated_signature = get_signature(predictor).with_instructions(selected_instruction) + set_signature(predictor, updated_signature) + trial_logs[trial_num][f"{i}_predictor_instruction"] = instruction_idx + chosen_params.append(f"Predictor {i}: Instruction {instruction_idx}") + + # Select demos if available + if demo_candidates: + demos_idx = trial.suggest_categorical(f"{i}_predictor_demos", range(len(demo_candidates[i]))) + candidate = demo_candidates[i][demos_idx] + + predictor.demos = candidate.demos + predictor.random_seed = candidate.random_seed + predictor.retrieve_demos = candidate.retrieve_demos + + trial_logs[trial_num][f"{i}_predictor_demos"] = demos_idx + chosen_params.append(f"Predictor {i}: Few-Shot Set {demos_idx}") + + self.logger.info("Generating KNN embeddings as required...") + load_knn_embeddings(candidate_program, self.num_threads) + return chosen_params + + def _propose_instructions( + self, + program: Any, + trainset: List, + demo_candidates: Optional[dict[int, list[DemoCandidate]]], + view_data_batch_size: int, + program_aware_proposer: bool, + data_aware_proposer: bool, + tip_aware_proposer: bool, + fewshot_aware_proposer: bool, + ): + if demo_candidates: + demo_candidates = {i: [demos for _, _, _, demos in candidates] for i, candidates in demo_candidates.items()} + + return super()._propose_instructions( + program, + trainset, + demo_candidates, + view_data_batch_size, + program_aware_proposer, + data_aware_proposer, + tip_aware_proposer, + fewshot_aware_proposer, + ) + + +def bootstrap_knn_demos( + student, + embedder: 
"dspy.Embedder", + num_candidate_sets, + trainset, + max_labeled_demos, + max_bootstrapped_demos, + metric, + teacher_settings, + max_errors=10, + max_rounds=1, + metric_threshold=None, + teacher=None, + seed=0, + num_threads=6, + rng=None, + logger=logger, +) -> dict[int, list[DemoCandidate]]: + rng = rng or random.Random(seed) + + demo_candidates = defaultdict[int, list[DemoCandidate]](list) + + # Go through and create each candidate set + for seed in range(num_candidate_sets): + logger.info(f"Bootstrapping set {seed+1}/{num_candidate_sets}") + + trainset_copy = list(trainset) + rng.shuffle(trainset_copy) + + num_static_demos = 0 if seed == 0 else rng.randint(1, max_labeled_demos - 1) + optimizer = BootstrapKNN( + metric=metric, + embedder=embedder, + max_errors=max_errors, + max_bootstrapped_demos=max_bootstrapped_demos, + max_labeled_demos=max_labeled_demos, + teacher_settings=teacher_settings, + max_rounds=max_rounds, + metric_threshold=metric_threshold, + num_threads=num_threads, + random_seed=seed, + num_static_demos=num_static_demos, + ) + + optimized_student = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) + + for i, predictor in enumerate(optimized_student.predictors()): + demo_candidates[i].append( + DemoCandidate( + demos=predictor.demos, + random_seed=predictor.random_seed, + retrieve_demos=predictor.retrieve_demos, + augmented_demos=rng.sample(predictor.augmented_demos, k=max_labeled_demos), + ) + ) + + return demo_candidates diff --git a/dspy/teleprompt/random_search.py b/dspy/teleprompt/random_search.py index a8d44a498..98fe16099 100644 --- a/dspy/teleprompt/random_search.py +++ b/dspy/teleprompt/random_search.py @@ -1,6 +1,7 @@ import random from dspy.evaluate.evaluate import Evaluate +from dspy.predict.knn import load_knn_embeddings from dspy.teleprompt.teleprompt import Teleprompter from .bootstrap import BootstrapFewShot, BootstrapKNN @@ -50,10 +51,10 @@ def __init__( self.num_candidate_sets = num_candidate_programs self.max_labeled_demos = max_labeled_demos + def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): print(f"Going to sample between {self.min_num_samples} and {self.max_num_samples} traces per predictor.") print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") - def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): self.trainset = trainset self.valset = valset or trainset # TODO: FIXME: Note this choice. 
@@ -61,7 +62,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None all_subscores = [] score_data = [] - for seed in range(-1, self.num_candidate_sets): + for seed in range(-1, self.num_candidate_sets - 1): if (restrict is not None) and (seed not in restrict): continue @@ -86,6 +87,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None teacher_settings=self.teacher_settings, max_rounds=self.max_rounds, max_errors=self.max_errors, + num_threads=self.num_threads, ) program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) @@ -103,10 +105,12 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None teacher_settings=self.teacher_settings, max_rounds=self.max_rounds, max_errors=self.max_errors, + num_threads=self.num_threads, ) program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) + load_knn_embeddings(program, self.num_threads) evaluate = Evaluate( devset=self.valset, metric=self.metric, @@ -179,10 +183,10 @@ def __init__( self.max_labeled_demos = max_labeled_demos self.max_bootstrapped_demos = max_bootstrapped_demos + def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): print(f"Going to sample between 1 and {self.max_static_demos} static demos per predictor.") print(f"Will attempt to bootstrap {self.num_candidate_sets} candidate sets.") - def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None, labeled_sample=True): self.trainset = trainset self.valset = valset or trainset # TODO: FIXME: Note this choice. @@ -190,7 +194,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None all_subscores = [] score_data = [] - for seed in range(-1, self.num_candidate_sets): + for seed in range(-1, self.num_candidate_sets - 1): if (restrict is not None) and (seed not in restrict): continue @@ -217,6 +221,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None teacher_settings=self.teacher_settings, max_rounds=self.max_rounds, max_errors=self.max_errors, + num_threads=self.num_threads, ) program = optimizer.compile(student, teacher=teacher, trainset=trainset_copy) @@ -234,6 +239,7 @@ def compile(self, student, *, teacher=None, trainset, valset=None, restrict=None max_rounds=self.max_rounds, max_errors=self.max_errors, num_static_demos=num_static_demos, + num_threads=self.num_threads, random_seed=seed, ) diff --git a/dspy/teleprompt/utils.py b/dspy/teleprompt/utils.py index bdc6a529a..c1a413d7c 100644 --- a/dspy/teleprompt/utils.py +++ b/dspy/teleprompt/utils.py @@ -42,24 +42,22 @@ def create_minibatch(trainset, batch_size=50, rng=None): return minibatch -def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rng=None): +def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rng=None, **kwargs): """Evaluate a candidate program on the trainset, using the specified batch size.""" try: # Evaluate on the full trainset if batch_size >= len(trainset): - score = evaluate(candidate_program, devset=trainset) + return evaluate(candidate_program, devset=trainset) # Or evaluate on a minibatch else: - score = evaluate( + return evaluate( candidate_program, devset=create_minibatch(trainset, batch_size, rng), ) except Exception as e: print(f"Exception occurred: {e}") - score = 0.0 # TODO: Handle this better, as -ve scores are possible - - return score + return 0.0 # TODO: Handle this better, as -ve scores are possible 
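The refactored `eval_candidate_program` above now returns directly from each branch: a full evaluation when the batch covers the whole set, a `create_minibatch` sample otherwise, and `0.0` if evaluation raises. A small usage sketch, assuming this branch of the code is installed; the evaluators here are toy callables standing in for `dspy.Evaluate`:

```python
import random

from dspy.teleprompt.utils import eval_candidate_program  # the helper refactored above

trainset = list(range(10))          # toy stand-in for a list of dspy.Example objects
rng = random.Random(0)


def evaluate(program, devset):      # toy evaluator: the "score" is just the devset size
    return len(devset)


def broken_evaluate(program, devset):
    raise RuntimeError("model call failed")


full = eval_candidate_program(len(trainset), trainset, None, evaluate, rng)    # full eval -> 10
mini = eval_candidate_program(4, trainset, None, evaluate, rng)                # minibatch -> 4
fallback = eval_candidate_program(4, trainset, None, broken_evaluate, rng)     # prints the error, returns 0.0
print(full, mini, fallback)
```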
def eval_candidate_program_with_pruning( @@ -282,6 +280,7 @@ def create_n_fewshot_demo_sets( teacher=None, include_non_bootstrapped=True, seed=0, + num_threads=6, rng=None ): """ @@ -294,8 +293,8 @@ def create_n_fewshot_demo_sets( num_candidate_sets -= 3 # Initialize demo_candidates dictionary - for i, _ in enumerate(student.predictors()): - demo_candidates[i] = [] + for name, _ in enumerate(student.predictors()): + demo_candidates[name] = [] rng = rng or random.Random(seed) @@ -330,6 +329,7 @@ def create_n_fewshot_demo_sets( max_labeled_demos=max_labeled_demos, teacher_settings=teacher_settings, max_rounds=max_rounds, + num_threads=num_threads, ) program2 = program.compile(student, teacher=teacher, trainset=trainset_copy) @@ -346,14 +346,15 @@ def create_n_fewshot_demo_sets( max_labeled_demos=max_labeled_demos, teacher_settings=teacher_settings, max_rounds=max_rounds, + num_threads=num_threads, ) program2 = teleprompter.compile( student, teacher=teacher, trainset=trainset_copy, ) - for i, _ in enumerate(student.predictors()): - demo_candidates[i].append(program2.predictors()[i].demos) + for name, _ in enumerate(student.predictors()): + demo_candidates[name].append(program2.predictors()[name].demos) return demo_candidates diff --git a/dspy/utils/caching.py b/dspy/utils/caching.py index 7bd0d50c4..b3bf5373c 100644 --- a/dspy/utils/caching.py +++ b/dspy/utils/caching.py @@ -1,5 +1,11 @@ -import os +from collections import OrderedDict +from hashlib import sha256 from pathlib import Path +from typing import Union +import os +import pickle + +import ujson _DEFAULT_CACHE_DIR = os.path.join(Path.home(), ".dspy_cache") @@ -12,3 +18,58 @@ def create_subdir_in_cachedir(subdir: str) -> str: subdir = os.path.abspath(subdir) os.makedirs(subdir, exist_ok=True) return subdir + + +class LRUCache(OrderedDict): + maxsize: int + + def __init__(self, iterable, maxsize: int): + super().__init__(iterable) + self.maxsize = maxsize + + @classmethod + def load(cls, file, maxsize: int) -> "LRUCache": + return cls(pickle.load(file), maxsize) + + @staticmethod + def dump(obj, file) -> None: + pickle.dump([[k, v] for k, v in obj.items()], file) + + def __setitem__(self, request: dict, value): + key = self.cache_key(request) + + if key in self: + self.move_to_end(key) + return + + if len(self) == self.maxsize: + self.popitem(last=False) + + super().__setitem__(key, value) + + def __getitem__(self, request: dict): + key = self.cache_key(request) + return super().__getitem__(key) + + def __contains__(self, request: dict): + key = self.cache_key(request) + return super().__contains__(key) + + def get(self, request: dict, default=None): + key = self.cache_key(request) + return super().get(key, default) + + def __delitem__(self, request: dict): + key = self.cache_key(request) + super().__delitem__(key) + + def pop(self, request: dict, default=None): + key = self.cache_key(request) + return super().pop(key, default) + + @staticmethod + def cache_key(request: Union[dict, str]) -> str: + params = request + if isinstance(request, dict): + params = {k: v for k, v in request.items() if not callable(v)} + return sha256(ujson.dumps(params, sort_keys=True).encode()).hexdigest() diff --git a/testing/optimizer_tester.py b/testing/optimizer_tester.py index 3e392b43d..c4d61bf3c 100644 --- a/testing/optimizer_tester.py +++ b/testing/optimizer_tester.py @@ -60,20 +60,14 @@ def __init__( openai.api_base = os.environ.get("OPENAI_API_BASE") # Prompt gen model - if not prompt_model: - self.prompt_model = 
dspy.OpenAI(model=self.PROMPT_MODEL_NAME, max_tokens=700) - else: - self.prompt_model = prompt_model + self.prompt_model = prompt_model or dspy.OpenAI(model=self.PROMPT_MODEL_NAME, max_tokens=700) # Task model - if not task_model: - self.task_model = dspy.HFClientTGI( - model=self.TASK_MODEL_NAME, - port=[7140, 7141, 7142, 7143], - max_tokens=150, - ) - else: - self.task_model = task_model + self.task_model = task_model or dspy.HFClientTGI( + model=self.TASK_MODEL_NAME, + port=[7140, 7141, 7142, 7143], + max_tokens=150, + ) self.colbertv2 = dspy.ColBERTv2(url=colbert_v2_endpoint) dspy.settings.configure(rm=self.colbertv2, lm=self.task_model) diff --git a/testing/outputs/results.csv b/testing/outputs/results.csv deleted file mode 100644 index f99d0c04c..000000000 --- a/testing/outputs/results.csv +++ /dev/null @@ -1,37 +0,0 @@ -test_name,train_score,dev_score,test_score,run_time (sec),train_size,dev_size,test_size,task_name,signature_optimized,prompt_model_name,task_model_name,breadth,depth,meta_prompt_style,fewshot_before,fewshot_after,temperature,fewshot_candidates_num,max_bootstrapped_demos,bootstrapping,view_data,optimizer_log_dir,additional_notes,misc -bootstrap-knn-hotpotqa-default,58.0,51.0,56.5,450.65799974999993,200,100,200,hotpotqa,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241122_134313/,, -bootstrap-knn-tweet-default,18.5,21.0,20.17,1.8680969580000237,200,100,200,tweet,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/tweet_default_20241122_141116/,, -bootstrap-fewshot-hotpotqa-default,55.5,55.0,57.5,0.3491807090003931,200,100,200,hotpotqa,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241122_144551/,, -bootstrap-fewshot-tweet-default,18.83,22.67,21.83,0.5466496250001001,200,100,200,tweet,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/tweet_default_20241122_150736/,, -bootstrap-knn-hotpotqa-default,58.5,51.0,56.5,2899.5347310840007,200,100,200,hotpotqa,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241122_152929/,, -bootstrap-knn-hover_retrieve_discrete-default,37.5,50.0,45.5,2086.384129125001,200,100,200,hover_retrieve_discrete,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241122_164516/,, -bootstrap-fewshot-hover_retrieve_discrete-default,34.5,38.0,41.5,0.23261224999987462,200,100,200,hover_retrieve_discrete,True,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241122_174226/,, -bootstrap-knn-hover_retrieve_discrete-default,41.0,46.0,47.0,7733.980304125001,200,100,200,hover_retrieve_discrete,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241122_183924/,, -bootstrap-fewshot-hover_retrieve_discrete-default,41.0,45.0,43.0,6167.043331082999,200,100,200,hover_retrieve_discrete,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241122_210452/,, 
-bootstrap-knn-hotpotqa-default,59.5,48.0,59.0,6819.343262,200,100,200,hotpotqa,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_002837/,, -bootstrap-kmn-hover_retrieve_discrete-default,41.0,42.0,42.0,1.1559860840000056,200,100,200,hover_retrieve_discrete,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241123_025957/,, -bootstrap-knn-heart_disease-default,63.33,63.33,50.82,3982.8160289590005,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_042647/,, -bootstrap-fewshot-hotpotqa-default,57.5,52.0,58.0,6186.669083209,200,100,200,hotpotqa,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_054636/,, -bootstrap-fewshot-heart_disease-default,62.5,62.5,59.02,11386.399131417,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_074527/,, -hover_retrieve_discrete_baseline,30.0,39.0,39.5,0,200,100,200,hover_retrieve_discrete,False,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,NA,NA,default,False,False,1.1,0,0,False,False,NA,, -heart_disease_baseline,44.17,44.17,46.45,0,120,120,183,heart_disease,False,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,NA,NA,default,False,False,1.1,0,0,False,False,NA,, -hotpotqa_baseline,22.0,25.0,29.0,0,200,100,200,hotpotqa,False,gpt-3.5-turbo-1106,meta-llama/Llama-2-13b-chat-hf,NA,NA,default,False,False,1.1,0,0,False,False,NA,, -bootstrap-knn-hotpotqa-default,58.5,51.0,56.5,51.13316199999997,200,100,200,hotpotqa,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_183520/,, -bootstrap-knn-heart_disease-default,57.5,57.5,51.37,41.37025941599995,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_183619/,, -bootstrap-knn-hover_retrieve_discrete-default,39.5,47.0,44.5,70.4879585839999,200,100,200,hover_retrieve_discrete,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241123_184012/,, -bootstrap-fewshot-hotpotqa-default,57.5,53.0,58.0,13.718924457999947,200,100,200,hotpotqa,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_185709/,, -bootstrap-fewshot-heart_disease-default,50.0,50.0,44.81,9.320043417000306,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_185741/,, -bootstrap-fewshot-hover_retrieve_discrete-default,41.0,45.0,43.0,16.555457208999997,200,100,200,hover_retrieve_discrete,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241123_193512/,, -bootstrap-knn-heart_disease-default,56.67,56.67,45.9,2.398595249999744,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_195341/,, -bootstrap-knn-heart_disease-default,57.5,57.5,51.37,50.493944916000146,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_200314/,, 
-bootstrap-fewshot-heart_disease-default,50.0,50.0,44.81,4.706272333000015,120,120,183,heart_disease,True,gpt-4o-mini,gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/heart_disease_default_20241123_200441/,, -bootstrap-knn-hotpotqa-default,58.5,51.0,56.5,171.786966041,200,100,200,hotpotqa,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_232549/,, -bootstrap-knn-hover_retrieve_discrete-default,39.5,47.0,44.5,255.54141504199998,200,100,200,hover_retrieve_discrete,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241123_232855/,, -miprov2-light-hotpotqa-default,53.5,48.0,52.0,265.50126825000007,200,100,200,hotpotqa,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_233603/,, -miprov2-light-hover_retrieve_discrete-default,30.0,39.0,39.5,31.533997583999962,200,100,200,hover_retrieve_discrete,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241123_234607/,, -bootstrap-knn-hotpotqa-default,59.5,48.0,59.0,4604.210739208001,200,100,200,hotpotqa,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241123_235009/,, -bootstrap-knn-hover_retrieve_discrete-default,41.0,46.0,47.0,5284.831311208,200,100,200,hover_retrieve_discrete,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241124_010742/,, -miprov2-medium-hotpotqa-default,57.0,46.0,57.5,23081.980955084,200,100,200,hotpotqa,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241124_023702/,, -miprov2-medium-hover_retrieve_discrete-default,37.5,45.0,41.0,5582.9553148330015,200,100,200,hover_retrieve_discrete,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241124_090541/,, -miprov2-heavy-hotpotqa-default,54.5,53.0,58.5,495.168939167,200,100,200,hotpotqa,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hotpotqa_default_20241124_110831/,, -miprov2-heavy-hover_retrieve_discrete-default,38.5,49.0,44.0,2737.171699459,200,100,200,hover_retrieve_discrete,True,openai/gpt-4o-mini,openai/gpt-4o-mini,10,3,default,False,False,1.1,0,0,False,False,log_dir/hover_retrieve_discrete_default_20241124_112101/,, diff --git a/testing/tasks/hotpotqa.py b/testing/tasks/hotpotqa.py index 7a8966407..cd131c629 100644 --- a/testing/tasks/hotpotqa.py +++ b/testing/tasks/hotpotqa.py @@ -1,6 +1,6 @@ import dspy from dspy.datasets import HotPotQA -from dspy.evaluate import Evaluate +from dspy.evaluate import Evaluate, normalize_text, answer_exact_match_and_semantic from .base_task import BaseTask @@ -14,7 +14,7 @@ def __init__(self, passages_per_hop): def forward(self, question): context = [] - for hop in range(2): + for _ in range(2): query = self.generate_query(context=context, question=question).search_query context += self.retrieve(query).passages return dspy.Prediction( @@ -34,32 +34,26 @@ def __init__(self): # Set up metrics NUM_THREADS = 16 - metric_EM = dspy.evaluate.answer_exact_match - self.metric = metric_EM - - def gold_passages_retrieved(example, pred, trace=None): - gold_titles = set(map(dspy.evaluate.normalize_text, 
example["gold_titles"])) - found_titles = set( - map( - dspy.evaluate.normalize_text, - [c.split(" | ")[0] for c in pred.context], - ) - ) - return gold_titles.issubset(found_titles) - kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=15) - self.evaluate_EM = Evaluate(devset=self.trainset, metric=metric_EM, **kwargs) - self.evaluate_retrieval = Evaluate( - devset=self.trainset, metric=gold_passages_retrieved, **kwargs - ) + self.evaluate_EM = Evaluate(devset=self.trainset, metric=self.get_metric(), **kwargs) + self.evaluate_retrieval = Evaluate(devset=self.trainset, metric=self.get_metric(retrieval=True), **kwargs) self.set_splits(TRAIN_NUM=100, DEV_NUM=100, TEST_NUM=100) def get_program(self): return MultiHop(passages_per_hop=3) - def get_metric(self): - return self.metric + def get_metric(self, retrieval=False): + if not retrieval: + return answer_exact_match_and_semantic + + def gold_passages_retrieved(example, pred, trace=None): + gold_titles = example["gold_titles"] + found_titles = [c.split(" | ")[0] for c in pred.context] + + return set(map(normalize_text, gold_titles)) <= set(map(normalize_text, found_titles)) + + return gold_passages_retrieved def get_retrieval_metric(self): return self.evaluate_retrieval diff --git a/testing/tasks/hover.py b/testing/tasks/hover.py index 1a7e4378c..ae1e57036 100644 --- a/testing/tasks/hover.py +++ b/testing/tasks/hover.py @@ -2,62 +2,57 @@ import tqdm from datasets import load_dataset -import pandas as pd + +from dspy.evaluate import Evaluate, normalize_text import dspy -from dsp.utils.utils import deduplicate -from dspy.evaluate import Evaluate from .base_task import BaseTask def count_unique_docs(example): - return len(set([fact["key"] for fact in example["supporting_facts"]])) - - -def discrete_retrieval_eval(example, pred, trace=None): - gold_titles = set( - map( - dspy.evaluate.normalize_text, - [doc["key"] for doc in example["supporting_facts"]], - ) - ) - found_titles = set( - map( - dspy.evaluate.normalize_text, - [c.split(" | ")[0] for c in pred.retrieved_docs], - ) - ) - return gold_titles.issubset(found_titles) - - -class RetrieveMultiHop(dspy.Module): + return len(set(fact["key"] for fact in example["supporting_facts"])) + + +class RetrieveMultiHop2(dspy.Module): + def __init__(self): + super().__init__() + self.retrieve = dspy.Retrieve(k=16) + self.create_query = dspy.ChainOfThought("claim,summary->query") + self.summarize = dspy.ChainOfThought("claim,context,passages->summary") + + def forward(self, claim): + passages = self.retrieve(claim).passages + summary = "" + + for _ in range(2): + summary = self.summarize(claim=claim, context=summary, passages=passages).summary + query = self.create_query(claim=claim, summary=summary).query + passages.extend(self.retrieve(query).passages) + + return dspy.Prediction(retrieved_docs=passages) + + +class RetrieveMultiHop4(dspy.Module): def __init__(self): super().__init__() - self.k = 7 self.create_query_hop2 = dspy.ChainOfThought("claim,summary_1->query") self.create_query_hop3 = dspy.ChainOfThought("claim,summary_1,summary_2->query") - self.retrieve_k = dspy.Retrieve(k=self.k) + self.retrieve_k = dspy.Retrieve(k=7) self.summarize1 = dspy.ChainOfThought("claim,passages->summary") self.summarize2 = dspy.ChainOfThought("claim,context,passages->summary") def forward(self, claim): # HOP 1 hop1_docs = self.retrieve_k(claim).passages - summary_1 = self.summarize1( - claim=claim, passages=hop1_docs - ).summary # Summarize top k docs + summary_1 = self.summarize1(claim=claim, 
passages=hop1_docs).summary # Summarize top k docs # HOP 2 hop2_query = self.create_query_hop2(claim=claim, summary_1=summary_1).query hop2_docs = self.retrieve_k(hop2_query).passages - summary_2 = self.summarize2( - claim=claim, context=summary_1, passages=hop2_docs - ).summary + summary_2 = self.summarize2(claim=claim, context=summary_1, passages=hop2_docs).summary # HOP 3 - hop3_query = self.create_query_hop3( - claim=claim, summary_1=summary_1, summary_2=summary_2 - ).query + hop3_query = self.create_query_hop3(claim=claim, summary_1=summary_1, summary_2=summary_2).query hop3_docs = self.retrieve_k(hop3_query).passages return dspy.Prediction(retrieved_docs=hop1_docs + hop2_docs + hop3_docs) @@ -79,9 +74,7 @@ def __init__(self): label = example["label"] if count_unique_docs(example) == 3: # Limit to 3 hop examples - reformatted_hf_trainset.append( - dict(claim=claim, supporting_facts=supporting_facts, label=label) - ) + reformatted_hf_trainset.append(dict(claim=claim, supporting_facts=supporting_facts, label=label)) for example in tqdm.tqdm(hf_testset): claim = example["claim"] @@ -89,13 +82,11 @@ def __init__(self): label = example["label"] if count_unique_docs(example) == 3: - reformatted_hf_testset.append( - dict(claim=claim, supporting_facts=supporting_facts, label=label) - ) + reformatted_hf_testset.append(dict(claim=claim, supporting_facts=supporting_facts, label=label)) - rng = random.Random(0) - rng.shuffle(reformatted_hf_trainset) rng = random.Random(1) + rng.shuffle(reformatted_hf_trainset) + rng = random.Random(2) rng.shuffle(reformatted_hf_testset) trainset = reformatted_hf_trainset @@ -107,18 +98,21 @@ def __init__(self): # Set up metrics NUM_THREADS = 16 - self.metric = discrete_retrieval_eval - kwargs = dict(num_threads=NUM_THREADS, display_progress=True, display_table=15) - self.evaluate = Evaluate(devset=self.trainset, metric=self.metric, **kwargs) + self.evaluate = Evaluate(devset=self.trainset, metric=self.get_metric(), **kwargs) self.set_splits(TRAIN_NUM=100, DEV_NUM=100, TEST_NUM=100) def get_program(self): - return RetrieveMultiHop() + return RetrieveMultiHop4() def get_metric(self): - return self.metric + def discrete_retrieval_eval(example, pred, trace=None): + gold_titles = [fact["key"] for fact in example["supporting_facts"]] + found_titles = [c.split(" | ")[0] for c in pred.retrieved_docs] + return set(map(normalize_text, gold_titles)) <= set(map(normalize_text, found_titles)) + + return discrete_retrieval_eval def get_default_max_bootstrapped_demos(self): return 1 diff --git a/tests/clients/test_embedding.py b/tests/clients/test_embedding.py index 0ac9e24ba..a0ca7d65e 100644 --- a/tests/clients/test_embedding.py +++ b/tests/clients/test_embedding.py @@ -31,7 +31,7 @@ def test_litellm_embedding(): result = embedding(inputs) # Verify litellm was called with correct parameters. - mock_litellm.assert_called_once_with(model=model, input=inputs, caching=True) + mock_litellm.assert_called_once_with(model=model, input=inputs, cache={"no-cache": False, "no-store": False}) assert len(result) == len(inputs) np.testing.assert_allclose(result, mock_embeddings)
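The last hunk above updates the embedding test for litellm's `cache` parameter; separately, this PR adds an `LRUCache` in `dspy/utils/caching.py` that keys entries by a SHA-256 hash of the JSON-serialized request (callable values are dropped before hashing) and evicts the oldest entry once `maxsize` is reached. A short usage sketch under those assumptions (requires `ujson`; the request dicts are illustrative):

```python
from dspy.utils.caching import LRUCache  # added in this PR

cache = LRUCache([], maxsize=2)

req_a = {"model": "openai/gpt-4o-mini", "prompt": "hello"}
req_b = {"model": "openai/gpt-4o-mini", "prompt": "world", "callback": print}  # callables are excluded from the key

cache[req_a] = "response-a"
cache[req_b] = "response-b"
assert req_a in cache and cache.get(req_b) == "response-b"

# A third insertion at maxsize evicts the oldest entry (req_a).
cache[{"model": "openai/gpt-4o-mini", "prompt": "!"}] = "response-c"
assert req_a not in cache and len(cache) == 2

# Keys are hashes of the request serialized with sort_keys=True, so key order does not matter.
print(LRUCache.cache_key(req_a) == LRUCache.cache_key({"prompt": "hello", "model": "openai/gpt-4o-mini"}))  # True
```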