From ee6c16605c89f02becb6795b3a48b11098f652b6 Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Thu, 21 Nov 2024 18:00:51 -0800 Subject: [PATCH 01/19] Style fix for Dataloader (#1838) --- dspy/datasets/dataloader.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/dspy/datasets/dataloader.py b/dspy/datasets/dataloader.py index 87d534f9d..351b84418 100644 --- a/dspy/datasets/dataloader.py +++ b/dspy/datasets/dataloader.py @@ -10,9 +10,7 @@ class DataLoader(Dataset): - def __init__( - self, - ): + def __init__(self): pass def from_huggingface( @@ -97,8 +95,7 @@ def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tup return [dspy.Example({field: row[field] for field in fields}).with_inputs(input_keys) for row in dataset] - def from_rm(self, num_samples: int, - fields: List[str], input_keys: List[str]) -> List[dspy.Example]: + def from_rm(self, num_samples: int, fields: List[str], input_keys: List[str]) -> List[dspy.Example]: try: rm = dspy.settings.rm try: @@ -107,9 +104,13 @@ def from_rm(self, num_samples: int, for row in rm.get_objects(num_samples=num_samples, fields=fields) ] except AttributeError: - raise ValueError("Retrieval module does not support `get_objects`. Please use a different retrieval module.") + raise ValueError( + "Retrieval module does not support `get_objects`. Please use a different retrieval module." + ) except AttributeError: - raise ValueError("Retrieval module not found. Please set a retrieval module using `dspy.settings.configure`.") + raise ValueError( + "Retrieval module not found. Please set a retrieval module using `dspy.settings.configure`." + ) def sample( self, @@ -119,7 +120,9 @@ def sample( **kwargs, ) -> List[dspy.Example]: if not isinstance(dataset, list): - raise ValueError(f"Invalid dataset provided of type {type(dataset)}. Please provide a list of examples.") + raise ValueError( + f"Invalid dataset provided of type {type(dataset)}. Please provide a list of `dspy.Example`s." + ) return random.sample(dataset, n, *args, **kwargs) @@ -141,7 +144,11 @@ def train_test_split( elif train_size is not None and isinstance(train_size, int): train_end = train_size else: - raise ValueError("Invalid train_size. Please provide a float between 0 and 1 or an int.") + raise ValueError( + "Invalid `train_size`. Please provide a float between 0 and 1 to represent the proportion of the " + "dataset to include in the train split or an int to represent the absolute number of samples to " + f"include in the train split. Received `train_size`: {train_size}." + ) if test_size is not None: if isinstance(test_size, float) and (0 < test_size < 1): @@ -149,9 +156,16 @@ def train_test_split( elif isinstance(test_size, int): test_end = test_size else: - raise ValueError("Invalid test_size. Please provide a float between 0 and 1 or an int.") + raise ValueError( + "Invalid `test_size`. Please provide a float between 0 and 1 to represent the proportion of the " + "dataset to include in the test split or an int to represent the absolute number of samples to " + f"include in the test split. Received `test_size`: {test_size}." + ) if train_end + test_end > len(dataset_shuffled): - raise ValueError("train_size + test_size cannot exceed the total number of samples.") + raise ValueError( + "`train_size` + `test_size` cannot exceed the total number of samples. Received " + f"`train_size`: {train_end}, `test_size`: {test_end}, and `dataset_size`: {len(dataset_shuffled)}." 
+ ) else: test_end = len(dataset_shuffled) - train_end From 44b333106dcd072cd43cd73c98e8765421603579 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Thu, 21 Nov 2024 18:20:51 -0800 Subject: [PATCH 02/19] Fix dspy.Evaluate's handling of exceptions (from 2.5.30) (#1839) --- dspy/evaluate/evaluate.py | 51 +++++++++++++++------------------------ 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 04a135537..9ef0bf733 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -54,6 +54,7 @@ def __init__( return_all_scores=False, return_outputs=False, provide_traceback=False, + failure_score=0.0, **_kwargs, ): self.devset = devset @@ -65,6 +66,7 @@ def __init__( self.return_all_scores = return_all_scores self.return_outputs = return_outputs self.provide_traceback = provide_traceback + self.failure_score = failure_score def __call__( self, @@ -85,7 +87,6 @@ def __call__( return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores return_outputs = return_outputs if return_outputs is not None else self.return_outputs - devset = list(enumerate(devset)) tqdm.tqdm._instances.clear() executor = ParallelExecutor( @@ -96,39 +97,27 @@ def __call__( compare_results=True, ) - def process_item(item): - try: - example_idx, example = item - prediction = program(**example.inputs()) - score = metric(example, prediction) + def process_item(example): + prediction = program(**example.inputs()) + score = metric(example, prediction) - # Increment assert and suggest failures to program's attributes - if hasattr(program, "_assert_failures"): - program._assert_failures += dspy.settings.get("assert_failures") - if hasattr(program, "_suggest_failures"): - program._suggest_failures += dspy.settings.get("suggest_failures") + # Increment assert and suggest failures to program's attributes + if hasattr(program, "_assert_failures"): + program._assert_failures += dspy.settings.get("assert_failures") + if hasattr(program, "_suggest_failures"): + program._suggest_failures += dspy.settings.get("suggest_failures") - return example_idx, example, prediction, score - except Exception: - return example_idx, example, {}, 0.0 + return prediction, score results = executor.execute(process_item, devset) - reordered_devset = [r for r in results if r is not None] + assert len(devset) == len(results) - ncorrect = sum(score for _, _, _, score in reordered_devset) - ntotal = len(reordered_devset) - - if ntotal == 0: - logger.warning("No valid results to compute metrics.") - return 0.0 + results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results] + results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results)] + ncorrect, ntotal = sum(score for *_, score in results), len(devset) logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)") - - predicted_devset = sorted(reordered_devset) - - if return_outputs: # Handle the return_outputs logic - results = [(example, prediction, score) for _, example, prediction, score in predicted_devset] - + def prediction_is_dictlike(prediction): # Downstream logic for displaying dictionary-like predictions depends solely on the predictions # having a method called `items()` for iterating through key/value pairs @@ -140,12 +129,12 @@ def prediction_is_dictlike(prediction): if prediction_is_dictlike(prediction) else dict(example) | {"prediction": prediction, "correct": score} ) - for _, 
example, prediction, score in predicted_devset + for example, prediction, score in results ] - result_df = pd.DataFrame(data) # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) + result_df = pd.DataFrame(data) result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) # Rename the 'correct' column to the name of the metric object @@ -179,9 +168,9 @@ def prediction_is_dictlike(prediction): display(HTML(message)) if return_all_scores and return_outputs: - return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in predicted_devset] + return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results] if return_all_scores: - return round(100 * ncorrect / ntotal, 2), [score for *_, score in predicted_devset] + return round(100 * ncorrect / ntotal, 2), [score for *_, score in results] if return_outputs: return round(100 * ncorrect / ntotal, 2), results From 1a577f8515e5b1dd857465870d7c12ae69b27a3f Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Fri, 22 Nov 2024 11:39:18 -0800 Subject: [PATCH 03/19] Introduce Embeddings index, CompleteAndGrounded metric, Unbatchify utils (#1843) * Introduce Embeddings (faiss NN index), CompleteAndGrounded metric, and Unbatchify utils * adjust faiss import * adjust tests * adjust to dspy.Embedder --- dspy/__init__.py | 2 + dspy/clients/__init__.py | 2 +- dspy/clients/embedding.py | 76 ++++++++++++++---- dspy/evaluate/auto_evaluation.py | 129 ++++++++++++++++++++++++------- dspy/predict/knn.py | 4 +- dspy/retrievers/__init__.py | 1 + dspy/retrievers/embeddings.py | 83 ++++++++++++++++++++ dspy/utils/unbatchify.py | 111 ++++++++++++++++++++++++++ tests/clients/test_embedding.py | 8 +- 9 files changed, 364 insertions(+), 52 deletions(-) create mode 100644 dspy/retrievers/__init__.py create mode 100644 dspy/retrievers/embeddings.py create mode 100644 dspy/utils/unbatchify.py diff --git a/dspy/__init__.py b/dspy/__init__.py index 28e0a352b..9e3e85fd2 100644 --- a/dspy/__init__.py +++ b/dspy/__init__.py @@ -6,6 +6,8 @@ from .retrieve import * from .signatures import * +import dspy.retrievers + # Functional must be imported after primitives, predict and signatures from .functional import * # isort: skip from dspy.evaluate import Evaluate # isort: skip diff --git a/dspy/clients/__init__.py b/dspy/clients/__init__.py index dc10f865f..2fc0e2543 100644 --- a/dspy/clients/__init__.py +++ b/dspy/clients/__init__.py @@ -1,7 +1,7 @@ from .lm import LM from .provider import Provider, TrainingJob from .base_lm import BaseLM, inspect_history -from .embedding import Embedding +from .embedding import Embedder import litellm import os from pathlib import Path diff --git a/dspy/clients/embedding.py b/dspy/clients/embedding.py index eec41c32b..ec7c1174e 100644 --- a/dspy/clients/embedding.py +++ b/dspy/clients/embedding.py @@ -2,7 +2,7 @@ import numpy as np -class Embedding: +class Embedder: """DSPy embedding class. The class for computing embeddings for text inputs. This class provides a unified interface for both: @@ -10,7 +10,7 @@ class Embedding: 1. Hosted embedding models (e.g. OpenAI's text-embedding-3-small) via litellm integration 2. Custom embedding functions that you provide - For hosted models, simply pass the model name as a string (e.g. "openai/text-embedding-3-small"). The class will use + For hosted models, simply pass the model name as a string (e.g., "openai/text-embedding-3-small"). 
The class will use litellm to handle the API calls and caching. For custom embedding models, pass a callable function that: @@ -24,6 +24,9 @@ class Embedding: model: The embedding model to use. This can be either a string (representing the name of the hosted embedding model, must be an embedding model supported by litellm) or a callable that represents a custom embedding model. + batch_size (int, optional): The default batch size for processing inputs in batches. Defaults to 200. + caching (bool, optional): Whether to cache the embedding response when using a hosted model. Defaults to True. + **kwargs: Additional default keyword arguments to pass to the embedding model. Examples: Example 1: Using a hosted model. @@ -31,7 +34,7 @@ class Embedding: ```python import dspy - embedder = dspy.Embedding("openai/text-embedding-3-small") + embedder = dspy.Embedder("openai/text-embedding-3-small", batch_size=100) embeddings = embedder(["hello", "world"]) assert embeddings.shape == (2, 1536) @@ -41,37 +44,78 @@ class Embedding: ```python import dspy + import numpy as np def my_embedder(texts): return np.random.rand(len(texts), 10) - embedder = dspy.Embedding(my_embedder) - embeddings = embedder(["hello", "world"]) + embedder = dspy.Embedder(my_embedder) + embeddings = embedder(["hello", "world"], batch_size=1) assert embeddings.shape == (2, 10) ``` """ - def __init__(self, model): + def __init__(self, model, batch_size=200, caching=True, **kwargs): self.model = model + self.batch_size = batch_size + self.caching = caching + self.default_kwargs = kwargs - def __call__(self, inputs, caching=True, **kwargs): + def __call__(self, inputs, batch_size=None, caching=None, **kwargs): """Compute embeddings for the given inputs. Args: inputs: The inputs to compute embeddings for, can be a single string or a list of strings. - caching: Whether to cache the embedding response, only valid when using a hosted embedding model. - kwargs: Additional keyword arguments to pass to the embedding model. + batch_size (int, optional): The batch size for processing inputs. If None, defaults to the batch_size set during initialization. + caching (bool, optional): Whether to cache the embedding response when using a hosted model. If None, defaults to the caching setting from initialization. + **kwargs: Additional keyword arguments to pass to the embedding model. These will override the default kwargs provided during initialization. Returns: - A 2-D numpy array of embeddings, one embedding per row. + numpy.ndarray: If the input is a single string, returns a 1D numpy array representing the embedding. + If the input is a list of strings, returns a 2D numpy array of embeddings, one embedding per row. """ + if isinstance(inputs, str): + is_single_input = True inputs = [inputs] - if isinstance(self.model, str): - embedding_response = litellm.embedding(model=self.model, input=inputs, caching=caching, **kwargs) - return np.array([data["embedding"] for data in embedding_response.data], dtype=np.float32) - elif callable(self.model): - return np.array(self.model(inputs, **kwargs), dtype=np.float32) else: - raise ValueError(f"`model` in `dspy.Embedding` must be a string or a callable, but got {type(self.model)}.") + is_single_input = False + + assert all(isinstance(inp, str) for inp in inputs), "All inputs must be strings." 
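+        # Validating up front means a non-string input fails fast here, before the
+        # batching, caching, and embedding calls below have a chance to run.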
+ + if batch_size is None: + batch_size = self.batch_size + if caching is None: + caching = self.caching + + merged_kwargs = self.default_kwargs.copy() + merged_kwargs.update(kwargs) + + embeddings_list = [] + + def chunk(inputs_list, size): + for i in range(0, len(inputs_list), size): + yield inputs_list[i : i + size] + + for batch_inputs in chunk(inputs, batch_size): + if isinstance(self.model, str): + embedding_response = litellm.embedding( + model=self.model, input=batch_inputs, caching=caching, **merged_kwargs + ) + batch_embeddings = [data["embedding"] for data in embedding_response.data] + elif callable(self.model): + batch_embeddings = self.model(batch_inputs, **merged_kwargs) + else: + raise ValueError( + f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(self.model)}." + ) + + embeddings_list.extend(batch_embeddings) + + embeddings = np.array(embeddings_list, dtype=np.float32) + + if is_single_input: + return embeddings[0] + else: + return embeddings diff --git a/dspy/evaluate/auto_evaluation.py b/dspy/evaluate/auto_evaluation.py index 38b02fe35..d96d58f21 100644 --- a/dspy/evaluate/auto_evaluation.py +++ b/dspy/evaluate/auto_evaluation.py @@ -14,14 +14,35 @@ class SemanticRecallPrecision(dspy.Signature): precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth") +class DecompositionalSemanticRecallPrecision(dspy.Signature): + """ + Compare a system's response to the ground truth to compute recall and precision of key ideas. + You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision. + """ + + question: str = dspy.InputField() + ground_truth: str = dspy.InputField() + system_response: str = dspy.InputField() + ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth") + system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response") + discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response") + recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response") + precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth") + + def f1_score(precision, recall): + precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall)) return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall) class SemanticF1(dspy.Module): - def __init__(self, threshold=0.66): + def __init__(self, threshold=0.66, decompositional=False): self.threshold = threshold - self.module = dspy.ChainOfThought(SemanticRecallPrecision) + + if decompositional: + self.module = dspy.ChainOfThought(DecompositionalSemanticRecallPrecision) + else: + self.module = dspy.ChainOfThought(SemanticRecallPrecision) def forward(self, example, pred, trace=None): scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response) @@ -30,42 +51,92 @@ def forward(self, example, pred, trace=None): return score if trace is None else score >= self.threshold -""" -Soon-to-be deprecated Signatures & Modules Below. -""" + +########### + + +class DecompositionalSemanticRecall(dspy.Signature): + """ + Estimate the completeness of a system's responses, against the ground truth. + You will first enumerate key ideas in each response, discuss their overlap, and then report completeness. 
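+    Completeness plays the role of semantic recall: the fraction (out of 1.0) of the
+    ground truth's key ideas that the system response covers.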
+ """ + + question: str = dspy.InputField() + ground_truth: str = dspy.InputField() + system_response: str = dspy.InputField() + ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth") + system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response") + discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response") + completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response") + + + +class DecompositionalGroundedness(dspy.Signature): + """ + Estimate the groundedness of a system's responses, against real retrieved documents written by people. + You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then + discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense. + """ + + question: str = dspy.InputField() + retrieved_context: str = dspy.InputField() + system_response: str = dspy.InputField() + system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response") + discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context") + groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context") + + +class CompleteAndGrounded(dspy.Module): + def __init__(self, threshold=0.66): + self.threshold = threshold + self.completeness_module = dspy.ChainOfThought(DecompositionalSemanticRecall) + self.groundedness_module = dspy.ChainOfThought(DecompositionalGroundedness) + + def forward(self, example, pred, trace=None): + completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response) + groundedness = self.groundedness_module(question=example.question, retrieved_context=pred.context, system_response=pred.response) + score = f1_score(groundedness.groundedness, completeness.completeness) + + return score if trace is None else score >= self.threshold + + + +# """ +# Soon-to-be deprecated Signatures & Modules Below. 
+# """ -class AnswerCorrectnessSignature(dspy.Signature): - """Verify that the predicted answer matches the gold answer.""" +# class AnswerCorrectnessSignature(dspy.Signature): +# """Verify that the predicted answer matches the gold answer.""" - question = dspy.InputField() - gold_answer = dspy.InputField(desc="correct answer for question") - predicted_answer = dspy.InputField(desc="predicted answer for question") - is_correct = dspy.OutputField(desc="True or False") +# question = dspy.InputField() +# gold_answer = dspy.InputField(desc="correct answer for question") +# predicted_answer = dspy.InputField(desc="predicted answer for question") +# is_correct = dspy.OutputField(desc="True or False") -class AnswerCorrectness(dspy.Module): - def __init__(self): - super().__init__() - self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature) +# class AnswerCorrectness(dspy.Module): +# def __init__(self): +# super().__init__() +# self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature) - def forward(self, question, gold_answer, predicted_answer): - return self.evaluate_correctness(question=question, gold_answer=gold_answer, predicted_answer=predicted_answer) +# def forward(self, question, gold_answer, predicted_answer): +# return self.evaluate_correctness(question=question, gold_answer=gold_answer, predicted_answer=predicted_answer) -class AnswerFaithfulnessSignature(dspy.Signature): - """Verify that the predicted answer is based on the provided context.""" +# class AnswerFaithfulnessSignature(dspy.Signature): +# """Verify that the predicted answer is based on the provided context.""" - context = dspy.InputField(desc="relevant facts for producing answer") - question = dspy.InputField() - answer = dspy.InputField(desc="often between 1 and 5 words") - is_faithful = dspy.OutputField(desc="True or False") +# context = dspy.InputField(desc="relevant facts for producing answer") +# question = dspy.InputField() +# answer = dspy.InputField(desc="often between 1 and 5 words") +# is_faithful = dspy.OutputField(desc="True or False") -class AnswerFaithfulness(dspy.Module): - def __init__(self): - super().__init__() - self.evaluate_faithfulness = dspy.ChainOfThought(AnswerFaithfulnessSignature) +# class AnswerFaithfulness(dspy.Module): +# def __init__(self): +# super().__init__() +# self.evaluate_faithfulness = dspy.ChainOfThought(AnswerFaithfulnessSignature) - def forward(self, context, question, answer): - return self.evaluate_faithfulness(context=context, question=question, answer=answer) +# def forward(self, context, question, answer): +# return self.evaluate_faithfulness(context=context, question=question, answer=answer) diff --git a/dspy/predict/knn.py b/dspy/predict/knn.py index 434a07aaa..17a5a3fb7 100644 --- a/dspy/predict/knn.py +++ b/dspy/predict/knn.py @@ -13,7 +13,7 @@ def __init__(self, k: int, trainset: List[dsp.Example], vectorizer=None): Args: k: Number of nearest neighbors to retrieve trainset: List of training examples to search through - vectorizer: Optional dspy.Embedding for computing embeddings. If None, uses sentence-transformers. + vectorizer: Optional dspy.Embedder for computing embeddings. If None, uses sentence-transformers. Example: >>> trainset = [dsp.Example(input="hello", output="world"), ...] 
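         >>> # each example is embedded from its input fields, serialized as "key: value" pairs joined by " | "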
@@ -24,7 +24,7 @@ def __init__(self, k: int, trainset: List[dsp.Example], vectorizer=None): self.k = k self.trainset = trainset - self.embedding = vectorizer or dspy.Embedding(dsp.SentenceTransformersVectorizer()) + self.embedding = vectorizer or dspy.Embedder(dsp.SentenceTransformersVectorizer()) trainset_casted_to_vectorize = [ " | ".join([f"{key}: {value}" for key, value in example.items() if key in example._input_keys]) for example in self.trainset diff --git a/dspy/retrievers/__init__.py b/dspy/retrievers/__init__.py new file mode 100644 index 000000000..3fdc977bb --- /dev/null +++ b/dspy/retrievers/__init__.py @@ -0,0 +1 @@ +from .embeddings import Embeddings \ No newline at end of file diff --git a/dspy/retrievers/embeddings.py b/dspy/retrievers/embeddings.py new file mode 100644 index 000000000..75e1ff1fb --- /dev/null +++ b/dspy/retrievers/embeddings.py @@ -0,0 +1,83 @@ +import numpy as np +from typing import Any, List, Optional +from dspy.utils.unbatchify import Unbatchify + +# TODO: Add .save and .load methods! + + +class Embeddings: + def __init__( + self, + corpus: List[str], + embedder, + k: int = 5, + callbacks: Optional[List[Any]] = None, + cache: bool = False, + brute_force_threshold: int = 20_000, + normalize: bool = True + ): + assert cache is False, "Caching is not supported for embeddings-based retrievers" + + self.embedder = embedder + self.k = k + self.corpus = corpus + self.normalize = normalize + + self.corpus_embeddings = self.embedder(self.corpus) + self.corpus_embeddings = self._normalize(self.corpus_embeddings) if self.normalize else self.corpus_embeddings + + self.index = self._build_faiss() if len(corpus) >= brute_force_threshold else None + self.search_fn = Unbatchify(self._batch_forward) + + def __call__(self, query: str): + return self.forward(query) + + def forward(self, query: str): + import dspy + return dspy.Prediction(passages=self.search_fn(query)) + + def _batch_forward(self, queries: List[str]): + q_embeds = self.embedder(queries) + q_embeds = self._normalize(q_embeds) if self.normalize else q_embeds + + pids = self._faiss_search(q_embeds, self.k * 10) if self.index else None + pids = np.tile(np.arange(len(self.corpus)), (len(queries), 1)) if pids is None else pids + + return self._rerank_and_predict(q_embeds, pids) + + def _build_faiss(self): + nbytes = 32 + partitions = int(2 * np.sqrt(len(self.corpus))) + dim = self.corpus_embeddings.shape[1] + + try: + import faiss + except ImportError: + raise ImportError("Please `pip install faiss-cpu` or increase `brute_force_threshold` to avoid FAISS.") + + quantizer = faiss.IndexFlatL2(dim) + index = faiss.IndexIVFPQ(quantizer, dim, partitions, nbytes, 8) + + print(f"Training a {nbytes}-byte FAISS index with {partitions} partitions, based on " + f"{len(self.corpus)} x {dim}-dim embeddings") + index.train(self.corpus_embeddings) + index.add(self.corpus_embeddings) + index.nprobe = min(16, partitions) + + return index + + def _faiss_search(self, query_embeddings: np.ndarray, num_candidates: int): + return self.index.search(query_embeddings, num_candidates)[1] + + def _rerank_and_predict(self, q_embeds: np.ndarray, candidate_indices: np.ndarray): + candidate_embeddings = self.corpus_embeddings[candidate_indices] + scores = np.einsum('qd,qkd->qk', q_embeds, candidate_embeddings) + + top_k_indices = np.argsort(-scores, axis=1)[:, :self.k] + top_indices = candidate_indices[np.arange(len(q_embeds))[:, None], top_k_indices] + + return [[self.corpus[idx] for idx in indices] for indices in top_indices] + + def 
_normalize(self, embeddings: np.ndarray): + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + return embeddings / np.maximum(norms, 1e-10) diff --git a/dspy/utils/unbatchify.py b/dspy/utils/unbatchify.py new file mode 100644 index 000000000..bafdc8cb3 --- /dev/null +++ b/dspy/utils/unbatchify.py @@ -0,0 +1,111 @@ +import time +import queue +import threading +from typing import Any, Callable, List +from concurrent.futures import Future + +class Unbatchify: + def __init__( + self, + batch_fn: Callable[[List[Any]], List[Any]], + max_batch_size: int = 32, + max_wait_time: float = 0.1 + ): + """ + Initializes the Unbatchify. + + Args: + batch_fn: The batch-processing function that accepts a list of inputs and returns a list of outputs. + max_batch_size: The maximum number of items to include in a batch. + max_wait_time: The maximum time (in seconds) to wait for batch to fill before processing. + """ + + self.batch_fn = batch_fn + self.max_batch_size = max_batch_size + self.max_wait_time = max_wait_time + self.input_queue = queue.Queue() + self.stop_event = threading.Event() + self.worker_thread = threading.Thread(target=self._worker) + self.worker_thread.daemon = True # Ensures thread exits when main program exits + self.worker_thread.start() + + def __call__(self, input_item: Any) -> Any: + """ + Thread-safe function that accepts a single input and returns the corresponding output. + + Args: + input_item: The single input item to process. + + Returns: + The output corresponding to the input_item after processing through batch_fn. + """ + future = Future() + self.input_queue.put((input_item, future)) + try: + result = future.result() + except Exception as e: + raise e + return result + + def _worker(self): + """ + Worker thread that batches inputs and processes them using batch_fn. + """ + while not self.stop_event.is_set(): + batch = [] + futures = [] + start_time = time.time() + while len(batch) < self.max_batch_size and (time.time() - start_time) < self.max_wait_time: + try: + input_item, future = self.input_queue.get(timeout=self.max_wait_time) + batch.append(input_item) + futures.append(future) + except queue.Empty: + break + + if batch: + try: + outputs = self.batch_fn(batch) + for output, future in zip(outputs, futures): + future.set_result(output) + except Exception as e: + for future in futures: + future.set_exception(e) + else: + time.sleep(0.01) + + # Clean up remaining items when stopping + while True: + try: + _, future = self.input_queue.get_nowait() + future.set_exception(RuntimeError("Unbatchify is closed")) + except queue.Empty: + break + + print("Worker thread has been terminated.") + + def close(self): + """ + Stops the worker thread and cleans up resources. + """ + if not self.stop_event.is_set(): + self.stop_event.set() + self.worker_thread.join() + + def __enter__(self): + """ + Enables use as a context manager. + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Ensures resources are cleaned up when exiting context. + """ + self.close() + + def __del__(self): + """ + Ensures the worker thread is terminated when the object is garbage collected. 
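+        Safe even if close() was already called explicitly, since close() checks
+        stop_event before doing any work.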
+ """ + self.close() diff --git a/tests/clients/test_embedding.py b/tests/clients/test_embedding.py index d12850e52..0ac9e24ba 100644 --- a/tests/clients/test_embedding.py +++ b/tests/clients/test_embedding.py @@ -2,7 +2,7 @@ from unittest.mock import Mock, patch import numpy as np -from dspy.clients.embedding import Embedding +from dspy.clients.embedding import Embedder # Mock response format similar to litellm's embedding response. @@ -27,7 +27,7 @@ def test_litellm_embedding(): mock_litellm.return_value = MockEmbeddingResponse(mock_embeddings) # Create embedding instance and call it. - embedding = Embedding(model) + embedding = Embedder(model) result = embedding(inputs) # Verify litellm was called with correct parameters. @@ -51,7 +51,7 @@ def mock_embedding_fn(texts): return expected_embeddings # Create embedding instance with callable - embedding = Embedding(mock_embedding_fn) + embedding = Embedder(mock_embedding_fn) result = embedding(inputs) np.testing.assert_allclose(result, expected_embeddings) @@ -60,5 +60,5 @@ def mock_embedding_fn(texts): def test_invalid_model_type(): # Test that invalid model type raises ValueError with pytest.raises(ValueError): - embedding = Embedding(123) # Invalid model type + embedding = Embedder(123) # Invalid model type embedding(["test"]) From 756b619e6d74fc951755093a16ddaba3ee7aa5c5 Mon Sep 17 00:00:00 2001 From: Chen Qian Date: Fri, 22 Nov 2024 19:22:34 -0800 Subject: [PATCH 04/19] Style fix for chat_adapter (#1846) --- dspy/adapters/chat_adapter.py | 165 +++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 73 deletions(-) diff --git a/dspy/adapters/chat_adapter.py b/dspy/adapters/chat_adapter.py index edb5e1870..a8ae380cd 100644 --- a/dspy/adapters/chat_adapter.py +++ b/dspy/adapters/chat_adapter.py @@ -1,26 +1,23 @@ -import re -from typing import Any, Union -from dsp.adapters.base_template import Field -from dspy.signatures.signature import Signature -from .base import Adapter -from .image_utils import encode_image, Image - import ast -import json import enum import inspect -import pydantic +import json +import re import textwrap +from collections.abc import Mapping from itertools import chain +from typing import Any, Dict, List, Literal, NamedTuple, Union, get_args, get_origin + +import pydantic from pydantic import TypeAdapter -from collections.abc import Mapping from pydantic.fields import FieldInfo -from typing import Dict, KeysView, List, Literal, NamedTuple, get_args, get_origin +from dsp.adapters.base_template import Field from dspy.adapters.base import Adapter -from ..signatures.field import OutputField -from ..signatures.signature import SignatureMeta -from ..signatures.utils import get_dspy_field_type +from dspy.adapters.image_utils import Image, encode_image +from dspy.signatures.field import OutputField +from dspy.signatures.signature import Signature, SignatureMeta +from dspy.signatures.utils import get_dspy_field_type field_header_pattern = re.compile(r"\[\[ ## (\w+) ## \]\]") @@ -33,12 +30,15 @@ class FieldInfoWithName(NamedTuple): # Built-in field indicating that a chat turn has been completed. BuiltInCompletedOutputFieldInfo = FieldInfoWithName(name="completed", info=OutputField()) + class ChatAdapter(Adapter): def format(self, signature: Signature, demos: list[dict[str, Any]], inputs: dict[str, Any]) -> list[dict[str, Any]]: messages: list[dict[str, Any]] = [] # Extract demos where some of the output_fields are not filled in. 
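         # Incomplete demos are still rendered as few-shot turns below, just with an added
         # note telling the LM that some fields were not supplied for that example.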
- incomplete_demos = [demo for demo in demos if not all(k in demo and demo[k] is not None for k in signature.fields)] + incomplete_demos = [ + demo for demo in demos if not all(k in demo and demo[k] is not None for k in signature.fields) + ] complete_demos = [demo for demo in demos if demo not in incomplete_demos] # Filter out demos that don't have at least one input and one output field. incomplete_demos = [ @@ -99,6 +99,7 @@ def format_finetune_data(self, signature, demos, inputs, outputs): # Wrap the messages in a dictionary with a "messages" key return dict(messages=messages) + def format_turn(self, signature, values, role, incomplete=False): return format_turn(signature, values, role, incomplete) @@ -112,8 +113,7 @@ def format_fields(self, signature, values, role): } return format_fields(fields_with_values) - - + def format_blob(blob): if "\n" not in blob and "«" not in blob and "»" not in blob: @@ -139,6 +139,7 @@ def format_input_list_field_value(value: List[Any]) -> str: return "\n".join([f"[{idx+1}] {format_blob(txt)}" for idx, txt in enumerate(value)]) + def _serialize_for_json(value): if isinstance(value, pydantic.BaseModel): return value.model_dump() @@ -149,6 +150,7 @@ def _serialize_for_json(value): else: return value + def _format_field_value(field_info: FieldInfo, value: Any, assume_text=True) -> Union[str, dict]: """ Formats the value of the specified field according to the field's DSPy type (input or output), @@ -171,7 +173,7 @@ def _format_field_value(field_info: FieldInfo, value: Any, assume_text=True) -> if assume_text: return string_value - elif (isinstance(value, Image) or field_info.annotation == Image): + elif isinstance(value, Image) or field_info.annotation == Image: # This validation should happen somewhere else # Safe to import PIL here because it's only imported when an image is actually being formatted try: @@ -193,7 +195,6 @@ def _format_field_value(field_info: FieldInfo, value: Any, assume_text=True) -> return {"type": "text", "text": string_value} - def format_fields(fields_with_values: Dict[FieldInfoWithName, Any], assume_text=True) -> Union[str, List[dict]]: """ Formats the values of the specified fields according to the field's DSPy type (input or output), @@ -222,10 +223,11 @@ def format_fields(fields_with_values: Dict[FieldInfoWithName, Any], assume_text= else: return output + def parse_value(value, annotation): if annotation is str: return str(value) - + parsed_value = value if isinstance(annotation, enum.EnumMeta): @@ -238,70 +240,85 @@ def parse_value(value, annotation): parsed_value = ast.literal_eval(value) except (ValueError, SyntaxError): parsed_value = value - + return TypeAdapter(annotation).validate_python(parsed_value) -def format_turn(signature, values, role, incomplete=False): - fields_to_collapse = [] +def format_turn(signature, values, role, incomplete=False): """ Constructs a new message ("turn") to append to a chat thread. The message is carefully formatted so that it can instruct an LLM to generate responses conforming to the specified DSPy signature. Args: - signature: The DSPy signature to which future LLM responses should conform. - values: A dictionary mapping field names (from the DSPy signature) to corresponding values - that should be included in the message. - role: The role of the message, which can be either "user" or "assistant". - incomplete: If True, indicates that output field values are present in the set of specified - ``values``. If False, indicates that ``values`` only contains input field values. 
+ signature: The DSPy signature to which future LLM responses should conform. + values: A dictionary mapping field names (from the DSPy signature) to corresponding values + that should be included in the message. + role: The role of the message, which can be either "user" or "assistant". + incomplete: If True, indicates that output field values are present in the set of specified + ``values``. If False, indicates that ``values`` only contains input field values. + Returns: - A chat message that can be appended to a chat thread. The message contains two string fields: - ``role`` ("user" or "assistant") and ``content`` (the message text). + A chat message that can be appended to a chat thread. The message contains two string fields: + ``role`` ("user" or "assistant") and ``content`` (the message text). """ + fields_to_collapse = [] content = [] if role == "user": - fields: Dict[str, FieldInfo] = signature.input_fields + fields = signature.input_fields if incomplete: - fields_to_collapse.append({"type": "text", "text": "This is an example of the task, though some input or output fields are not supplied."}) + fields_to_collapse.append( + { + "type": "text", + "text": "This is an example of the task, though some input or output fields are not supplied.", + } + ) else: - fields: Dict[str, FieldInfo] = signature.output_fields + fields = signature.output_fields # Add the built-in field indicating that the chat turn has been completed fields[BuiltInCompletedOutputFieldInfo.name] = BuiltInCompletedOutputFieldInfo.info values = {**values, BuiltInCompletedOutputFieldInfo.name: ""} - field_names: KeysView = fields.keys() + field_names = fields.keys() if not incomplete: if not set(values).issuperset(set(field_names)): raise ValueError(f"Expected {field_names} but got {values.keys()}") - - fields_to_collapse.extend(format_fields( - fields_with_values={ - FieldInfoWithName(name=field_name, info=field_info): values.get( - field_name, "Not supplied for this particular example." - ) - for field_name, field_info in fields.items() - }, - assume_text=False - )) + + fields_to_collapse.extend( + format_fields( + fields_with_values={ + FieldInfoWithName(name=field_name, info=field_info): values.get( + field_name, "Not supplied for this particular example." + ) + for field_name, field_info in fields.items() + }, + assume_text=False, + ) + ) if role == "user": output_fields = list(signature.output_fields.keys()) + def type_info(v): - return f" (must be formatted as a valid Python {get_annotation_name(v.annotation)})" \ - if v.annotation is not str else "" + return ( + f" (must be formatted as a valid Python {get_annotation_name(v.annotation)})" + if v.annotation is not str + else "" + ) + if output_fields: - fields_to_collapse.append({ - "type": "text", - "text": "Respond with the corresponding output fields, starting with the field " - + ", then ".join(f"`[[ ## {f} ## ]]`{type_info(v)}" for f, v in signature.output_fields.items()) - + ", and then ending with the marker for `[[ ## completed ## ]]`." 
- }) - + fields_to_collapse.append( + { + "type": "text", + "text": "Respond with the corresponding output fields, starting with the field " + + ", then ".join(f"`[[ ## {f} ## ]]`{type_info(v)}" for f, v in signature.output_fields.items()) + + ", and then ending with the marker for `[[ ## completed ## ]]`.", + } + ) + # flatmap the list if any items are lists otherwise keep the item - flattened_list = list(chain.from_iterable( - item if isinstance(item, list) else [item] for item in fields_to_collapse - )) + flattened_list = list( + chain.from_iterable(item if isinstance(item, list) else [item] for item in fields_to_collapse) + ) if all(message.get("type", None) == "text" for message in flattened_list): content = "\n\n".join(message.get("text") for message in flattened_list) @@ -314,16 +331,16 @@ def type_info(v): if not collapsed_messages: collapsed_messages.append(item) continue - - # If current item is image, add to collapsed_messages + + # If the current item is image, add to collapsed_messages if item.get("type") == "image_url": if collapsed_messages[-1].get("type") == "text": collapsed_messages[-1]["text"] += "\n" collapsed_messages.append(item) - # If previous item is text and current item is text, append to previous item + # If the previous item is text and current item is text, append to the previous item elif collapsed_messages[-1].get("type") == "text": collapsed_messages[-1]["text"] += "\n\n" + item["text"] - # If previous item is not text(aka image), add current item as a new item + # If the previous item is not text(aka image), add the current item as a new item else: item["text"] = "\n\n" + item["text"] collapsed_messages.append(item) @@ -357,16 +374,18 @@ def enumerate_fields(fields: dict[str, Field]) -> str: def move_type_to_front(d): # Move the 'type' key to the front of the dictionary, recursively, for LLM readability/adherence. 
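     # Models appear to follow a JSON schema more reliably when the discriminating "type"
     # key comes first, hence the re-sort before the schema is serialized.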
     if isinstance(d, Mapping):
-        return {k: move_type_to_front(v) for k, v in sorted(d.items(), key=lambda item: (item[0] != 'type', item[0]))}
+        return {k: move_type_to_front(v) for k, v in sorted(d.items(), key=lambda item: (item[0] != "type", item[0]))}
     elif isinstance(d, list):
         return [move_type_to_front(item) for item in d]
     return d
 
+
 def prepare_schema(type_):
     schema = pydantic.TypeAdapter(type_).json_schema()
     schema = move_type_to_front(schema)
     return schema
 
+
 def prepare_instructions(signature: SignatureMeta):
     parts = []
     parts.append("Your input fields are:\n" + enumerate_fields(signature.input_fields))
@@ -374,21 +393,21 @@ def prepare_instructions(signature: SignatureMeta):
     parts.append("All interactions will be structured in the following way, with the appropriate values filled in.")
 
     def field_metadata(field_name, field_info):
-        type_ = field_info.annotation
+        field_type = field_info.annotation
 
-        if get_dspy_field_type(field_info) == 'input' or type_ is str:
+        if get_dspy_field_type(field_info) == "input" or field_type is str:
             desc = ""
-        elif type_ is bool:
+        elif field_type is bool:
             desc = "must be True or False"
-        elif type_ in (int, float):
-            desc = f"must be a single {type_.__name__} value"
-        elif inspect.isclass(type_) and issubclass(type_, enum.Enum):
-            desc= f"must be one of: {'; '.join(type_.__members__)}"
-        elif hasattr(type_, '__origin__') and type_.__origin__ is Literal:
-            desc = f"must be one of: {'; '.join([str(x) for x in type_.__args__])}"
+        elif field_type in (int, float):
+            desc = f"must be a single {field_type.__name__} value"
+        elif inspect.isclass(field_type) and issubclass(field_type, enum.Enum):
+            desc = f"must be one of: {'; '.join(field_type.__members__)}"
+        elif hasattr(field_type, "__origin__") and field_type.__origin__ is Literal:
+            desc = f"must be one of: {'; '.join([str(x) for x in field_type.__args__])}"
         else:
             desc = "must be parseable according to the following JSON schema: "
-            desc += json.dumps(prepare_schema(type_), ensure_ascii=False)
+            desc += json.dumps(prepare_schema(field_type), ensure_ascii=False)
 
         desc = (" " * 8) + f"# note: the value you produce {desc}" if desc else ""
         return f"{{{field_name}}}{desc}"
@@ -399,7 +418,7 @@ def format_signature_fields_for_instructions(fields: Dict[str, FieldInfo]):
             FieldInfoWithName(name=field_name, info=field_info): field_metadata(field_name, field_info)
             for field_name, field_info in fields.items()
         },
-        assume_text=True
+        assume_text=True,
     )
 
     parts.append(format_signature_fields_for_instructions(signature.input_fields))

From 2b6b20489bb249a54d39716ed7e35be5350524b9 Mon Sep 17 00:00:00 2001
From: Bryan <32337174+b-d055@users.noreply.github.com>
Date: Sat, 23 Nov 2024 11:45:32 -0500
Subject: [PATCH 05/19] Removing duplicate sentence in signature docs (#1848)

---
 docs/docs/learn/programming/signatures.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/docs/learn/programming/signatures.md b/docs/docs/learn/programming/signatures.md
index af400b11a..450f69f64 100644
--- a/docs/docs/learn/programming/signatures.md
+++ b/docs/docs/learn/programming/signatures.md
@@ -64,7 +64,6 @@ The 21-year-old Lee made seven appearances and scored one goal for West Ham last
 
 Many DSPy modules (except `dspy.Predict`) return auxiliary information by expanding your signature under the hood.
 
-For example, `dspy.ChainOfThought` also adds a `reasoning` field that includes the LM's reasoning before it generates the output `summary`.
For example, `dspy.ChainOfThought` also adds a `reasoning` field that includes the LM's reasoning before it generates the output `summary`. ```python From 1b9b280f6b456114e6929ba74d9f317d8fd03f60 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 22:20:12 -0800 Subject: [PATCH 06/19] Expand default cache limit to 30GB (#1850) --- dspy/clients/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dspy/clients/__init__.py b/dspy/clients/__init__.py index 2fc0e2543..de2e76f3c 100644 --- a/dspy/clients/__init__.py +++ b/dspy/clients/__init__.py @@ -8,8 +8,15 @@ from litellm.caching import Cache DISK_CACHE_DIR = os.environ.get("DSPY_CACHEDIR") or os.path.join(Path.home(), ".dspy_cache") +DISK_CACHE_LIMIT = int(os.environ.get("DSPY_CACHE_LIMIT", 3e10)) # 30 GB default + +# TODO: There's probably value in getting litellm to support FanoutCache and to separate the limit for +# the LM cache from the embeddings cache. Then we can lower the default 30GB limit. litellm.cache = Cache(disk_cache_dir=DISK_CACHE_DIR, type="disk") +litellm.cache.cache.disk_cache.reset('size_limit', DISK_CACHE_LIMIT) + litellm.telemetry = False + # Turn off by default to avoid LiteLLM logging during every LM call. litellm.suppress_debug_info = True From 55ddec1c8a789dc1a0bae3687711a93fda4bf168 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 22:22:09 -0800 Subject: [PATCH 07/19] Update RAG tutorial to use the downsampled RAG-QA Arena upload (#1851) --- docs/docs/index.md | 2 +- docs/docs/tutorials/rag/index.ipynb | 2082 ++++++++++++++++++++++----- dspy/evaluate/auto_evaluation.py | 8 +- dspy/utils/__init__.py | 14 + 4 files changed, 1725 insertions(+), 381 deletions(-) diff --git a/docs/docs/index.md b/docs/docs/index.md index cb16c7c69..81b0b7f28 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -368,7 +368,7 @@ BootstrapFS on MATH with a tiny LM like Llama-3.2 with Ollama (maybe with a big ## 3) **DSPy's Ecosystem** advances open-source AI research. -Compared to working on or with monolithic LMs, DSPy's modular paradigm aims to enable a large community to improve the compositional architectures, inference-time strategies, and optimizers for LM programs in an open, distributed way. It gives you more control, helps you iterate much faster, and allows your programs to get better over time by applying the latest optimizers or modules. +Compared to monolithic LMs, DSPy's modular paradigm enables a large community to improve the compositional architectures, inference-time strategies, and optimizers for LM programs in an open, distributed way. This gives DSPy users more control, helps them iterate much faster, and allows their programs to get better over time by applying the latest optimizers or modules. The DSPy research effort started at Stanford NLP in Feb 2022, building on what we learned from developing early [compound LM systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/) like [ColBERT-QA](https://arxiv.org/abs/2007.00814), [Baleen](https://arxiv.org/abs/2101.00436), and [Hindsight](https://arxiv.org/abs/2110.07752). The first version was released as [DSP](https://arxiv.org/abs/2212.14024) in Dec 2022 and evolved by Oct 2023 into [DSPy](https://arxiv.org/abs/2310.03714). Thanks to [250 contributors](https://github.com/stanfordnlp/dspy/graphs/contributors), DSPy has introduced tens of thousands of people to building and optimizing modular LM programs. 
diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb index 59f3b07d3..a2dbe1108 100644 --- a/docs/docs/tutorials/rag/index.ipynb +++ b/docs/docs/tutorials/rag/index.ipynb @@ -55,11 +55,11 @@ "text": [ "In Linux, \"high memory\" and \"low memory\" refer to different regions of the system's memory address space, particularly in the context of 32-bit architectures.\n", "\n", - "- **Low Memory**: This typically refers to the first 896 MB of memory in a 32-bit system. The kernel can directly access this memory without any special handling. It is used for kernel data structures and for user processes. The low memory region is crucial for the kernel's operation, as it allows for efficient memory management and access.\n", + "- **Low Memory**: This typically refers to the memory that is directly accessible by the kernel. In a 32-bit system, this is usually the first 896 MB of RAM (from 0 to 896 MB). The kernel can directly map this memory, making it faster for the kernel to access and manage. Low memory is used for kernel data structures and for user processes that require direct access to memory.\n", "\n", - "- **High Memory**: This refers to memory above the 896 MB threshold in a 32-bit system. The kernel cannot directly access this memory; instead, it must use special mechanisms to map it into the kernel's address space when needed. High memory is often used for user processes and can be allocated dynamically, but it requires additional overhead for the kernel to manage.\n", + "- **High Memory**: This refers to the memory above the low memory limit, which is not directly accessible by the kernel in a 32-bit system. This area is typically above 896 MB. The kernel cannot directly access this memory without using special mechanisms, such as mapping it into the kernel's address space when needed. High memory is used for user processes that require more memory than what is available in low memory.\n", "\n", - "In 64-bit systems, the distinction between high and low memory is less relevant, as the addressable memory space is significantly larger, and the kernel can access most of the memory directly.\n" + "In summary, low memory is directly accessible by the kernel, while high memory requires additional steps for the kernel to access it, especially in 32-bit systems. In 64-bit systems, this distinction is less significant as the kernel can address a much larger memory space directly.\n" ] } ], @@ -94,7 +94,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-10T12:39:19.458514]\u001b[0m\n", + "\u001b[34m[2024-11-23T22:12:48.901453]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -131,11 +131,11 @@ "\u001b[32m[[ ## response ## ]]\n", "In Linux, \"high memory\" and \"low memory\" refer to different regions of the system's memory address space, particularly in the context of 32-bit architectures.\n", "\n", - "- **Low Memory**: This typically refers to the first 896 MB of memory in a 32-bit system. The kernel can directly access this memory without any special handling. It is used for kernel data structures and for user processes. The low memory region is crucial for the kernel's operation, as it allows for efficient memory management and access.\n", + "- **Low Memory**: This typically refers to the memory that is directly accessible by the kernel. In a 32-bit system, this is usually the first 896 MB of RAM (from 0 to 896 MB). The kernel can directly map this memory, making it faster for the kernel to access and manage. 
Low memory is used for kernel data structures and for user processes that require direct access to memory.\n", "\n", - "- **High Memory**: This refers to memory above the 896 MB threshold in a 32-bit system. The kernel cannot directly access this memory; instead, it must use special mechanisms to map it into the kernel's address space when needed. High memory is often used for user processes and can be allocated dynamically, but it requires additional overhead for the kernel to manage.\n", + "- **High Memory**: This refers to the memory above the low memory limit, which is not directly accessible by the kernel in a 32-bit system. This area is typically above 896 MB. The kernel cannot directly access this memory without using special mechanisms, such as mapping it into the kernel's address space when needed. High memory is used for user processes that require more memory than what is available in low memory.\n", "\n", - "In 64-bit systems, the distinction between high and low memory is less relevant, as the addressable memory space is significantly larger, and the kernel can access most of the memory directly.\n", + "In summary, low memory is directly accessible by the kernel, while high memory requires additional steps for the kernel to access it, especially in 32-bit systems. In 64-bit systems, this distinction is less significant as the kernel can address a much larger memory space directly.\n", "\n", "[[ ## completed ## ]]\u001b[0m\n", "\n", @@ -170,8 +170,8 @@ "data": { "text/plain": [ "Prediction(\n", - " reasoning=\"The placement of curly braces on their own line is largely a matter of coding style and conventions. In some programming languages and style guides, such as those used in C, C++, and Java, it is common to place opening curly braces on the same line as the control statement (like `if`, `for`, etc.) and closing braces on a new line. However, other styles, such as the Allman style, advocate for placing both opening and closing braces on their own lines. Ultimately, the decision should be based on the team's coding standards or personal preference, as long as it maintains readability and consistency.\",\n", - " response=\"Curly braces can either appear on their own line or not, depending on the coding style you choose to follow. It's important to adhere to a consistent style throughout your codebase.\"\n", + " reasoning='The placement of curly braces on their own line depends on the coding style and conventions being followed. In some programming languages and style guides, such as the Allman style, curly braces are placed on their own line to enhance readability. In contrast, other styles, like K&R style, place the opening brace on the same line as the control statement. Ultimately, it is a matter of personal or team preference, and consistency within a project is key.',\n", + " response='Curly braces can appear on their own line depending on the coding style you are following. If you prefer a style that enhances readability, such as the Allman style, then yes, they should be on their own line. However, if you are following a different style, like K&R, they may not need to be. Consistency is important, so choose a style and stick with it.'\n", ")" ] }, @@ -191,7 +191,7 @@ "source": [ "\n", "\n", - "Interestingly, asking for reasoning made the output `response` shorter in this case. Is this a good thing or a bad thing? 
It depends on what you need: there's no free lunch, but DSPy gives you the tools to experiment with different strategies extremely quickly.\n", + "Interestingly, asking for reasoning can make the output `response` shorter in this case. Is this a good thing or a bad thing? It depends on what you need: there's no free lunch, but DSPy gives you the tools to experiment with different strategies extremely quickly.\n", "\n", "By the way, `dspy.ChainOfThought` is implemented in DSPy, using `dspy.Predict`. This is a good place to `dspy.inspect_history` if you're curious.\n" ] @@ -220,25 +220,14 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "import ujson\n", - "import requests\n", + "from dspy.utils import download\n", "\n", - "def download(url):\n", - " filename = os.path.basename(url)\n", - " remote_size = int(requests.head(url, allow_redirects=True).headers.get('Content-Length', 0))\n", - " local_size = os.path.getsize(filename) if os.path.exists(filename) else 0\n", + "# Download question--answer pairs from the RAG-QA Arena \"Tech\" dataset.\n", + "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl\")\n", "\n", - " if local_size != remote_size:\n", - " print(f\"Downloading '{filename}'...\")\n", - " with requests.get(url, stream=True) as r, open(filename, 'wb') as f:\n", - " for chunk in r.iter_content(chunk_size=8192): f.write(chunk)\n", - "\n", - "# Download 500 question--answer pairs from the RAG-QA Arena \"Tech\" dataset.\n", - "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_500.json\")\n", - "\n", - "with open('ragqa_arena_tech_500.json') as f:\n", - " data = ujson.load(f)" + "with open(\"ragqa_arena_tech_examples.jsonl\") as f:\n", + " data = [ujson.loads(line) for line in f]" ] }, { @@ -249,8 +238,9 @@ { "data": { "text/plain": [ - "{'question': 'how to transfer whatsapp voice message to computer?',\n", - " 'response': 'To transfer voice notes from WhatsApp on your device to your computer, you have the option to select the \"Share\" feature within the app and send the files via Email, Gmail, Bluetooth, or other available services. \\nYou can also move the files onto your phone\\'s SD card, connect your phone to your computer via a USB cable, then find and transfer the files via File Explorer on your PC. \\nAlternatively, you can choose to attach all the desired voice notes to an email and, from your phone, send them to your own email address. \\nUpon receiving the email on your computer, you can then download the voice note attachments.'}" + "{'question': 'why igp is used in mpls?',\n", + " 'response': \"An IGP exchanges routing prefixes between gateways/routers. \\nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \\nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.\",\n", + " 'gold_doc_ids': [2822, 2823]}" ] }, "execution_count": 6, @@ -282,7 +272,7 @@ { "data": { "text/plain": [ - "Example({'question': 'what are high memory and low memory on linux?', 'response': '\"High Memory\" refers to the application or user space, the memory that user programs can use and which isn\\'t permanently mapped in the kernel\\'s space, while \"Low Memory\" is the kernel\\'s space, which the kernel can address directly and is permanently mapped. 
\\nThe user cannot access the Low Memory as it is set aside for the required kernel programs.'}) (input_keys={'question'})" + "Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \\n\\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})" ] }, "execution_count": 7, @@ -305,11 +295,12 @@ "\n", "Now, let's divide the data into:\n", "\n", - "- Training and Validation sets:\n", + "- Training (and with it Validation) set:\n", " - These are the splits you typically give to DSPy optimizers.\n", " - Optimizers typically learn directly from the training examples and check their progress using the validation examples.\n", " - It's good to have 30--300 examples for training and validation each.\n", " - For prompt optimizers in particular, it's often better to pass _more_ validation than training.\n", + " - Below, we'll use 200 in total. MIPROv2 will split them into 20% training and 80% validation if you don't pass a valset.\n", "\n", "- Development and Test sets: The rest, typically on the order of 30--1000, can be used for:\n", " - development (i.e., you can inspect them as you iterate on your system) and\n", @@ -324,7 +315,7 @@ { "data": { "text/plain": [ - "(50, 100, 150, 200)" + "(200, 300, 500)" ] }, "execution_count": 8, @@ -333,9 +324,12 @@ } ], "source": [ - "trainset, valset, devset, testset = data[:50], data[50:150], data[150:300], data[300:500]\n", + "import random\n", "\n", - "len(trainset), len(valset), len(devset), len(testset)" + "random.Random(0).shuffle(data)\n", + "trainset, devset, testset = data[:200], data[200:500], data[500:1000]\n", + "\n", + "len(trainset), len(devset), len(testset)" ] }, { @@ -346,8 +340,7 @@ "\n", "What kind of metric can suit our question-answering task? There are many choices, but since the answers are long, we may ask: How well does the system response _cover_ all key facts in the gold response? And the other way around, how well is the system response _not saying things_ that aren't in the gold response?\n", "\n", - "That metric is essentially a **semantic F1**, so let's load a `SemanticF1` metric from DSPy. This metric is actually implemented as a [very simple DSPy module](https://github.com/stanfordnlp/dspy/blob/77c2e1cceba427c7f91edb2ed5653276fb0c6de7/dspy/evaluate/auto_evaluation.py#L21) using whatever LM we're working with.\n", - "\n" + "That metric is essentially a **semantic F1**, so let's load a `SemanticF1` metric from DSPy. This metric is actually implemented as a [very simple DSPy module](https://github.com/stanfordnlp/dspy/blob/main/dspy/evaluate/auto_evaluation.py#L21) using whatever LM we're working with." ] }, { @@ -359,14 +352,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Question: \t what are high memory and low memory on linux?\n", + "Question: \t why are my text messages coming up as maybe?\n", "\n", - "Gold Response: \t \"High Memory\" refers to the application or user space, the memory that user programs can use and which isn't permanently mapped in the kernel's space, while \"Low Memory\" is the kernel's space, which the kernel can address directly and is permanently mapped. 
\n", - "The user cannot access the Low Memory as it is set aside for the required kernel programs.\n", + "Gold Response: \t This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \n", "\n", - "Predicted Response: \t In Linux, \"low memory\" refers to the first 896 MB of RAM, which is directly accessible by the kernel and used for kernel operations and user processes. \"High memory\" refers to memory above this limit, which is not directly accessible by the kernel in 32-bit systems and is used for user processes, requiring special handling to access. This distinction is crucial for effective memory management in Linux.\n", + "However, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.\n", "\n", - "Semantic F1 Score: 0.87\n" + "Predicted Response: \t Your text messages are showing up as \"maybe\" because your messaging app is uncertain about the sender's identity. This typically occurs when the sender's number is not saved in your contacts or if the message is from an unknown number. To resolve this, you can save the contact in your address book or check the message settings in your app.\n", + "\n", + "Semantic F1 Score: 0.33\n" ] } ], @@ -374,7 +368,7 @@ "from dspy.evaluate import SemanticF1\n", "\n", "# Instantiate the metric.\n", - "metric = SemanticF1()\n", + "metric = SemanticF1(decompositional=True)\n", "\n", "# Produce a prediction from our `cot` module, using the `example` above as input.\n", "pred = cot(**example.inputs())\n", @@ -410,7 +404,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-10T12:39:19.701005]\u001b[0m\n", + "\u001b[34m[2024-11-23T22:12:49.329836]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -421,8 +415,11 @@ "\n", "Your output fields are:\n", "1. `reasoning` (str)\n", - "2. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n", - "3. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n", + "2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth\n", + "3. `system_response_key_ideas` (str): enumeration of key ideas in the system response\n", + "4. `discussion` (str): discussion of the overlap between ground truth and system response\n", + "5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n", + "6. 
`precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n", "\n", "All interactions will be structured in the following way, with the appropriate values filled in.\n", "\n", @@ -438,6 +435,15 @@ "[[ ## reasoning ## ]]\n", "{reasoning}\n", "\n", + "[[ ## ground_truth_key_ideas ## ]]\n", + "{ground_truth_key_ideas}\n", + "\n", + "[[ ## system_response_key_ideas ## ]]\n", + "{system_response_key_ideas}\n", + "\n", + "[[ ## discussion ## ]]\n", + "{discussion}\n", + "\n", "[[ ## recall ## ]]\n", "{recall} # note: the value you produce must be a single float value\n", "\n", @@ -447,35 +453,50 @@ "[[ ## completed ## ]]\n", "\n", "In adhering to this structure, your objective is: \n", - " Compare a system's response to the ground truth to compute its recall and precision.\n", - " If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.\n", + " Compare a system's response to the ground truth to compute recall and precision of key ideas.\n", + " You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.\n", "\n", "\n", "\u001b[31mUser message:\u001b[0m\n", "\n", "[[ ## question ## ]]\n", - "what are high memory and low memory on linux?\n", + "why are my text messages coming up as maybe?\n", "\n", "[[ ## ground_truth ## ]]\n", - "\"High Memory\" refers to the application or user space, the memory that user programs can use and which isn't permanently mapped in the kernel's space, while \"Low Memory\" is the kernel's space, which the kernel can address directly and is permanently mapped. \n", - "The user cannot access the Low Memory as it is set aside for the required kernel programs.\n", + "This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \n", + "\n", + "However, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.\n", "\n", "[[ ## system_response ## ]]\n", - "In Linux, \"low memory\" refers to the first 896 MB of RAM, which is directly accessible by the kernel and used for kernel operations and user processes. \"High memory\" refers to memory above this limit, which is not directly accessible by the kernel in 32-bit systems and is used for user processes, requiring special handling to access. This distinction is crucial for effective memory management in Linux.\n", + "Your text messages are showing up as \"maybe\" because your messaging app is uncertain about the sender's identity. This typically occurs when the sender's number is not saved in your contacts or if the message is from an unknown number. 
To resolve this, you can save the contact in your address book or check the message settings in your app.\n", "\n", - "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n", + "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## ground_truth_key_ideas ## ]]`, then `[[ ## system_response_key_ideas ## ]]`, then `[[ ## discussion ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n", "\n", "\n", "\u001b[31mResponse:\u001b[0m\n", "\n", "\u001b[32m[[ ## reasoning ## ]]\n", - "The system response correctly identifies \"Low Memory\" as the portion of RAM directly accessible by the kernel and used for kernel operations, which aligns with the ground truth. It also mentions that \"High Memory\" refers to memory above a certain limit that is not directly accessible by the kernel, which is consistent with the ground truth's description of user space. However, the ground truth emphasizes that Low Memory is set aside for kernel programs and that users cannot access it, which is not explicitly stated in the system response. Overall, the key ideas are present, but the system response lacks the explicit mention of user access limitations for Low Memory.\n", + "The system response provides a general explanation for why text messages may show up as \"maybe,\" focusing on the uncertainty of the sender's identity. However, it does not mention the specific iOS features or the potential bug in iOS 11.2 that are highlighted in the ground truth. The ground truth explains that this feature is part of iOS 9's Proactivity features and mentions a specific bug in iOS 11.2, which is not addressed in the system response.\n", + "\n", + "[[ ## ground_truth_key_ideas ## ]]\n", + "1. Proactivity features introduced in iOS 9.\n", + "2. The system checks emails to associate phone numbers with contacts.\n", + "3. \"Maybe\" is displayed if the number is not saved in contacts.\n", + "4. Mention of a bug in iOS 11.2 causing \"Maybe\" to appear incorrectly.\n", + "\n", + "[[ ## system_response_key_ideas ## ]]\n", + "1. Text messages show up as \"maybe\" due to uncertainty about the sender's identity.\n", + "2. Occurs when the sender's number is not saved in contacts or is from an unknown number.\n", + "3. Suggests saving the contact or checking message settings.\n", + "\n", + "[[ ## discussion ## ]]\n", + "There is some overlap between the ground truth and the system response regarding the uncertainty of the sender's identity and the suggestion to save the contact. However, the system response lacks specific details about the iOS features and the bug mentioned in the ground truth. 
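As an aside, the judge transcript above maps almost one-to-one onto a DSPy signature. Here is a hand-written approximation, with the field names and descriptions taken verbatim from the transcript; the class that actually ships inside `SemanticF1` may differ in its details:

```python
import dspy

class SemanticRecallPrecision(dspy.Signature):
    """Compare a system's response to the ground truth to compute recall and precision of key ideas."""

    question: str = dspy.InputField()
    ground_truth: str = dspy.InputField()
    system_response: str = dspy.InputField()
    ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
    system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
    discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
    recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
    precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
```

Once the judge emits `recall` and `precision`, their harmonic mean reproduces the 0.33 score printed earlier (a sanity-check sketch; the exact combination inside `SemanticF1` may differ slightly):

```python
def f1_score(precision: float, recall: float) -> float:
    # Harmonic mean of precision and recall; defined as 0.0 when both are 0.
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

print(round(f1_score(0.5, 0.25), 2))  # 0.33, matching the Semantic F1 score above
```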
The ground truth provides a more comprehensive explanation of the \"maybe\" feature, while the system response is more general and does not address the iOS version specifics.\n", "\n", "[[ ## recall ## ]]\n", - "0.85\n", + "0.25\n", "\n", "[[ ## precision ## ]]\n", - "0.90\n", + "0.5\n", "\n", "[[ ## completed ## ]]\u001b[0m\n", "\n", @@ -502,12 +523,25 @@ "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 125.68 / 300 (41.9%): 100%|██████████| 300/300 [00:00<00:00, 598.18it/s]" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Average Metric: 55.380830691218016 / 150 (36.9): 100%|██████████| 150/150 [00:00<00:00, 513.51it/s]\n", - "2024/11/10 12:39:20 INFO dspy.evaluate.evaluate: Average Metric: 55.380830691218016 / 150 (36.9%)\n" + "2024/11/23 22:12:49 INFO dspy.evaluate.evaluate: Average Metric: 125.68228336477591 / 300 (41.9%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" ] }, { @@ -533,6 +567,7 @@ " \n", " question\n", " example_response\n", + " gold_doc_ids\n", " reasoning\n", " pred_response\n", " SemanticF1\n", @@ -541,44 +576,50 @@ " \n", " \n", " 0\n", - " why is mercurial considered to be easier than git?\n", - " Mercurial's syntax is considered more familiar, especially for tho...\n", - " Mercurial is often considered easier than Git for several reasons....\n", - " Mercurial is considered easier than Git primarily due to its simpl...\n", - " ✔️ [0.545]\n", + " when to use c over c++, and c++ over c?\n", + " If you are equally familiar with both C++ and C, it's advisable to...\n", + " [733]\n", + " C and C++ are both powerful programming languages, but they serve ...\n", + " Use C when you need low-level access to memory, require high perfo...\n", + " \n", " \n", " \n", " 1\n", - " open finder window from current terminal location?\n", - " If you type 'open .' in Terminal, it will open the current directo...\n", - " To open a Finder window from the current terminal location on a Ma...\n", - " You can open a Finder window from your current terminal location b...\n", - " ✔️ [0.667]\n", + " should images be stored in a git repository?\n", + " One viewpoint expresses that there is no significant downside, esp...\n", + " [6253, 6254, 6275, 6278, 8215]\n", + " Storing images in a Git repository can be beneficial for version c...\n", + " Images can be stored in a Git repository, but it's important to co...\n", + " ✔️ [0.444]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " question \\\n", - "0 why is mercurial considered to be easier than git? \n", - "1 open finder window from current terminal location? \n", + " question \\\n", + "0 when to use c over c++, and c++ over c? \n", + "1 should images be stored in a git repository? \n", "\n", " example_response \\\n", - "0 Mercurial's syntax is considered more familiar, especially for tho... \n", - "1 If you type 'open .' in Terminal, it will open the current directo... \n", + "0 If you are equally familiar with both C++ and C, it's advisable to... \n", + "1 One viewpoint expresses that there is no significant downside, esp... \n", + "\n", + " gold_doc_ids \\\n", + "0 [733] \n", + "1 [6253, 6254, 6275, 6278, 8215] \n", "\n", " reasoning \\\n", - "0 Mercurial is often considered easier than Git for several reasons.... \n", - "1 To open a Finder window from the current terminal location on a Ma... \n", + "0 C and C++ are both powerful programming languages, but they serve ... 
\n", + "1 Storing images in a Git repository can be beneficial for version c... \n", "\n", " pred_response \\\n", - "0 Mercurial is considered easier than Git primarily due to its simpl... \n", - "1 You can open a Finder window from your current terminal location b... \n", + "0 Use C when you need low-level access to memory, require high perfo... \n", + "1 Images can be stored in a Git repository, but it's important to co... \n", "\n", " SemanticF1 \n", - "0 ✔️ [0.545] \n", - "1 ✔️ [0.667] " + "0 \n", + "1 ✔️ [0.444] " ] }, "metadata": {}, @@ -594,7 +635,7 @@ " font-weight: bold;\n", " color: #555;\n", " margin: 10px 0;'>\n", - " ... 148 more rows not displayed ...\n", + " ... 298 more rows not displayed ...\n", " \n", " " ], @@ -608,7 +649,7 @@ { "data": { "text/plain": [ - "36.92" + "41.89" ] }, "execution_count": 11, @@ -640,7 +681,7 @@ "source": [ "## Basic Retrieval-Augmented Generation (RAG).\n", "\n", - "First, let's download the corpus data that we will use for RAG search. The next cell will seek to download 4 GBs, so it may take a few minutes. A future version of this notebook will come with a cache that allows you to skip downloads and the PyTorch installation." + "First, let's download the corpus data that we will use for RAG search. An older version of this tutorial used the full (650,000 document) corpus. To make this very fast and cheap to run, we've downsampled the corpus to just 28,000 documents." ] }, { @@ -649,8 +690,7 @@ "metadata": {}, "outputs": [], "source": [ - "download('https://huggingface.co/datasets/colbertv2/lotte_passages/resolve/main/technology/test_collection.jsonl')\n", - "download('https://huggingface.co/dspy/cache/resolve/main/index.pt')" + "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl\")" ] }, { @@ -659,31 +699,33 @@ "source": [ "## Set up your system's retriever.\n", "\n", - "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Hence, for our RAG system, we can plug any tools for the search step. Here, we'll just use OpenAI Embeddings and PyTorch for top-K search, but this is not a special choice, just a convenient one." + "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Here, we'll just use OpenAI Embeddings and do top-K search locally, just for convenience." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 28436 documents. 
Will encode them below.\n", + "Training a 32-byte FAISS index with 337 partitions, based on 28436 x 512-dim embeddings\n" + ] + } + ], "source": [ - "import torch\n", - "import functools\n", - "from litellm import embedding as Embed\n", - "\n", - "with open(\"test_collection.jsonl\") as f:\n", - " corpus = [ujson.loads(line) for line in f]\n", - "\n", - "index = torch.load('index.pt', weights_only=True)\n", - "max_characters = 4000 # >98th percentile of document lengths\n", - "\n", - "@functools.lru_cache(maxsize=None)\n", - "def search(query, k=5):\n", - " query_embedding = torch.tensor(Embed(input=query, model=\"text-embedding-3-small\").data[0]['embedding'])\n", - " topk_scores, topk_indices = torch.matmul(index, query_embedding).topk(k)\n", - " topK = [dict(score=score.item(), **corpus[idx]) for idx, score in zip(topk_indices, topk_scores)]\n", - " return [doc['text'][:max_characters] for doc in topK]" + "max_characters = 6000 # for truncating >99th percentile of documents\n", + "topk_docs_to_retrieve = 5 # number of documents to retrieve per search query\n", + "\n", + "with open(\"ragqa_arena_tech_corpus.jsonl\") as f:\n", + " corpus = [ujson.loads(line)['text'][:max_characters] for line in f]\n", + " print(f\"Loaded {len(corpus)} documents. Will encode them below.\")\n", + "\n", + "embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)\n", + "search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)" ] }, { @@ -707,12 +749,11 @@ "outputs": [], "source": [ "class RAG(dspy.Module):\n", - " def __init__(self, num_docs=5):\n", - " self.num_docs = num_docs\n", + " def __init__(self):\n", " self.respond = dspy.ChainOfThought('context, question -> response')\n", "\n", " def forward(self, question):\n", - " context = search(question, k=self.num_docs)\n", + " context = search(question).passages\n", " return self.respond(context=context, question=question)" ] }, @@ -733,8 +774,8 @@ "data": { "text/plain": [ "Prediction(\n", - " reasoning=\"High memory and low memory in Linux refer to the way the operating system organizes and manages memory for user-space applications and the kernel. Low memory is the portion of memory that is directly accessible by the kernel, while high memory is the part that is not directly mapped by the kernel's page tables. In a typical 32-bit architecture, low memory usually consists of the lower 3 GB of virtual memory, which is accessible to user-space applications, while high memory comprises the upper 1 GB, which is reserved for the kernel. The distinction is important for memory management, especially in systems with large amounts of RAM, as it affects how the kernel accesses and manages memory resources.\",\n", - " response=\"In Linux, high memory refers to the portion of memory that is not directly mapped by the kernel's page tables, meaning the kernel cannot access it directly without mapping it into its address space first. Low memory, on the other hand, is the segment of memory that the kernel can access directly. In a typical 32-bit system, low memory consists of the lower 3 GB of virtual memory, while high memory comprises the upper 1 GB. This organization helps manage memory more efficiently, especially in systems with large physical memory.\"\n", + " reasoning=\"High Memory and Low Memory in Linux refer to two segments of the kernel's memory space. Low Memory is the portion of memory that the kernel can access directly and is statically mapped at boot time. 
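Note that the `search` retriever built above can also be queried on its own, outside the `RAG` module. A quick sketch (the query string is arbitrary):

```python
# Returns a prediction whose `.passages` holds the top-k matching documents,
# with k=5 as set via `topk_docs_to_retrieve` above.
hits = search("what are high memory and low memory on linux?")
print(len(hits.passages))      # up to 5 passages
print(hits.passages[0][:120])  # preview of the top-ranked passage
```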
This area is typically used for kernel data structures and is always accessible to the kernel. High Memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel cannot access it directly without first mapping it into its address space. High Memory is used for user-space applications and temporary data buffers. The distinction allows for better memory management and security, as user-space applications cannot directly access kernel-space memory.\",\n", + " response=\"In Linux, High Memory refers to the segment of memory that is not permanently mapped in the kernel's address space, which means the kernel must map it temporarily to access it. This area is typically used for user-space applications and temporary data buffers. Low Memory, in contrast, is the portion of memory that the kernel can access directly and is statically mapped at boot time. It is used for kernel data structures and is always accessible to the kernel. This separation enhances security by preventing user-space applications from accessing kernel-space memory directly.\"\n", ")" ] }, @@ -761,7 +802,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-10T12:39:22.802994]\u001b[0m\n", + "\u001b[34m[2024-11-23T22:13:02.348625]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -797,10 +838,10 @@ "\n", "[[ ## context ## ]]\n", "[1] «As far as I remember, High Memory is used for application space and Low Memory for the kernel. Advantage is that (user-space) applications cant access kernel-space memory.»\n", - "[2] «For the people looking for an explanation in the context of Linux kernel memory space, beware that there are two conflicting definitions of the high/low memory split (unfortunately there is no standard, one has to interpret that in context): High memory defined as the totality of kernel space in VIRTUAL memory. This is a region that only the kernel can access and comprises all virtual addresses greater or equal than PAGE_OFFSET. Low memory refers therefore to the region of the remaining addresses, which correspond to the user-space memory accessible from each user process. For example: on 32-bit x86 with a default PAGE_OFFSET, this means that high memory is any address ADDR with ADDR ≥ 0xC0000000 = PAGE_OFFSET (i.e. higher 1 GB). This is the reason why in Linux 32-bit processes are typically limited to 3 GB. Note that PAGE_OFFSET cannot be configured directly, it depends on the configurable VMSPLIT_x options (source). To summarize: in 32-bit archs, virtual memory is by default split into lower 3 GB (user space) and higher 1 GB (kernel space). For 64 bit, PAGE_OFFSET is not configurable and depends on architectural details that are sometimes detected at runtime during kernel load. On x86_64, PAGE_OFFSET is 0xffff888000000000 for 4-level paging (typical) and 0xff11000000000000 for 5-level paging (source). For ARM64 this is usually 0x8000000000000000. Note though, if KASLR is enabled, this value is intentionally unpredictable. High memory defined as the portion of PHYSICAL memory that cannot be mapped contiguously with the rest of the kernel virtual memory. A portion of the kernel virtual address space can be mapped as a single contiguous chunk into the so-called physical low memory. To fully understand what this means, a deeper knowledge of the Linux virtual memory space is required. I would recommend going through these slides. 
From the slides: This kind of high/low memory split is only applicable to 32-bit architectures where the installed physical RAM size is relatively high (more than ~1 GB). Otherwise, i.e. when the physical address space is small (<1 GB) or when the virtual memory space is large (64 bits), the whole physical space can be accessed from the kernel virtual memory space. In that case, all physical memory is considered low memory. It is preferable that high memory does not exist at all because the whole physical space can be accessed directly from the kernel, which makes memory management a lot simpler and efficient. This is especially important when dealing with DMAs (which typically require physically contiguous memory). See also the answer by @gilles»\n", - "[3] «Low and High do not refer to whether there is a lot of usage or not. They represent the way it is organized by the system. According to Wikipedia: High Memory is the part of physical memory in a computer which is not directly mapped by the page tables of its operating system kernel. There is no duration for the free command which simply computes a snapshot of the information available. Most people, including programmers, do not need to understand it more clearly as it is managed in a much simpler form through system calls and compiler/interpreter operations.»\n", - "[4] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory»\n", - "[5] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x00000000-0xbfffffff: user process (3GB) 0xc0000000-0xffffffff: kernel space (1GB) (CPU-specific vector and whatsoever are ignored here). Linux splits the 1GB kernel space into 2 pieces, LOWMEM and HIGHMEM. The split varies from installation to installation. If an installation chooses, say, 512MB-512MB for LOW and HIGH mems, the 512MB LOWMEM (0xc0000000-0xdfffffff) is statically mapped at the kernel boot time; usually the first so many bytes of the physical memory is used for this so that virtual and physical addresses in this range have a constant offset of, say, 0xc0000000. On the other hand, the latter 512MB (HIGHMEM) has no static mapping (although you could leave pages semi-permanently mapped there, but you must do so explicitly in your driver code). Instead, pages are temporarily mapped and unmapped here so that virtual and physical addresses in this range have no consistent mapping. Typical uses of HIGHMEM include single-time data buffers.»\n", + "[2] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x00000000-0xbfffffff: user process (3GB) 0xc0000000-0xffffffff: kernel space (1GB) (CPU-specific vector and whatsoever are ignored here). 
Linux splits the 1GB kernel space into 2 pieces, LOWMEM and HIGHMEM. The split varies from installation to installation. If an installation chooses, say, 512MB-512MB for LOW and HIGH mems, the 512MB LOWMEM (0xc0000000-0xdfffffff) is statically mapped at the kernel boot time; usually the first so many bytes of the physical memory is used for this so that virtual and physical addresses in this range have a constant offset of, say, 0xc0000000. On the other hand, the latter 512MB (HIGHMEM) has no static mapping (although you could leave pages semi-permanently mapped there, but you must do so explicitly in your driver code). Instead, pages are temporarily mapped and unmapped here so that virtual and physical addresses in this range have no consistent mapping. Typical uses of HIGHMEM include single-time data buffers.»\n", + "[3] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory»\n", + "[4] «The first reference to turn to is Linux Device Drivers (available both online and in book form), particularly chapter 15 which has a section on the topic. In an ideal world, every system component would be able to map all the memory it ever needs to access. And this is the case for processes on Linux and most operating systems: a 32-bit process can only access a little less than 2^32 bytes of virtual memory (in fact about 3GB on a typical Linux 32-bit architecture). It gets difficult for the kernel, which needs to be able to map the full memory of the process whose system call its executing, plus the whole physical memory, plus any other memory-mapped hardware device. So when a 32-bit kernel needs to map more than 4GB of memory, it must be compiled with high memory support. High memory is memory which is not permanently mapped in the kernels address space. (Low memory is the opposite: it is always mapped, so you can access it in the kernel simply by dereferencing a pointer.) When you access high memory from kernel code, you need to call kmap first, to obtain a pointer from a page data structure (struct page). Calling kmap works whether the page is in high or low memory. There is also kmap_atomic which has added constraints but is more efficient on multiprocessor machines because it uses finer-grained locking. The pointer obtained through kmap is a resource: it uses up address space. Once youve finished with it, you must call kunmap (or kunmap_atomic) to free that resource; then the pointer is no longer valid, and the contents of the page cant be accessed until you call kmap again.»\n", + "[5] «/proc/meminfo will tell you how free works, but /proc/kcore can tell you what the kernel uses. From the same page: /proc/kcore This file represents the physical memory of the system and is stored in the ELF core file format. With this pseudo-file, and an unstripped kernel (/usr/src/linux/vmlinux) binary, GDB can be used to examine the current state of any kernel data structures. 
The total length of the file is the size of physical memory (RAM) plus 4KB. /proc/meminfo This file reports statistics about memory usage on the system. It is used by free(1) to report the amount of free and used memory (both physical and swap) on the system as well as the shared memory and buffers used by the kernel. Each line of the file consists of a parameter name, followed by a colon, the value of the parameter, and an option unit of measurement (e.g., kB). The list below describes the parameter names and the format specifier required to read the field value. Except as noted below, all of the fields have been present since at least Linux 2.6.0. Some fileds are displayed only if the kernel was configured with various options; those dependencies are noted in the list. MemTotal %lu Total usable RAM (i.e., physical RAM minus a few reserved bits and the kernel binary code). MemFree %lu The sum of LowFree+HighFree. Buffers %lu Relatively temporary storage for raw disk blocks that shouldnt get tremendously large (20MB or so). Cached %lu In-memory cache for files read from the disk (the page cache). Doesnt include SwapCached. SwapCached %lu Memory that once was swapped out, is swapped back in but still also is in the swap file. (If memory pressure is high, these pages dont need to be swapped out again because they are already in the swap file. This saves I/O.) Active %lu Memory that has been used more recently and usually not reclaimed unless absolutely necessary. Inactive %lu Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. Active(anon) %lu (since Linux 2.6.28) [To be documented.] Inactive(anon) %lu (since Linux 2.6.28) [To be documented.] Active(file) %lu (since Linux 2.6.28) [To be documented.] Inactive(file) %lu (since Linux 2.6.28) [To be documented.] Unevictable %lu (since Linux 2.6.28) (From Linux 2.6.28 to 2.6.30, CONFIG_UNEVICTABLE_LRU was required.) [To be documented.] Mlocked %lu (since Linux 2.6.28) (From Linux 2.6.28 to 2.6.30, CONFIG_UNEVICTABLE_LRU was required.) [To be documented.] HighTotal %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Total amount of highmem. Highmem is all memory above ~860MB of physical memory. Highmem areas are for use by user-space programs, or for the page cache. The kernel must use tricks to access this memory, making it slower to access than lowmem. HighFree %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Amount of free highmem. LowTotal %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Total amount of lowmem. Lowmem is memory which can be used for everything that highmem can be used for, but it is also available for the kernels use for its own data structures. Among many other things, it is where everything from Slab is allocated. Bad things happen when youre out of lowmem. LowFree %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Amount of free lowmem. MmapCopy %lu (since Linux 2.6.29) (CONFIG_MMU is required.) [To be documented.] SwapTotal %lu Total amount of swap space available. SwapFree %lu Amount of swap space that is currently unused. Dirty %lu Memory which is waiting to get written back to the disk. Writeback %lu Memory which is actively being written back to the disk. AnonPages %lu (since Linux 2.6.18) Non-file backed pages mapped into user-space page tables. Mapped %lu Files which have been mmaped, such as libraries. Shmem %lu (since Linux 2.6.32) [To be documented.] Slab %lu In-kernel data structures cache. 
SReclaimable %lu (since Linux 2.6.19) Part of Slab, that might be reclaimed, such as caches. SUnreclaim %lu (since Linux 2.6.19) Part of Slab, that cannot be reclaimed on memory pressure. KernelStack %lu (since Linux 2.6.32) Amount of memory allocated to kernel stacks. PageTables %lu (since Linux 2.6.18) Amount of memory dedicated to the lowest level of page tables. Quicklists %lu (since Linux 2.6.27) (CONFIG_QUICKLIST is required.) [To be documented.] NFS_Unstable %lu (since Linux 2.6.18) NFS pages sent to the server, but not yet committed to stable storage. Bounce %lu (since Linux 2.6.18) Memory used for block device bounce buffers. WritebackTmp %lu (since Linux 2.6.26) Memory used by FUSE for temporary writeback buffers. CommitLimit %lu (since Linux 2.6.10) Based on the overcommit ratio (vm.overcommit_ratio), this is the total amount of memory currently available to be allocated on the system. This limit is adhered to only if strict overcommit accounting is enabled (mode 2 in /proc/sys/vm/overcommit_ratio). The CommitLimit is calculated using the following formula: CommitLimit = ([total RAM pages] - [total huge TLB pages]) * overcommit_ratio / 100 + [total swap pages] For example, on a system with 1GB of physical RAM and 7GB of swap with a overcommit_ratio of 30, this formula yields a CommitLimit of 7.3GB. For more details, see the memory overcommit documentation in the kernel source file Documentation/vm/overcommit-accounting. Committed_AS %lu The amount of memory presently allocated on the system. The committed memory is a sum of all of the memory which has been allocated by processes, even if it has not been used by them as of yet. A process which allocates 1GB of memory (using malloc(3) or similar), but touches only 300MB of that memory will show up as using only 300MB of memory even if it has the address space allocated for the entire 1GB. This 1GB is memory which has been committed to by the VM and can be used at any time by the allocating application. With strict overcommit enabled on the system (mode 2 /proc/sys/vm/overcommit_memory), allocations w»\n", "\n", "[[ ## question ## ]]\n", "what are high memory and low memory on linux?\n", @@ -811,10 +852,10 @@ "\u001b[31mResponse:\u001b[0m\n", "\n", "\u001b[32m[[ ## reasoning ## ]]\n", - "High memory and low memory in Linux refer to the way the operating system organizes and manages memory for user-space applications and the kernel. Low memory is the portion of memory that is directly accessible by the kernel, while high memory is the part that is not directly mapped by the kernel's page tables. In a typical 32-bit architecture, low memory usually consists of the lower 3 GB of virtual memory, which is accessible to user-space applications, while high memory comprises the upper 1 GB, which is reserved for the kernel. The distinction is important for memory management, especially in systems with large amounts of RAM, as it affects how the kernel accesses and manages memory resources.\n", + "High Memory and Low Memory in Linux refer to two segments of the kernel's memory space. Low Memory is the portion of memory that the kernel can access directly and is statically mapped at boot time. This area is typically used for kernel data structures and is always accessible to the kernel. High Memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel cannot access it directly without first mapping it into its address space. 
High Memory is used for user-space applications and temporary data buffers. The distinction allows for better memory management and security, as user-space applications cannot directly access kernel-space memory.\n", "\n", "[[ ## response ## ]]\n", - "In Linux, high memory refers to the portion of memory that is not directly mapped by the kernel's page tables, meaning the kernel cannot access it directly without mapping it into its address space first. Low memory, on the other hand, is the segment of memory that the kernel can access directly. In a typical 32-bit system, low memory consists of the lower 3 GB of virtual memory, while high memory comprises the upper 1 GB. This organization helps manage memory more efficiently, especially in systems with large physical memory.\n", + "In Linux, High Memory refers to the segment of memory that is not permanently mapped in the kernel's address space, which means the kernel must map it temporarily to access it. This area is typically used for user-space applications and temporary data buffers. Low Memory, in contrast, is the portion of memory that the kernel can access directly and is statically mapped at boot time. It is used for kernel data structures and is always accessible to the kernel. This separation enhances security by preventing user-space applications from accessing kernel-space memory directly.\n", "\n", "[[ ## completed ## ]]\u001b[0m\n", "\n", @@ -833,7 +874,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Earlier with a CoT module, we got nearly 37% in terms of semantic F1 on our `devset`. Would this `RAG` module score better?" + "Earlier with a CoT module, we got around 40% in terms of semantic F1 on our `devset`. Would this `RAG` module score better?" ] }, { @@ -841,12 +882,25 @@ "execution_count": 17, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 166.39 / 300 (55.5%): 100%|██████████| 300/300 [00:14<00:00, 20.29it/s]" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Average Metric: 74.61311832900337 / 150 (49.7): 100%|██████████| 150/150 [00:05<00:00, 27.92it/s] \n", - "2024/11/10 12:39:28 INFO dspy.evaluate.evaluate: Average Metric: 74.61311832900337 / 150 (49.7%)\n" + "2024/11/23 22:13:17 INFO dspy.evaluate.evaluate: Average Metric: 166.39410892098812 / 300 (55.5%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" ] }, { @@ -872,6 +926,7 @@ " \n", " question\n", " example_response\n", + " gold_doc_ids\n", " reasoning\n", " pred_response\n", " SemanticF1\n", @@ -880,44 +935,50 @@ " \n", " \n", " 0\n", - " why is mercurial considered to be easier than git?\n", - " Mercurial's syntax is considered more familiar, especially for tho...\n", - " Mercurial is considered easier than Git for several reasons. First...\n", - " Mercurial is considered easier than Git because it has a more fami...\n", - " ✔️ [0.797]\n", + " when to use c over c++, and c++ over c?\n", + " If you are equally familiar with both C++ and C, it's advisable to...\n", + " [733]\n", + " C should be used over C++ primarily in scenarios where simplicity ...\n", + " Use C over C++ when working on embedded systems, requiring low-lev...\n", + " ✔️ [0.500]\n", " \n", " \n", " 1\n", - " open finder window from current terminal location?\n", - " If you type 'open .' 
in Terminal, it will open the current directo...\n", - " To open a Finder window from the current terminal location, you ca...\n", - " You can open a Finder window from your current terminal location b...\n", - " ✔️ [0.667]\n", + " should images be stored in a git repository?\n", + " One viewpoint expresses that there is no significant downside, esp...\n", + " [6253, 6254, 6275, 6278, 8215]\n", + " Storing images in a Git repository is generally not recommended du...\n", + " While it is technically possible to store images in a Git reposito...\n", + " ✔️ [0.444]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " question \\\n", - "0 why is mercurial considered to be easier than git? \n", - "1 open finder window from current terminal location? \n", + " question \\\n", + "0 when to use c over c++, and c++ over c? \n", + "1 should images be stored in a git repository? \n", "\n", " example_response \\\n", - "0 Mercurial's syntax is considered more familiar, especially for tho... \n", - "1 If you type 'open .' in Terminal, it will open the current directo... \n", + "0 If you are equally familiar with both C++ and C, it's advisable to... \n", + "1 One viewpoint expresses that there is no significant downside, esp... \n", + "\n", + " gold_doc_ids \\\n", + "0 [733] \n", + "1 [6253, 6254, 6275, 6278, 8215] \n", "\n", " reasoning \\\n", - "0 Mercurial is considered easier than Git for several reasons. First... \n", - "1 To open a Finder window from the current terminal location, you ca... \n", + "0 C should be used over C++ primarily in scenarios where simplicity ... \n", + "1 Storing images in a Git repository is generally not recommended du... \n", "\n", " pred_response \\\n", - "0 Mercurial is considered easier than Git because it has a more fami... \n", - "1 You can open a Finder window from your current terminal location b... \n", + "0 Use C over C++ when working on embedded systems, requiring low-lev... \n", + "1 While it is technically possible to store images in a Git reposito... \n", "\n", " SemanticF1 \n", - "0 ✔️ [0.797] \n", - "1 ✔️ [0.667] " + "0 ✔️ [0.500] \n", + "1 ✔️ [0.444] " ] }, "metadata": {}, @@ -933,7 +994,7 @@ " font-weight: bold;\n", " color: #555;\n", " margin: 10px 0;'>\n", - " ... 148 more rows not displayed ...\n", + " ... 298 more rows not displayed ...\n", " \n", " " ], @@ -947,7 +1008,7 @@ { "data": { "text/plain": [ - "49.74" + "55.46" ] }, "execution_count": 17, @@ -965,7 +1026,7 @@ "source": [ "## Using a DSPy Optimizer to improve your RAG prompt.\n", "\n", - "Off the shelf, our `RAG` module scores nearly 50%. What are our options to make it stronger? One of the various choices DSPy offers is optimizing the prompts in our pipeline.\n", + "Off the shelf, our `RAG` module scores 55%. What are our options to make it stronger? One of the various choices DSPy offers is optimizing the prompts in our pipeline.\n", "\n", "If there are many sub-modules in your program, all of them will be optimized together. 
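To see exactly which predictors (i.e., tunable prompts) your program exposes to an optimizer, you can enumerate them with the module's built-in traversal; a quick sketch, with illustrative output:

```python
# Every (name, predictor) pair below is a prompt that an optimizer may rewrite.
for name, predictor in RAG().named_predictors():
    print(name, "->", predictor.signature)
```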
In this case, there's only one: `self.respond = dspy.ChainOfThought('context, question -> response')`\n", "\n", @@ -974,260 +1035,1259 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tp = dspy.MIPROv2(metric=metric, auto=\"medium\", num_threads=24) # use fewer threads if your rate limit is small\n", - "\n", - "optimized_rag = tp.compile(RAG(), trainset=trainset, valset=valset,\n", - " max_bootstrapped_demos=2, max_labeled_demos=2,\n", - " requires_permission_to_run=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The prompt optimization process here is pretty systematic, you can learn about it for example in this paper. Importantly, it's not a magic button. It's very possible that it can overfit your training set for instance and not generalize well to a held-out set, making it essential that we iteratively validate our programs.\n", - "\n", - "Let's check on an example here, asking the same question to the baseline `rag = RAG()` program, which was not optimized, and to the `optimized_rag = MIPROv2(..)(..)` program, after prompt optimization." - ] - }, - { - "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "You are correct; cmd+Tab does not work on hidden or minimized windows in macOS. It is designed to switch between applications and will only show non-minimized windows of the active application. To access minimized windows, you need to click on them directly or use other shortcuts.\n" + "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n", + "num_trials: 25\n", + "minibatch: True\n", + "num_candidates: 19\n", + "valset size: 160\n", + "\n", + "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n", + "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n", + "\n", + "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n" ] - } - ], - "source": [ - "baseline = rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n", - "print(baseline.response)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ + }, { "name": "stdout", "output_type": "stream", "text": [ - "In macOS, the Command+Tab shortcut is specifically designed to switch between applications, not individual windows. This means that if an application is minimized or hidden, it will not be activated using Command+Tab. Here are some alternative methods to manage minimized or hidden windows:\n", - "\n", - "1. **Click on the Minimized Window:**\n", - " - You can directly click on the minimized window in the Dock to restore it.\n", - "\n", - "2. **Use Command+M:**\n", - " - If you want to minimize the current window, you can use Command+M. To restore it, you will need to click on it in the Dock.\n", - "\n", - "3. **Use Mission Control:**\n", - " - You can activate Mission Control (F3 or Control+Up Arrow) to see all open windows and select the one you want to bring to the front.\n", - "\n", - "4. 
**Third-Party Applications:**\n",
      "   - Consider using third-party applications like HyperSwitch or Witch, which can provide enhanced window management features, including switching between windows of the same application.\n",
      "\n",
      "5. **Keyboard Shortcuts for Specific Applications:**\n",
      "   - Some applications may have their own shortcuts for managing windows. Check the preferences or documentation for the specific application you are using.\n",
      "\n",
      "By using these methods, you can effectively manage and restore minimized or hidden windows in macOS.\n"
     ]
    }
   ],
   "source": [
    "pred = optimized_rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n",
    "print(pred.response)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can use `dspy.inspect_history(n=2)` to view the RAG prompt [before optimization](https://gist.github.com/okhat/5d04648f2226e72e66e26a8cb1456ee4) and [after optimization](https://gist.github.com/okhat/79405b8889b4b07da577ee19f1a3479a).\n",
    "\n",
    "Concretely, in one run of this notebook, the optimized prompt:\n",
    "\n",
    "1. Constructs the following instruction:\n",
    "```text\n",
    "Using the provided `context` and `question`, analyze the information step by step to generate a comprehensive and informative `response`. Ensure that the response clearly explains the concepts involved, highlights key distinctions, and addresses any complexities noted in the context.\n",
    "```\n",
    "\n",
    "2. And includes two fully worked out RAG examples with synthetic reasoning and answers, e.g. `how to transfer whatsapp voice message to computer?`.\n",
    "\n",
    "Let's now evaluate on the overall devset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Average Metric: 89.78303512426604 / 150 (59.9): 100%|██████████| 150/150 [00:00<00:00, 424.18it/s]\n",
      "2024/11/10 12:39:36 INFO dspy.evaluate.evaluate: Average Metric: 89.78303512426604 / 150 (59.9%)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
questionexample_responsereasoningpred_responseSemanticF1
0why is mercurial considered to be easier than git?Mercurial's syntax is considered more familiar, especially for tho...Mercurial is often considered easier than Git for several reasons,...Mercurial is considered easier than Git for several key reasons: 1...✔️ [0.874]
1open finder window from current terminal location?If you type 'open .' in Terminal, it will open the current directo...To open a Finder window from the current terminal location in macO...To open a Finder window from your current terminal location in mac...✔️ [0.600]
\n", - "
" - ], - "text/plain": [ - " question \\\n", - "0 why is mercurial considered to be easier than git? \n", - "1 open finder window from current terminal location? \n", - "\n", - " example_response \\\n", - "0 Mercurial's syntax is considered more familiar, especially for tho... \n", - "1 If you type 'open .' in Terminal, it will open the current directo... \n", - "\n", - " reasoning \\\n", - "0 Mercurial is often considered easier than Git for several reasons,... \n", - "1 To open a Finder window from the current terminal location in macO... \n", - "\n", - " pred_response \\\n", - "0 Mercurial is considered easier than Git for several key reasons: 1... \n", - "1 To open a Finder window from your current terminal location in mac... \n", - "\n", - " SemanticF1 \n", - "0 ✔️ [0.874] \n", - "1 ✔️ [0.600] " - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n", + "Bootstrapping set 4/19\n" + ] }, { - "data": { - "text/html": [ - "\n", - "
\n", - " ... 148 more rows not displayed ...\n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + " 15%|█▌ | 6/40 [00:00<00:03, 8.98it/s]\n" + ] }, { - "data": { - "text/plain": [ - "59.86" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluate(optimized_rag)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Keeping an eye on cost.\n", - "\n", - "DSPy allows you to track the cost of your programs, which can be used to monitor the cost of your calls. Here, we'll show you how to track the cost of your programs with DSPy." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "cost = sum([x['cost'] for x in lm.history if x['cost'] is not None]) # in USD, as calculated by LiteLLM for certain providers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Saving and loading.\n", - "\n", - "The optimized program has a pretty simple structure on the inside. Feel free to explore it.\n", - "\n", - "Here, we'll save `optimized_rag` so we can load it again later without having to optimize from scratch." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n", + "Bootstrapping set 5/19\n" + ] + }, { - "data": { - "text/plain": [ - "Prediction(\n", - " reasoning='The behavior of the Command+Tab shortcut in macOS is designed to switch between applications rather than individual windows. This means that if an application is minimized or hidden, it will not be brought to the forefront using Command+Tab. Instead, the shortcut will only cycle through applications that are currently open and not minimized. To manage minimized windows, users may need to use different shortcuts or methods to restore them.',\n", - " response='In macOS, the Command+Tab shortcut is specifically designed to switch between applications, not individual windows. This means that if an application is minimized or hidden, it will not be activated using Command+Tab. Here are some alternative methods to manage minimized or hidden windows:\\n\\n1. **Click on the Minimized Window:**\\n - You can directly click on the minimized window in the Dock to restore it.\\n\\n2. **Use Command+M:**\\n - If you want to minimize the current window, you can use Command+M. To restore it, you will need to click on it in the Dock.\\n\\n3. **Use Mission Control:**\\n - You can activate Mission Control (F3 or Control+Up Arrow) to see all open windows and select the one you want to bring to the front.\\n\\n4. **Third-Party Applications:**\\n - Consider using third-party applications like HyperSwitch or Witch, which can provide enhanced window management features, including switching between windows of the same application.\\n\\n5. **Keyboard Shortcuts for Specific Applications:**\\n - Some applications may have their own shortcuts for managing windows. 
Check the preferences or documentation for the specific application you are using.\\n\\nBy using these methods, you can effectively manage and restore minimized or hidden windows in macOS.'\n", + "name": "stderr", + "output_type": "stream", + "text": [ + " 2%|▎ | 1/40 [00:00<00:04, 9.16it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 6/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|▊ | 3/40 [00:00<00:04, 9.16it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 7/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 2/40 [00:00<00:03, 9.53it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 8/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 12%|█▎ | 5/40 [00:00<00:03, 8.94it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n", + "Bootstrapping set 9/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 2/40 [00:00<00:04, 9.15it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 10/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|▊ | 3/40 [00:00<00:04, 9.11it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 11/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|▊ | 3/40 [00:00<00:04, 8.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 12/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 2/40 [00:00<00:04, 8.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 13/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 2/40 [00:00<00:04, 8.91it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 14/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 2%|▎ | 1/40 [00:00<00:04, 9.13it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 15/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 2%|▎ | 1/40 [00:00<00:04, 9.16it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 
rounds, amounting to 1 attempts.\n", + "Bootstrapping set 16/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 5%|▌ | 2/40 [00:00<00:04, 9.24it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 17/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 25%|██▌ | 10/40 [00:01<00:03, 8.74it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.\n", + "Bootstrapping set 18/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|▊ | 3/40 [00:00<00:04, 8.40it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 19/19\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 8%|▊ | 3/40 [00:00<00:04, 8.64it/s]\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "Proposing instructions...\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Using the provided `context` about Mac OS X commands and the `question` related to troubleshooting or file management, generate a detailed response. Begin by outlining the reasoning process step-by-step, then provide a comprehensive answer that not only addresses the question but also includes practical applications and comparisons where relevant.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a technical support assistant. Given the fields `context` and `question`, analyze the provided context to extract relevant information and produce a detailed and coherent `response` that answers the question based on the information available.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 3: You are a technical support assistant. Given the fields `context` and `question`, provide a clear and structured `response` that outlines the methods for locking the screen in the XFCE desktop environment, using the information available in the `context`. Make sure to highlight the most effective methods and include any relevant details to enhance user understanding.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a technical expert in data integrity and security. Given the fields `context` and `question`, produce a well-reasoned `response` that clearly explains the differences between a hash function and a checksum, incorporating relevant details from the context provided.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 5: You are a shell scripting expert. 
Given the context that provides various examples and explanations related to temporary files in shell scripting, along with a specific question about how to create a temporary file, produce a detailed response that includes a code snippet demonstrating the use of the `mktemp` command and how to properly manage the temporary file within a shell script. Make sure to emphasize the importance of cleanup after the file is used.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 6: Using the provided `context` and `question`, analyze the information and generate a detailed yet concise `response` that effectively summarizes the main arguments and conclusions regarding the practice of commenting every line of code. Ensure that the response reflects the nuances of the context and provides a clear stance on the issue.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 7: Using the provided `context` which contains detailed information about transferring photos from an iPhone to a computer, and the `question` regarding how to transfer edited photos specifically, generate a comprehensive `response` that outlines the necessary steps or methods to effectively transfer the edited photos. Ensure to highlight any limitations of direct import methods and suggest alternative approaches such as AirDrop, emailing, or using iCloud Photo Library for a successful transfer of edited images.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 8: Imagine you are a technical support specialist assisting a user who is facing issues with their Mac OS X commands. They have a critical deadline and need reliable solutions to their questions. Your task is to provide detailed, accurate responses based on the given context and questions. Given the fields `context`, `question`, produce the fields `response` to help the user understand complex technical concepts and troubleshoot effectively.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 9: Imagine you are a technical support agent assisting a user who has lost important data on their LG G2 device and needs to recover their TWRP backups urgently. Given the fields `context`, which contains vital information about where TWRP backups can be stored, and `question`, which asks specifically about the locations of these backups, produce a detailed `response` that guides the user on how to locate their backups effectively.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 10: Using the provided `context` that contains information about deleting empty directories, and the `question` regarding how to recursively delete empty directories in the user's home directory, generate a detailed `response` that includes the appropriate command and an explanation of its components. Ensure to highlight the use of options like `-type`, `-empty`, and `-exec` in the command, and consider providing variations for additional clarity, such as including the verbose option for user feedback.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 11: Based on the provided context and question, generate a detailed response that outlines the steps necessary to export a private key from a Java Keytool keystore. Include commands for converting the keystore from JKS to PKCS#12 format and for using OpenSSL to extract the private key. 
Emphasize the security implications of handling private keys and provide clear instructions on replacing placeholders with actual values.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 12: You are a technical support assistant with expertise in Mac OS X commands. Given the fields `context` and `question`, provide a detailed and informative `response` that clarifies the distinctions or relationships between the concepts discussed in the context.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a privacy advocate explaining why someone would trust DuckDuckGo or similar providers with a privacy policy. Given the fields `context` and `question`, provide a detailed response that outlines the reasons for this trust, incorporating aspects such as the clarity of the privacy policy, technical implementations, user control, legal accountability, and community feedback.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 14: Using the provided `context` and `question`, generate a detailed and coherent `response` that explains the reasons someone might trust DuckDuckGo or similar privacy-focused providers based on their privacy policies and practices.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Using the provided `context` that contains detailed information about Mac OS X commands and locations related to user account pictures, along with the `question` regarding where Mac stores these account pictures, generate a structured and informative `response`. Ensure that your response accurately summarizes the key locations and relevant details mentioned in the context, and clearly addresses the question posed.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 16: Using the provided `context` and `question`, generate a comprehensive `response` that summarizes the key points, compares the advantages and disadvantages of the concepts discussed, and offers practical insights based on the information available. Ensure that the response is clear, organized, and addresses the user's inquiry effectively.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Given the context that describes various methods for creating temporary files in shell scripts, along with a specific question about how to create a temporary file, generate a detailed response that includes examples of using the `mktemp` command, ensuring to explain the importance of cleanup after file usage.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 18: You are a technical support assistant. Given the fields `context`, `question`, produce the fields `response`. 
Ensure that your response is detailed and provides step-by-step guidance based on the context provided.\n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Average Metric: 89.01 / 160 (55.6%): 100%|██████████| 160/160 [00:04<00:00, 37.54it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:27 INFO dspy.evaluate.evaluate: Average Metric: 89.0075423349221 / 160 (55.6%)\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 55.63\n", + "\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n", + "\n", + "/opt/anaconda3/envs/jun2024_py310/lib/python3.10/site-packages/optuna/samplers/_tpe/sampler.py:319: ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n", + " warnings.warn(\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.72 / 25 (58.9%): 100%|██████████| 25/25 [00:00<00:00, 96.95it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:27 INFO dspy.evaluate.evaluate: Average Metric: 14.719867707788584 / 25 (58.9%)\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.88 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88]\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 13.88 / 25 (55.5%): 100%|██████████| 25/25 [00:00<00:00, 99.17it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 13.87639947083419 / 25 (55.5%)\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.51 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + 
"2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.56 / 25 (62.3%): 100%|██████████| 25/25 [00:00<00:00, 99.46it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 15.563671185234691 / 25 (62.3%)\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.25 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.54 / 25 (58.2%): 100%|██████████| 25/25 [00:00<00:00, 97.02it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 14.542840231125426 / 25 (58.2%)\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.17 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.75 / 25 (63.0%): 100%|██████████| 25/25 [00:00<00:00, 104.42it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 15.746005444613344 / 25 (63.0%)\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.98 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.68 / 25 (58.7%): 100%|██████████| 25/25 [00:00<00:00, 107.78it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:29 INFO 
dspy.evaluate.evaluate: Average Metric: 14.683617165143385 / 25 (58.7%)\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.73 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.93 / 25 (63.7%): 100%|██████████| 25/25 [00:00<00:00, 106.66it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.934088959267559 / 25 (63.7%)\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.74 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.52 / 25 (62.1%): 100%|██████████| 25/25 [00:00<00:00, 100.22it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.52144781700213 / 25 (62.1%)\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.09 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.54 / 25 (62.2%): 100%|██████████| 25/25 [00:00<00:00, 104.70it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.541098318140321 / 25 (62.2%)\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.16 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].\n", + "2024/11/23 22:13:29 INFO 
dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 12.33 / 25 (49.3%): 100%|██████████| 25/25 [00:00<00:00, 72.31it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:30 INFO dspy.evaluate.evaluate: Average Metric: 12.332086462618921 / 25 (49.3%)\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 49.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33]\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n", + "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 63.74) from minibatch trials...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 98.06 / 160 (61.3%): 100%|██████████| 160/160 [00:01<00:00, 139.10it/s]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 98.06249092576995 / 160 (61.3%)\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 61.29\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 15.61 / 25 (62.5%): 100%|██████████| 25/25 [00:00<00:00, 105.23it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 15.612633878081091 / 25 (62.5%)\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.45 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 12'].\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45]\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + 
"2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.03 / 25 (60.1%): 100%|██████████| 25/25 [00:00<00:00, 100.46it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 15.03300812819276 / 25 (60.1%)\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.13 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 18'].\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13]\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.43 / 25 (57.7%): 100%|██████████| 25/25 [00:00<00:00, 112.91it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.430989267101385 / 25 (57.7%)\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.72 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.68 / 25 (58.7%): 100%|██████████| 25/25 [00:00<00:00, 95.62it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.681540371022235 / 25 (58.7%)\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.73 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 5'].\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: 
=============================\n", + "\n", + "\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.54 / 25 (58.2%): 100%|██████████| 25/25 [00:00<00:00, 100.56it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.53865209268966 / 25 (58.2%)\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.15 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 14'].\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.83 / 25 (59.3%): 100%|██████████| 25/25 [00:00<00:00, 108.11it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.832026371762414 / 25 (59.3%)\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 17.22 / 25 (68.9%): 100%|██████████| 25/25 [00:00<00:00, 105.12it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 17.216978671345192 / 25 (68.9%)\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.87 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87]\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:33 INFO 
dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.12 / 25 (60.5%): 100%|██████████| 25/25 [00:00<00:00, 97.80it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 15.123535939830598 / 25 (60.5%)\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.49 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49]\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.26 / 25 (61.0%): 100%|██████████| 25/25 [00:00<00:00, 99.12it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 15.256960301954985 / 25 (61.0%)\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 61.03 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 14'].\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03]\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.61 / 25 (58.4%): 100%|██████████| 25/25 [00:00<00:00, 102.38it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:34 INFO dspy.evaluate.evaluate: Average Metric: 14.607005004992326 / 25 (58.4%)\n", + "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.43 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 9'].\n", + "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43]\n", + "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", + "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:34 INFO 
dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n", + "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.87) from minibatch trials...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 97.23 / 160 (60.8%): 100%|██████████| 160/160 [00:11<00:00, 14.01it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 97.22622109571304 / 160 (60.8%)\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 16.54 / 25 (66.2%): 100%|██████████| 25/25 [00:00<00:00, 112.10it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 16.54482901646923 / 25 (66.2%)\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.18 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18]\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.84 / 25 (59.4%): 100%|██████████| 25/25 [00:00<00:00, 113.00it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 14.837814582612035 / 25 (59.4%)\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.35 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35]\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.71 / 25 
(58.8%): 100%|██████████| 25/25 [00:00<00:00, 105.76it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 14.711485027993763 / 25 (58.8%)\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.85 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 15'].\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85]\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 15.14 / 25 (60.6%): 100%|██████████| 25/25 [00:00<00:00, 95.66it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 15.144601379869599 / 25 (60.6%)\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.58 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 8'].\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85, 60.58]\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 14.26 / 25 (57.0%): 100%|██████████| 25/25 [00:00<00:00, 103.69it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 14.257718170019547 / 25 (57.0%)\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.03 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 0'].\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85, 60.58, 57.03]\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n", + "2024/11/23 22:13:46 
INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 62.45) from minibatch trials...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 95.14 / 160 (59.5%): 100%|██████████| 160/160 [00:01<00:00, 143.17it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:13:47 INFO dspy.evaluate.evaluate: Average Metric: 95.13659459156446 / 160 (59.5%)\n", + "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77, 59.46]\n", + "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", + "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", + "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 61.29!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "tp = dspy.MIPROv2(metric=metric, auto=\"medium\", num_threads=24)  # use fewer threads if your rate limit is small\n", + "\n", + "optimized_rag = tp.compile(RAG(), trainset=trainset,\n", + "                           max_bootstrapped_demos=2, max_labeled_demos=2,\n", + "                           requires_permission_to_run=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The prompt optimization process here is pretty systematic; you can learn about it, for example, in this paper. Importantly, it's not a magic button: it may well overfit your training set, for instance, and not generalize to a held-out set, which makes it essential to validate our programs iteratively.\n", + "\n", + "Let's check an example here, asking the same question to the baseline `rag = RAG()` program, which was not optimized, and to the `optimized_rag = MIPROv2(..)(..)` program, after prompt optimization." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "You are correct that cmd+tab does not work on hidden or minimized windows. To switch back to a minimized app, you must first switch to another application and let it take focus before returning to the minimized one.\n" + ] + } + ], + "source": [ + "baseline = rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n", + "print(baseline.response)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. 
Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.\n" + ] + } + ], + "source": [ + "pred = optimized_rag(question=\"cmd+tab does not work on hidden or minimized windows\")\n", + "print(pred.response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use `dspy.inspect_history(n=2)` to view the RAG prompt [before optimization](https://gist.github.com/okhat/5d04648f2226e72e66e26a8cb1456ee4) and [after optimization](https://gist.github.com/okhat/79405b8889b4b07da577ee19f1a3479a).\n", + "\n", + "Concretely, in one of the runs of this notebook, the optimized prompt does the following (note that it may be different on a later rerun).\n", + "\n", + "1. Constructs the following instruction,\n", + "```text\n", + "Using the provided `context` and `question`, analyze the information step by step to generate a comprehensive and informative `response`. Ensure that the response clearly explains the concepts involved, highlights key distinctions, and addresses any complexities noted in the context.\n", + "```\n", + "\n", + "2. And includes two fully worked out RAG examples with synthetic reasoning and answers, e.g. `how to transfer whatsapp voice message to computer?`.\n", + "\n", + "Let's now evaluate on the overall devset." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 183.28 / 300 (61.1%): 100%|██████████| 300/300 [00:13<00:00, 22.20it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/11/23 22:14:01 INFO dspy.evaluate.evaluate: Average Metric: 183.27658621624977 / 300 (61.1%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionexample_responsegold_doc_idsreasoningpred_responseSemanticF1
0when to use c over c++, and c++ over c?If you are equally familiar with both C++ and C, it's advisable to...[733]The context provides insights into the strengths and weaknesses of...You should consider using C over C++ in scenarios where simplicity...✔️ [0.333]
1should images be stored in a git repository?One viewpoint expresses that there is no significant downside, esp...[6253, 6254, 6275, 6278, 8215]The context discusses the challenges and considerations of storing...Storing images in a Git repository is generally considered bad pra...✔️ [0.500]
\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 when to use c over c++, and c++ over c? \n", + "1 should images be stored in a git repository? \n", + "\n", + " example_response \\\n", + "0 If you are equally familiar with both C++ and C, it's advisable to... \n", + "1 One viewpoint expresses that there is no significant downside, esp... \n", + "\n", + " gold_doc_ids \\\n", + "0 [733] \n", + "1 [6253, 6254, 6275, 6278, 8215] \n", + "\n", + " reasoning \\\n", + "0 The context provides insights into the strengths and weaknesses of... \n", + "1 The context discusses the challenges and considerations of storing... \n", + "\n", + " pred_response \\\n", + "0 You should consider using C over C++ in scenarios where simplicity... \n", + "1 Storing images in a Git repository is generally considered bad pra... \n", + "\n", + " SemanticF1 \n", + "0 ✔️ [0.333] \n", + "1 ✔️ [0.500] " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "
\n", + " ... 298 more rows not displayed ...\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "61.09" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate(optimized_rag)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keeping an eye on cost.\n", + "\n", + "DSPy allows you to track the cost of your programs, which can be used to monitor the cost of your calls. Here, we'll show you how to track the cost of your programs with DSPy." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "cost = sum([x['cost'] for x in lm.history if x['cost'] is not None]) # in USD, as calculated by LiteLLM for certain providers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Saving and loading.\n", + "\n", + "The optimized program has a pretty simple structure on the inside. Feel free to explore it.\n", + "\n", + "Here, we'll save `optimized_rag` so we can load it again later without having to optimize from scratch." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Prediction(\n", + " reasoning='The context explains how the Command + Tab shortcut functions on macOS, particularly in relation to switching between applications. It notes that this shortcut does not bring back minimized or hidden windows directly. Instead, it cycles through applications that are currently open and visible. The information also suggests alternative methods for managing minimized windows and provides insights into how to navigate between applications effectively.',\n", + " response='The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.'\n", ")" ] }, @@ -1245,6 +2305,278 @@ "loaded_rag(question=\"cmd+tab does not work on hidden or minimized windows\")" ] }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "\u001b[34m[2024-11-23T22:14:01.562290]\u001b[0m\n", + "\n", + "\u001b[31mSystem message:\u001b[0m\n", + "\n", + "Your input fields are:\n", + "1. `question` (str)\n", + "2. `ground_truth` (str)\n", + "3. `system_response` (str)\n", + "\n", + "Your output fields are:\n", + "1. `reasoning` (str)\n", + "2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth\n", + "3. `system_response_key_ideas` (str): enumeration of key ideas in the system response\n", + "4. `discussion` (str): discussion of the overlap between ground truth and system response\n", + "5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n", + "6. 
`precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n", + "\n", + "All interactions will be structured in the following way, with the appropriate values filled in.\n", + "\n", + "[[ ## question ## ]]\n", + "{question}\n", + "\n", + "[[ ## ground_truth ## ]]\n", + "{ground_truth}\n", + "\n", + "[[ ## system_response ## ]]\n", + "{system_response}\n", + "\n", + "[[ ## reasoning ## ]]\n", + "{reasoning}\n", + "\n", + "[[ ## ground_truth_key_ideas ## ]]\n", + "{ground_truth_key_ideas}\n", + "\n", + "[[ ## system_response_key_ideas ## ]]\n", + "{system_response_key_ideas}\n", + "\n", + "[[ ## discussion ## ]]\n", + "{discussion}\n", + "\n", + "[[ ## recall ## ]]\n", + "{recall} # note: the value you produce must be a single float value\n", + "\n", + "[[ ## precision ## ]]\n", + "{precision} # note: the value you produce must be a single float value\n", + "\n", + "[[ ## completed ## ]]\n", + "\n", + "In adhering to this structure, your objective is: \n", + " Compare a system's response to the ground truth to compute recall and precision of key ideas.\n", + " You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.\n", + "\n", + "\n", + "\u001b[31mUser message:\u001b[0m\n", + "\n", + "[[ ## question ## ]]\n", + "how to use pip after the os x el capitan upgrade?\n", + "\n", + "[[ ## ground_truth ## ]]\n", + "One quick solution is to install Python using homebrew into a directory such as `/usr/local/bin`, which allows for the use of pip and python regardless of the system provided version. \n", + "Additionally, opting to use `pip3 install` alongside the desired package name helps to solve permission problem. \n", + "A direct and secure approach without altering system settings is to install Python modules for the individual user using the `pip install --user` command followed by the module name. \n", + "An alternative method involves installing Python with homebrew and then obtaining pip through the `sudo easy_install pip` command. \n", + "Lastly, using virtual environments, specifically using virtualenv or virtualenvwrapper, creates an isolated environment for each project, permitting seamless pip usage for package management within these enclosed spaces.\n", + "\n", + "[[ ## system_response ## ]]\n", + "After upgrading to OS X El Capitan, you can use pip by following these steps:\n", + "\n", + "1. **Check Python Installation**: Ensure that you have the correct version of Python installed. You can check this by running `python --version` or `python3 --version` in the terminal.\n", + "\n", + "2. **Install pip**: If pip is not already installed, you can install it using the following command:\n", + " ```bash\n", + " sudo easy_install pip\n", + " ```\n", + " Alternatively, if you are using a specific version of Python installed via MacPorts, make sure to use the corresponding pip version, such as `py36-pip` for Python 3.6.\n", + "\n", + "3. **Verify pip Installation**: After installation, verify that pip is working by running:\n", + " ```bash\n", + " pip --version\n", + " ```\n", + "\n", + "4. **Using pip**: You can now use pip to install packages. For example, to install the `nltk` package, you would run:\n", + " ```bash\n", + " pip install nltk\n", + " ```\n", + "\n", + "5. **Managing Multiple Versions**: If you have multiple versions of Python installed (e.g., via MacPorts or Homebrew), ensure you are using the correct pip associated with the desired Python version. 
You can do this by using the `port select` command to set the active Python version.\n", + "\n", + "6. **Check PATH**: If you encounter issues with pip not being recognized, check your PATH settings. You may need to add the directory containing pip to your PATH by editing the `/etc/paths` file:\n", + " ```bash\n", + " sudo vim /etc/paths\n", + " ```\n", + " Add the path to the pip installation (e.g., `/usr/local/bin`) at the end of the file, save, and exit.\n", + "\n", + "By following these steps, you should be able to use pip effectively after your OS X El Capitan upgrade.\n", + "\n", + "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## ground_truth_key_ideas ## ]]`, then `[[ ## system_response_key_ideas ## ]]`, then `[[ ## discussion ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n", + "\n", + "\n", + "\u001b[31mResponse:\u001b[0m\n", + "\n", + "\u001b[32m[[ ## reasoning ## ]]\n", + "The system response provides a detailed guide on how to use pip after upgrading to OS X El Capitan, including checking the Python installation, installing pip, verifying the installation, and managing multiple versions of Python. The ground truth also covers similar points but emphasizes using Homebrew for Python installation, using `pip3`, and installing packages for the individual user. Both responses address the installation and usage of pip, but the ground truth includes additional methods and best practices for managing Python environments.\n", + "\n", + "[[ ## ground_truth_key_ideas ## ]]\n", + "1. Install Python using Homebrew into `/usr/local/bin`.\n", + "2. Use `pip3 install` to solve permission issues.\n", + "3. Install Python modules for the individual user with `pip install --user`.\n", + "4. Install pip using `sudo easy_install pip`.\n", + "5. Use virtual environments (virtualenv or virtualenvwrapper) for isolated package management.\n", + "\n", + "[[ ## system_response_key_ideas ## ]]\n", + "1. Check Python installation version.\n", + "2. Install pip using `sudo easy_install pip`.\n", + "3. Verify pip installation.\n", + "4. Use pip to install packages (e.g., `pip install nltk`).\n", + "5. Manage multiple Python versions and check PATH settings.\n", + "\n", + "[[ ## discussion ## ]]\n", + "There is a significant overlap between the ground truth and the system response regarding the installation and usage of pip. Both mention the installation of pip and the verification of its functionality. However, the ground truth provides additional methods for managing Python installations and permissions, such as using Homebrew and user-specific installations. The system response focuses more on the steps to ensure pip is set up correctly after the OS X upgrade, including managing multiple Python versions and checking PATH settings. Overall, while both responses cover the essential aspects of using pip, the ground truth offers a broader perspective on installation methods and user management.\n", + "\n", + "[[ ## recall ## ]]\n", + "0.5\n", + "\n", + "[[ ## precision ## ]]\n", + "0.5\n", + "\n", + "[[ ## completed ## ]]\u001b[0m\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\u001b[34m[2024-11-23T22:14:01.708696]\u001b[0m\n", + "\n", + "\u001b[31mSystem message:\u001b[0m\n", + "\n", + "Your input fields are:\n", + "1. `context` (str)\n", + "2. 
`question` (str)\n", + "\n", + "Your output fields are:\n", + "1. `reasoning` (str)\n", + "2. `response` (str)\n", + "\n", + "All interactions will be structured in the following way, with the appropriate values filled in.\n", + "\n", + "[[ ## context ## ]]\n", + "{context}\n", + "\n", + "[[ ## question ## ]]\n", + "{question}\n", + "\n", + "[[ ## reasoning ## ]]\n", + "{reasoning}\n", + "\n", + "[[ ## response ## ]]\n", + "{response}\n", + "\n", + "[[ ## completed ## ]]\n", + "\n", + "In adhering to this structure, your objective is: \n", + " Using the provided `context` which contains detailed information about transferring photos from an iPhone to a computer, and the `question` regarding how to transfer edited photos specifically, generate a comprehensive `response` that outlines the necessary steps or methods to effectively transfer the edited photos. Ensure to highlight any limitations of direct import methods and suggest alternative approaches such as AirDrop, emailing, or using iCloud Photo Library for a successful transfer of edited images.\n", + "\n", + "\n", + "\u001b[31mUser message:\u001b[0m\n", + "\n", + "This is an example of the task, though some input or output fields are not supplied.\n", + "\n", + "[[ ## context ## ]]\n", + "Not supplied for this particular example.\n", + "\n", + "[[ ## question ## ]]\n", + "can i create a shortcut to open a specific application on os x?\n", + "\n", + "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n", + "\n", + "\n", + "\u001b[31mAssistant message:\u001b[0m\n", + "\n", + "[[ ## reasoning ## ]]\n", + "Not supplied for this particular example.\n", + "\n", + "[[ ## response ## ]]\n", + "While Mac OS X does have a built-in method for setting keyboard shortcuts, it doesn't work for Firefox. \n", + "Karabiner-Elements allows for customization of keyboard shortcuts on Mac versions 10.12 and later, including executing shell commands. \n", + "Alfred, particularly in its Power Pack version, offers the ability to map shortcuts to applications, among its various features. \n", + "Using Automator Services in combination with built-in keyboard shortcuts is another method to create shortcuts without third party apps. \n", + "Spark 3 is a free Shortcuts manager which enables you to create Hot Keys to launch applications and documents, execute AppleScript, control iTunes, etc. \n", + "Some users have found Apptivate to be the easiest to use for assigning global shortcuts to apps. \n", + "To open Terminal with a shortcut on Mac without third-party software, one can utilize a custom Quick Action created in Automator and assign it a unique hotkey, like ⌘ + ⌥ + F12. \n", + "For iTerm2 users with Oh My Zsh, one can replace AppleScript with the 'Launch Application' action in Automator. \n", + "Furthermore, Terminal app already has built-in shortcut keys for opening new terminals or tabs at a folder location: New Terminal at Folder ^+⇧+T New Terminal at Tab Folder ⌥+^+⇧+T.\n", + "\n", + "[[ ## completed ## ]]\n", + "\n", + "\n", + "\u001b[31mUser message:\u001b[0m\n", + "\n", + "[[ ## context ## ]]\n", + "[1] «On linux, /sbin/nologin comes from the util-linux project, while /bin/false is part of GNU Coreutils. They serve different roles, and nologin has the option of printing a message for people who have it as their shell who are logging in. 
The linux commands come from BSD, where they seem to have a long history of being different. The FreeBSD false simply returns 1, while the nologin checks to make sure its running on a TTY and sends a message to syslog during login attempts. The linux versions are a bit more complicated (false doing all sorts of fun stuff with internationalization for the output of --help, I assume) but essentially perform the same way.»\n", + "[2] «/bin/false is a system command that is used anytime you need to pass a command to a program that should do nothing more than exit with an error. Its the companion to /bin/true. Both of these are very old and standard POSIX utilities and neither produce any output by definition. true is sometimes used for a shell script that should loop indefinitely, like: while true; do ... # Waste time if [ $wasted_time -gt 100000 ]; then exit 0 fi ... done /usr/sbin/nologin is specifically designed to replace a shell and produces output complaining you cant log-in. Before it existed, it was common to use /bin/false for dummy users, but could be confusing since the user doesnt know why theyre kicked off.»\n", + "[3] «/bin/false only job is to exit with a non-zero exit code. Try it at the command line: $:> /bin/false $:> echo $? 1 $:> Some institutions use /bin/false in the shell field of the password file. If user tries to login, the shell is /bin/false, so they are exited right away»\n", + "[4] «Some FTP servers will allow you FTP access only if you have a valid shell. /sbin/nologin is regarded as a valid shell, whereas /bin/false is not. (I think valid means its exit status is 0, but /etc/shells may also come into it, it probably depends on the system, the FTP software, and your configuration.)»\n", + "[5] «You can use false (/bin/false, /usr/bin/false, or shell builtin): $ false || echo It failed. It failed. $ You can also use exit 1 from a subshell: $ (exit 1) || echo Gosh, it failed too. Gosh, it failed too. $»\n", + "\n", + "[[ ## question ## ]]\n", + "whats the difference between /sbin/nologin and /bin/false\n", + "\n", + "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n", + "\n", + "\n", + "\u001b[31mAssistant message:\u001b[0m\n", + "\n", + "[[ ## reasoning ## ]]\n", + "The context provides detailed information about the roles and behaviors of `/sbin/nologin` and `/bin/false`. `/sbin/nologin` is designed to prevent users from logging in while providing a message indicating that login is not allowed. In contrast, `/bin/false` simply exits with a non-zero exit code without any output, serving as a command that does nothing. The distinction lies in their intended use cases and the feedback they provide to users.\n", + "\n", + "[[ ## response ## ]]\n", + "The main difference between `/sbin/nologin` and `/bin/false` is their purpose and behavior during login attempts. `/sbin/nologin` is used to prevent users from logging in while displaying a message that informs them they cannot log in. It is considered a valid shell for certain applications, such as FTP servers. On the other hand, `/bin/false` is a command that simply exits with a non-zero exit code and does not provide any output. 
It is often used in situations where a command needs to do nothing and is not intended to inform the user of any restrictions.\n", + "\n", + "[[ ## completed ## ]]\n", + "\n", + "\n", + "\u001b[31mUser message:\u001b[0m\n", + "\n", + "[[ ## context ## ]]\n", + "[1] «If you hold command and quickly tap tab you will cycle between your two most recently used applications without bringing up the heads up display. If you want to use a third party app Witch has options to ignore minimized windows.»\n", + "[2] «Try this: On your Mac, Navigate to System Preferences Go to Mission Control Uncheck When switching to an application, switch to a Space with open windows for the application Try using the cmd+tab now.»\n", + "[3] «Command + Tab until you get the apps icon. Before releasing the Command key, press and hold the Option key. You must switch to another app and let it take focus first. In other words, you cant just Command + Tab to another app and before actually selecting that app (by releasing the Command and Tab keys), switch right back to your minimized app, which you might attempt to do if you minimized it by accident or just simply changed your mind shortly after minimizing. Both the Command and left Option keys must be pressed on the same side (left or right) of the keyboard.»\n", + "[4] «Command-option-click does not hide all open windows. As stated in the linked Apple article, this command hides all other open items leaving the clicked item visible. It is possible to hide all other items and minimize your most recently item with command+option+h+m. This is a combination of command+option+h - Hide Others and command+option+m - Minimize.»\n", + "[5] «Cmd+` of course will work but I do it in different way. I do the Cmd+tab when i do that, I dont release the Cmd so the opened apps are still visible. While it is visible, I will press ▼. This way it will show all your opened application with animated displayed so you can navigate using arrow keys.»\n", + "\n", + "[[ ## question ## ]]\n", + "cmd+tab does not work on hidden or minimized windows\n", + "\n", + "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n", + "\n", + "\n", + "\u001b[31mResponse:\u001b[0m\n", + "\n", + "\u001b[32m[[ ## reasoning ## ]]\n", + "The context explains how the Command + Tab shortcut functions on macOS, particularly in relation to switching between applications. It notes that this shortcut does not bring back minimized or hidden windows directly. Instead, it cycles through applications that are currently open and visible. The information also suggests alternative methods for managing minimized windows and provides insights into how to navigate between applications effectively.\n", + "\n", + "[[ ## response ## ]]\n", + "The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. 
Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.\n", + "\n", + "[[ ## completed ## ]]\u001b[0m\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "dspy.inspect_history(n=2)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1252,22 +2584,20 @@ "\n", "## What's next?\n", "\n", - "Improving from around 37% to approximately 60% on this task, in terms of `SemanticF1`, was pretty easy.\n", + "Improving from around 42% to approximately 61% on this task, in terms of `SemanticF1`, was pretty easy.\n", "\n", "But DSPy gives you paths to continue iterating on the quality of your system and we have barely scratched the surface.\n", "\n", "In general, you have the following tools:\n", "\n", - "1. Explore better system architectures for your program, e.g. what if we ask the LM to generate search queries for the retriever? See this [notebook](https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/intro.ipynb) or the [STORM pipeline](https://arxiv.org/abs/2402.14207) built in DSPy.\n", + "1. Explore better system architectures for your program, e.g. what if we ask the LM to generate search queries for the retriever? See, e.g., the [STORM pipeline](https://arxiv.org/abs/2402.14207) built in DSPy.\n", "2. Explore different [prompt optimizers](https://arxiv.org/abs/2406.11695) or [weight optimizers](https://arxiv.org/abs/2407.10930). See the **[Optimizers Docs](/building-blocks/6-optimizers)**.\n", "3. Scale inference time compute using DSPy Optimizers, e.g. this [notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/agents/multi_agent.ipynb).\n", - "4. Cut cost by distilling to a smaller LM, via prompt or weight optimization, e.g. [this notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb) or [this notebook](https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/examples/qa/hotpot/multihop_finetune.ipynb).\n", + "4. Cut cost by distilling to a smaller LM, via prompt or weight optimization, e.g. [this notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb).\n", "\n", "How do you decide which ones to proceed with first?\n", "\n", - "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users.\n", - "\n", - "Learn more about the [development cycle](/building-blocks/solving_your_task) in DSPy." + "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users." 
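As an aside on the judge traces above: the `recall` and `precision` floats they produce are folded into the single SemanticF1 number discussed in the adjacent markdown via the harmonic mean. A minimal sketch of that combination (not the library's exact implementation):

    def semantic_f1(recall: float, precision: float) -> float:
        # Harmonic mean of the judge-produced recall and precision;
        # return 0.0 when both are zero to avoid dividing by zero.
        if recall + precision == 0.0:
            return 0.0
        return 2 * recall * precision / (recall + precision)

    # The 0.5 / 0.5 trace above therefore scores 0.5.
    assert abs(semantic_f1(0.5, 0.5) - 0.5) < 1e-9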
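The improved scores come from a prompt-optimization run; its logs appear in a cell removed by a later patch in this series. A hedged sketch of how such a run is typically kicked off, where `RAG`, `trainset`, and the exact keyword arguments are assumptions based on the surrounding tutorial rather than a definitive API:

    import dspy

    metric = dspy.evaluate.SemanticF1(decompositional=True)  # assumed constructor flag
    tp = dspy.MIPROv2(metric=metric, auto="medium", num_threads=24)
    optimized_rag = tp.compile(RAG(), trainset=trainset,
                               max_bootstrapped_demos=2, max_labeled_demos=2)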
] } ], diff --git a/dspy/evaluate/auto_evaluation.py b/dspy/evaluate/auto_evaluation.py index d96d58f21..d98332143 100644 --- a/dspy/evaluate/auto_evaluation.py +++ b/dspy/evaluate/auto_evaluation.py @@ -55,7 +55,7 @@ def forward(self, example, pred, trace=None): ########### -class DecompositionalSemanticRecall(dspy.Signature): +class AnswerCompleteness(dspy.Signature): """ Estimate the completeness of a system's responses, against the ground truth. You will first enumerate key ideas in each response, discuss their overlap, and then report completeness. @@ -71,7 +71,7 @@ class DecompositionalSemanticRecall(dspy.Signature): -class DecompositionalGroundedness(dspy.Signature): +class AnswerGroundedness(dspy.Signature): """ Estimate the groundedness of a system's responses, against real retrieved documents written by people. You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then @@ -89,8 +89,8 @@ class DecompositionalGroundedness(dspy.Signature): class CompleteAndGrounded(dspy.Module): def __init__(self, threshold=0.66): self.threshold = threshold - self.completeness_module = dspy.ChainOfThought(DecompositionalSemanticRecall) - self.groundedness_module = dspy.ChainOfThought(DecompositionalGroundedness) + self.completeness_module = dspy.ChainOfThought(AnswerCompleteness) + self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness) def forward(self, example, pred, trace=None): completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response) diff --git a/dspy/utils/__init__.py b/dspy/utils/__init__.py index f12b34b18..ba205504e 100644 --- a/dspy/utils/__init__.py +++ b/dspy/utils/__init__.py @@ -2,3 +2,17 @@ from dspy.utils.dummies import * from dspy.utils.caching import * from dspy.utils.logging_utils import * + +import os +import ujson +import requests + +def download(url): + filename = os.path.basename(url) + remote_size = int(requests.head(url, allow_redirects=True).headers.get('Content-Length', 0)) + local_size = os.path.getsize(filename) if os.path.exists(filename) else 0 + + if local_size != remote_size: + print(f"Downloading '{filename}'...") + with requests.get(url, stream=True) as r, open(filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): f.write(chunk) From f0efe4549b167b46a6edf046f3591aa188dc932f Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 22:24:54 -0800 Subject: [PATCH 08/19] Wrap disk_cache.size_limit adjustment in an if stmt --- dspy/clients/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dspy/clients/__init__.py b/dspy/clients/__init__.py index de2e76f3c..546a96c75 100644 --- a/dspy/clients/__init__.py +++ b/dspy/clients/__init__.py @@ -13,7 +13,9 @@ # TODO: There's probably value in getting litellm to support FanoutCache and to separate the limit for # the LM cache from the embeddings cache. Then we can lower the default 30GB limit. 
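For reference, the `download` helper added to `dspy/utils/__init__.py` above compares the remote Content-Length header against the size of any existing local copy and streams the file down only when the sizes differ, so repeated calls become cheap no-ops. A usage sketch with a hypothetical URL:

    from dspy.utils import download

    # Hypothetical URL: the file is saved under its basename in the current
    # working directory, and a second call is skipped once the sizes match.
    download("https://example.com/datasets/ragqa_arena_tech_examples.jsonl")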
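As background for the hunk that follows: diskcache persists settings such as `size_limit` inside the cache directory, and `reset` rewrites the setting on every call, so guarding it behind an equality check avoids redundant work each time the module is imported. A standalone sketch of the same pattern, assuming diskcache's documented `Cache` API:

    import diskcache

    DESIRED_LIMIT = 30 * 1024 ** 3  # assumed 30 GB, per the TODO above

    cache = diskcache.Cache("/tmp/demo_cache")
    # Settings are exposed as attributes; rewrite only when the value differs.
    if cache.size_limit != DESIRED_LIMIT:
        cache.reset("size_limit", DESIRED_LIMIT)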
litellm.cache = Cache(disk_cache_dir=DISK_CACHE_DIR, type="disk") -litellm.cache.cache.disk_cache.reset('size_limit', DISK_CACHE_LIMIT) + +if litellm.cache.cache.disk_cache.size_limit != DISK_CACHE_LIMIT: + litellm.cache.cache.disk_cache.reset('size_limit', DISK_CACHE_LIMIT) litellm.telemetry = False From bff476a0b05f93280d94e4423d411b0b8a87a532 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 22:25:26 -0800 Subject: [PATCH 09/19] Small tutorial adjustment --- docs/docs/tutorials/rag/index.ipynb | 272 ---------------------------- 1 file changed, 272 deletions(-) diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb index a2dbe1108..36aa2f03c 100644 --- a/docs/docs/tutorials/rag/index.ipynb +++ b/docs/docs/tutorials/rag/index.ipynb @@ -2305,278 +2305,6 @@ "loaded_rag(question=\"cmd+tab does not work on hidden or minimized windows\")" ] }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "\n", - "\u001b[34m[2024-11-23T22:14:01.562290]\u001b[0m\n", - "\n", - "\u001b[31mSystem message:\u001b[0m\n", - "\n", - "Your input fields are:\n", - "1. `question` (str)\n", - "2. `ground_truth` (str)\n", - "3. `system_response` (str)\n", - "\n", - "Your output fields are:\n", - "1. `reasoning` (str)\n", - "2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth\n", - "3. `system_response_key_ideas` (str): enumeration of key ideas in the system response\n", - "4. `discussion` (str): discussion of the overlap between ground truth and system response\n", - "5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n", - "6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n", - "\n", - "All interactions will be structured in the following way, with the appropriate values filled in.\n", - "\n", - "[[ ## question ## ]]\n", - "{question}\n", - "\n", - "[[ ## ground_truth ## ]]\n", - "{ground_truth}\n", - "\n", - "[[ ## system_response ## ]]\n", - "{system_response}\n", - "\n", - "[[ ## reasoning ## ]]\n", - "{reasoning}\n", - "\n", - "[[ ## ground_truth_key_ideas ## ]]\n", - "{ground_truth_key_ideas}\n", - "\n", - "[[ ## system_response_key_ideas ## ]]\n", - "{system_response_key_ideas}\n", - "\n", - "[[ ## discussion ## ]]\n", - "{discussion}\n", - "\n", - "[[ ## recall ## ]]\n", - "{recall} # note: the value you produce must be a single float value\n", - "\n", - "[[ ## precision ## ]]\n", - "{precision} # note: the value you produce must be a single float value\n", - "\n", - "[[ ## completed ## ]]\n", - "\n", - "In adhering to this structure, your objective is: \n", - " Compare a system's response to the ground truth to compute recall and precision of key ideas.\n", - " You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.\n", - "\n", - "\n", - "\u001b[31mUser message:\u001b[0m\n", - "\n", - "[[ ## question ## ]]\n", - "how to use pip after the os x el capitan upgrade?\n", - "\n", - "[[ ## ground_truth ## ]]\n", - "One quick solution is to install Python using homebrew into a directory such as `/usr/local/bin`, which allows for the use of pip and python regardless of the system provided version. \n", - "Additionally, opting to use `pip3 install` alongside the desired package name helps to solve permission problem. 
\n", - "A direct and secure approach without altering system settings is to install Python modules for the individual user using the `pip install --user` command followed by the module name. \n", - "An alternative method involves installing Python with homebrew and then obtaining pip through the `sudo easy_install pip` command. \n", - "Lastly, using virtual environments, specifically using virtualenv or virtualenvwrapper, creates an isolated environment for each project, permitting seamless pip usage for package management within these enclosed spaces.\n", - "\n", - "[[ ## system_response ## ]]\n", - "After upgrading to OS X El Capitan, you can use pip by following these steps:\n", - "\n", - "1. **Check Python Installation**: Ensure that you have the correct version of Python installed. You can check this by running `python --version` or `python3 --version` in the terminal.\n", - "\n", - "2. **Install pip**: If pip is not already installed, you can install it using the following command:\n", - " ```bash\n", - " sudo easy_install pip\n", - " ```\n", - " Alternatively, if you are using a specific version of Python installed via MacPorts, make sure to use the corresponding pip version, such as `py36-pip` for Python 3.6.\n", - "\n", - "3. **Verify pip Installation**: After installation, verify that pip is working by running:\n", - " ```bash\n", - " pip --version\n", - " ```\n", - "\n", - "4. **Using pip**: You can now use pip to install packages. For example, to install the `nltk` package, you would run:\n", - " ```bash\n", - " pip install nltk\n", - " ```\n", - "\n", - "5. **Managing Multiple Versions**: If you have multiple versions of Python installed (e.g., via MacPorts or Homebrew), ensure you are using the correct pip associated with the desired Python version. You can do this by using the `port select` command to set the active Python version.\n", - "\n", - "6. **Check PATH**: If you encounter issues with pip not being recognized, check your PATH settings. You may need to add the directory containing pip to your PATH by editing the `/etc/paths` file:\n", - " ```bash\n", - " sudo vim /etc/paths\n", - " ```\n", - " Add the path to the pip installation (e.g., `/usr/local/bin`) at the end of the file, save, and exit.\n", - "\n", - "By following these steps, you should be able to use pip effectively after your OS X El Capitan upgrade.\n", - "\n", - "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## ground_truth_key_ideas ## ]]`, then `[[ ## system_response_key_ideas ## ]]`, then `[[ ## discussion ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n", - "\n", - "\n", - "\u001b[31mResponse:\u001b[0m\n", - "\n", - "\u001b[32m[[ ## reasoning ## ]]\n", - "The system response provides a detailed guide on how to use pip after upgrading to OS X El Capitan, including checking the Python installation, installing pip, verifying the installation, and managing multiple versions of Python. The ground truth also covers similar points but emphasizes using Homebrew for Python installation, using `pip3`, and installing packages for the individual user. Both responses address the installation and usage of pip, but the ground truth includes additional methods and best practices for managing Python environments.\n", - "\n", - "[[ ## ground_truth_key_ideas ## ]]\n", - "1. 
Install Python using Homebrew into `/usr/local/bin`.\n", - "2. Use `pip3 install` to solve permission issues.\n", - "3. Install Python modules for the individual user with `pip install --user`.\n", - "4. Install pip using `sudo easy_install pip`.\n", - "5. Use virtual environments (virtualenv or virtualenvwrapper) for isolated package management.\n", - "\n", - "[[ ## system_response_key_ideas ## ]]\n", - "1. Check Python installation version.\n", - "2. Install pip using `sudo easy_install pip`.\n", - "3. Verify pip installation.\n", - "4. Use pip to install packages (e.g., `pip install nltk`).\n", - "5. Manage multiple Python versions and check PATH settings.\n", - "\n", - "[[ ## discussion ## ]]\n", - "There is a significant overlap between the ground truth and the system response regarding the installation and usage of pip. Both mention the installation of pip and the verification of its functionality. However, the ground truth provides additional methods for managing Python installations and permissions, such as using Homebrew and user-specific installations. The system response focuses more on the steps to ensure pip is set up correctly after the OS X upgrade, including managing multiple Python versions and checking PATH settings. Overall, while both responses cover the essential aspects of using pip, the ground truth offers a broader perspective on installation methods and user management.\n", - "\n", - "[[ ## recall ## ]]\n", - "0.5\n", - "\n", - "[[ ## precision ## ]]\n", - "0.5\n", - "\n", - "[[ ## completed ## ]]\u001b[0m\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\u001b[34m[2024-11-23T22:14:01.708696]\u001b[0m\n", - "\n", - "\u001b[31mSystem message:\u001b[0m\n", - "\n", - "Your input fields are:\n", - "1. `context` (str)\n", - "2. `question` (str)\n", - "\n", - "Your output fields are:\n", - "1. `reasoning` (str)\n", - "2. `response` (str)\n", - "\n", - "All interactions will be structured in the following way, with the appropriate values filled in.\n", - "\n", - "[[ ## context ## ]]\n", - "{context}\n", - "\n", - "[[ ## question ## ]]\n", - "{question}\n", - "\n", - "[[ ## reasoning ## ]]\n", - "{reasoning}\n", - "\n", - "[[ ## response ## ]]\n", - "{response}\n", - "\n", - "[[ ## completed ## ]]\n", - "\n", - "In adhering to this structure, your objective is: \n", - " Using the provided `context` which contains detailed information about transferring photos from an iPhone to a computer, and the `question` regarding how to transfer edited photos specifically, generate a comprehensive `response` that outlines the necessary steps or methods to effectively transfer the edited photos. 
Ensure to highlight any limitations of direct import methods and suggest alternative approaches such as AirDrop, emailing, or using iCloud Photo Library for a successful transfer of edited images.\n", - "\n", - "\n", - "\u001b[31mUser message:\u001b[0m\n", - "\n", - "This is an example of the task, though some input or output fields are not supplied.\n", - "\n", - "[[ ## context ## ]]\n", - "Not supplied for this particular example.\n", - "\n", - "[[ ## question ## ]]\n", - "can i create a shortcut to open a specific application on os x?\n", - "\n", - "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n", - "\n", - "\n", - "\u001b[31mAssistant message:\u001b[0m\n", - "\n", - "[[ ## reasoning ## ]]\n", - "Not supplied for this particular example.\n", - "\n", - "[[ ## response ## ]]\n", - "While Mac OS X does have a built-in method for setting keyboard shortcuts, it doesn't work for Firefox. \n", - "Karabiner-Elements allows for customization of keyboard shortcuts on Mac versions 10.12 and later, including executing shell commands. \n", - "Alfred, particularly in its Power Pack version, offers the ability to map shortcuts to applications, among its various features. \n", - "Using Automator Services in combination with built-in keyboard shortcuts is another method to create shortcuts without third party apps. \n", - "Spark 3 is a free Shortcuts manager which enables you to create Hot Keys to launch applications and documents, execute AppleScript, control iTunes, etc. \n", - "Some users have found Apptivate to be the easiest to use for assigning global shortcuts to apps. \n", - "To open Terminal with a shortcut on Mac without third-party software, one can utilize a custom Quick Action created in Automator and assign it a unique hotkey, like ⌘ + ⌥ + F12. \n", - "For iTerm2 users with Oh My Zsh, one can replace AppleScript with the 'Launch Application' action in Automator. \n", - "Furthermore, Terminal app already has built-in shortcut keys for opening new terminals or tabs at a folder location: New Terminal at Folder ^+⇧+T New Terminal at Tab Folder ⌥+^+⇧+T.\n", - "\n", - "[[ ## completed ## ]]\n", - "\n", - "\n", - "\u001b[31mUser message:\u001b[0m\n", - "\n", - "[[ ## context ## ]]\n", - "[1] «On linux, /sbin/nologin comes from the util-linux project, while /bin/false is part of GNU Coreutils. They serve different roles, and nologin has the option of printing a message for people who have it as their shell who are logging in. The linux commands come from BSD, where they seem to have a long history of being different. The FreeBSD false simply returns 1, while the nologin checks to make sure its running on a TTY and sends a message to syslog during login attempts. The linux versions are a bit more complicated (false doing all sorts of fun stuff with internationalization for the output of --help, I assume) but essentially perform the same way.»\n", - "[2] «/bin/false is a system command that is used anytime you need to pass a command to a program that should do nothing more than exit with an error. Its the companion to /bin/true. Both of these are very old and standard POSIX utilities and neither produce any output by definition. true is sometimes used for a shell script that should loop indefinitely, like: while true; do ... # Waste time if [ $wasted_time -gt 100000 ]; then exit 0 fi ... 
done /usr/sbin/nologin is specifically designed to replace a shell and produces output complaining you cant log-in. Before it existed, it was common to use /bin/false for dummy users, but could be confusing since the user doesnt know why theyre kicked off.»\n", - "[3] «/bin/false only job is to exit with a non-zero exit code. Try it at the command line: $:> /bin/false $:> echo $? 1 $:> Some institutions use /bin/false in the shell field of the password file. If user tries to login, the shell is /bin/false, so they are exited right away»\n", - "[4] «Some FTP servers will allow you FTP access only if you have a valid shell. /sbin/nologin is regarded as a valid shell, whereas /bin/false is not. (I think valid means its exit status is 0, but /etc/shells may also come into it, it probably depends on the system, the FTP software, and your configuration.)»\n", - "[5] «You can use false (/bin/false, /usr/bin/false, or shell builtin): $ false || echo It failed. It failed. $ You can also use exit 1 from a subshell: $ (exit 1) || echo Gosh, it failed too. Gosh, it failed too. $»\n", - "\n", - "[[ ## question ## ]]\n", - "whats the difference between /sbin/nologin and /bin/false\n", - "\n", - "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n", - "\n", - "\n", - "\u001b[31mAssistant message:\u001b[0m\n", - "\n", - "[[ ## reasoning ## ]]\n", - "The context provides detailed information about the roles and behaviors of `/sbin/nologin` and `/bin/false`. `/sbin/nologin` is designed to prevent users from logging in while providing a message indicating that login is not allowed. In contrast, `/bin/false` simply exits with a non-zero exit code without any output, serving as a command that does nothing. The distinction lies in their intended use cases and the feedback they provide to users.\n", - "\n", - "[[ ## response ## ]]\n", - "The main difference between `/sbin/nologin` and `/bin/false` is their purpose and behavior during login attempts. `/sbin/nologin` is used to prevent users from logging in while displaying a message that informs them they cannot log in. It is considered a valid shell for certain applications, such as FTP servers. On the other hand, `/bin/false` is a command that simply exits with a non-zero exit code and does not provide any output. It is often used in situations where a command needs to do nothing and is not intended to inform the user of any restrictions.\n", - "\n", - "[[ ## completed ## ]]\n", - "\n", - "\n", - "\u001b[31mUser message:\u001b[0m\n", - "\n", - "[[ ## context ## ]]\n", - "[1] «If you hold command and quickly tap tab you will cycle between your two most recently used applications without bringing up the heads up display. If you want to use a third party app Witch has options to ignore minimized windows.»\n", - "[2] «Try this: On your Mac, Navigate to System Preferences Go to Mission Control Uncheck When switching to an application, switch to a Space with open windows for the application Try using the cmd+tab now.»\n", - "[3] «Command + Tab until you get the apps icon. Before releasing the Command key, press and hold the Option key. You must switch to another app and let it take focus first. 
In other words, you cant just Command + Tab to another app and before actually selecting that app (by releasing the Command and Tab keys), switch right back to your minimized app, which you might attempt to do if you minimized it by accident or just simply changed your mind shortly after minimizing. Both the Command and left Option keys must be pressed on the same side (left or right) of the keyboard.»\n", - "[4] «Command-option-click does not hide all open windows. As stated in the linked Apple article, this command hides all other open items leaving the clicked item visible. It is possible to hide all other items and minimize your most recently item with command+option+h+m. This is a combination of command+option+h - Hide Others and command+option+m - Minimize.»\n", - "[5] «Cmd+` of course will work but I do it in different way. I do the Cmd+tab when i do that, I dont release the Cmd so the opened apps are still visible. While it is visible, I will press ▼. This way it will show all your opened application with animated displayed so you can navigate using arrow keys.»\n", - "\n", - "[[ ## question ## ]]\n", - "cmd+tab does not work on hidden or minimized windows\n", - "\n", - "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## response ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.\n", - "\n", - "\n", - "\u001b[31mResponse:\u001b[0m\n", - "\n", - "\u001b[32m[[ ## reasoning ## ]]\n", - "The context explains how the Command + Tab shortcut functions on macOS, particularly in relation to switching between applications. It notes that this shortcut does not bring back minimized or hidden windows directly. Instead, it cycles through applications that are currently open and visible. The information also suggests alternative methods for managing minimized windows and provides insights into how to navigate between applications effectively.\n", - "\n", - "[[ ## response ## ]]\n", - "The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. 
Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.\n", - "\n", - "[[ ## completed ## ]]\u001b[0m\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "dspy.inspect_history(n=2)" - ] - }, { "cell_type": "markdown", "metadata": {}, From 846c495eb1a19233b0586cdf3e94ca001edfe654 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 22:31:38 -0800 Subject: [PATCH 10/19] Small tutorial adjustment --- docs/docs/tutorials/rag/index.ipynb | 1000 +-------------------------- 1 file changed, 2 insertions(+), 998 deletions(-) diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb index 36aa2f03c..79cb6a070 100644 --- a/docs/docs/tutorials/rag/index.ipynb +++ b/docs/docs/tutorials/rag/index.ipynb @@ -1035,1005 +1035,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n", - "num_trials: 25\n", - "minibatch: True\n", - "num_candidates: 19\n", - "valset size: 160\n", - "\n", - "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n", - "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n", - "\n", - "2024/11/23 22:13:17 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapping set 1/19\n", - "Bootstrapping set 2/19\n", - "Bootstrapping set 3/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 10%|█ | 4/40 [00:00<00:04, 8.97it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n", - "Bootstrapping set 4/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 15%|█▌ | 6/40 [00:00<00:03, 8.98it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.\n", - "Bootstrapping set 5/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 2%|▎ | 1/40 [00:00<00:04, 9.16it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", - "Bootstrapping set 6/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 8%|▊ | 3/40 [00:00<00:04, 9.16it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Bootstrapping set 7/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 5%|▌ | 2/40 [00:00<00:03, 9.53it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", - "Bootstrapping set 8/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 12%|█▎ | 5/40 [00:00<00:03, 
8.94it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.\n", - "Bootstrapping set 9/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 5%|▌ | 2/40 [00:00<00:04, 9.15it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", - "Bootstrapping set 10/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 8%|▊ | 3/40 [00:00<00:04, 9.11it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Bootstrapping set 11/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 8%|▊ | 3/40 [00:00<00:04, 8.67it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Bootstrapping set 12/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 5%|▌ | 2/40 [00:00<00:04, 8.49it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", - "Bootstrapping set 13/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 5%|▌ | 2/40 [00:00<00:04, 8.91it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", - "Bootstrapping set 14/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 2%|▎ | 1/40 [00:00<00:04, 9.13it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", - "Bootstrapping set 15/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 2%|▎ | 1/40 [00:00<00:04, 9.16it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", - "Bootstrapping set 16/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 5%|▌ | 2/40 [00:00<00:04, 9.24it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", - "Bootstrapping set 17/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 25%|██▌ | 10/40 [00:01<00:03, 8.74it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.\n", - "Bootstrapping set 18/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 8%|▊ | 3/40 [00:00<00:04, 8.40it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Bootstrapping set 19/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 8%|▊ | 3/40 [00:00<00:04, 8.64it/s]\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "==> STEP 2: PROPOSE 
INSTRUCTION CANDIDATES <==\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "Proposing instructions...\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Given the fields `context`, `question`, produce the fields `response`.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Using the provided `context` about Mac OS X commands and the `question` related to troubleshooting or file management, generate a detailed response. Begin by outlining the reasoning process step-by-step, then provide a comprehensive answer that not only addresses the question but also includes practical applications and comparisons where relevant.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a technical support assistant. Given the fields `context` and `question`, analyze the provided context to extract relevant information and produce a detailed and coherent `response` that answers the question based on the information available.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 3: You are a technical support assistant. Given the fields `context` and `question`, provide a clear and structured `response` that outlines the methods for locking the screen in the XFCE desktop environment, using the information available in the `context`. Make sure to highlight the most effective methods and include any relevant details to enhance user understanding.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a technical expert in data integrity and security. Given the fields `context` and `question`, produce a well-reasoned `response` that clearly explains the differences between a hash function and a checksum, incorporating relevant details from the context provided.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 5: You are a shell scripting expert. Given the context that provides various examples and explanations related to temporary files in shell scripting, along with a specific question about how to create a temporary file, produce a detailed response that includes a code snippet demonstrating the use of the `mktemp` command and how to properly manage the temporary file within a shell script. Make sure to emphasize the importance of cleanup after the file is used.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 6: Using the provided `context` and `question`, analyze the information and generate a detailed yet concise `response` that effectively summarizes the main arguments and conclusions regarding the practice of commenting every line of code. 
Ensure that the response reflects the nuances of the context and provides a clear stance on the issue.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 7: Using the provided `context` which contains detailed information about transferring photos from an iPhone to a computer, and the `question` regarding how to transfer edited photos specifically, generate a comprehensive `response` that outlines the necessary steps or methods to effectively transfer the edited photos. Ensure to highlight any limitations of direct import methods and suggest alternative approaches such as AirDrop, emailing, or using iCloud Photo Library for a successful transfer of edited images.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 8: Imagine you are a technical support specialist assisting a user who is facing issues with their Mac OS X commands. They have a critical deadline and need reliable solutions to their questions. Your task is to provide detailed, accurate responses based on the given context and questions. Given the fields `context`, `question`, produce the fields `response` to help the user understand complex technical concepts and troubleshoot effectively.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 9: Imagine you are a technical support agent assisting a user who has lost important data on their LG G2 device and needs to recover their TWRP backups urgently. Given the fields `context`, which contains vital information about where TWRP backups can be stored, and `question`, which asks specifically about the locations of these backups, produce a detailed `response` that guides the user on how to locate their backups effectively.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 10: Using the provided `context` that contains information about deleting empty directories, and the `question` regarding how to recursively delete empty directories in the user's home directory, generate a detailed `response` that includes the appropriate command and an explanation of its components. Ensure to highlight the use of options like `-type`, `-empty`, and `-exec` in the command, and consider providing variations for additional clarity, such as including the verbose option for user feedback.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 11: Based on the provided context and question, generate a detailed response that outlines the steps necessary to export a private key from a Java Keytool keystore. Include commands for converting the keystore from JKS to PKCS#12 format and for using OpenSSL to extract the private key. Emphasize the security implications of handling private keys and provide clear instructions on replacing placeholders with actual values.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 12: You are a technical support assistant with expertise in Mac OS X commands. Given the fields `context` and `question`, provide a detailed and informative `response` that clarifies the distinctions or relationships between the concepts discussed in the context.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a privacy advocate explaining why someone would trust DuckDuckGo or similar providers with a privacy policy. 
Given the fields `context` and `question`, provide a detailed response that outlines the reasons for this trust, incorporating aspects such as the clarity of the privacy policy, technical implementations, user control, legal accountability, and community feedback.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 14: Using the provided `context` and `question`, generate a detailed and coherent `response` that explains the reasons someone might trust DuckDuckGo or similar privacy-focused providers based on their privacy policies and practices.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Using the provided `context` that contains detailed information about Mac OS X commands and locations related to user account pictures, along with the `question` regarding where Mac stores these account pictures, generate a structured and informative `response`. Ensure that your response accurately summarizes the key locations and relevant details mentioned in the context, and clearly addresses the question posed.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 16: Using the provided `context` and `question`, generate a comprehensive `response` that summarizes the key points, compares the advantages and disadvantages of the concepts discussed, and offers practical insights based on the information available. Ensure that the response is clear, organized, and addresses the user's inquiry effectively.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Given the context that describes various methods for creating temporary files in shell scripts, along with a specific question about how to create a temporary file, generate a detailed response that includes examples of using the `mktemp` command, ensuring to explain the importance of cleanup after file usage.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: 18: You are a technical support assistant. Given the fields `context`, `question`, produce the fields `response`. Ensure that your response is detailed and provides step-by-step guidance based on the context provided.\n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "\n", - "2024/11/23 22:13:23 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Average Metric: 89.01 / 160 (55.6%): 100%|██████████| 160/160 [00:04<00:00, 37.54it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:27 INFO dspy.evaluate.evaluate: Average Metric: 89.0075423349221 / 160 (55.6%)\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 55.63\n", - "\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n", - "\n", - "/opt/anaconda3/envs/jun2024_py310/lib/python3.10/site-packages/optuna/samplers/_tpe/sampler.py:319: ExperimentalWarning: ``multivariate`` option is an experimental feature. 
The interface can change in the future.\n", - " warnings.warn(\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.72 / 25 (58.9%): 100%|██████████| 25/25 [00:00<00:00, 96.95it/s] " - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:27 INFO dspy.evaluate.evaluate: Average Metric: 14.719867707788584 / 25 (58.9%)\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.88 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88]\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 13.88 / 25 (55.5%): 100%|██████████| 25/25 [00:00<00:00, 99.17it/s] " - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 13.87639947083419 / 25 (55.5%)\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 55.51 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.56 / 25 (62.3%): 100%|██████████| 25/25 [00:00<00:00, 99.46it/s] " - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 15.563671185234691 / 25 (62.3%)\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.25 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.54 / 25 (58.2%): 100%|██████████| 25/25 [00:00<00:00, 97.02it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 
22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 14.542840231125426 / 25 (58.2%)\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.17 on minibatch of size 25 with parameters ['Predictor 0: Instruction 15', 'Predictor 0: Few-Shot Set 2'].\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.75 / 25 (63.0%): 100%|██████████| 25/25 [00:00<00:00, 104.42it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:28 INFO dspy.evaluate.evaluate: Average Metric: 15.746005444613344 / 25 (63.0%)\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.98 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:28 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.68 / 25 (58.7%): 100%|██████████| 25/25 [00:00<00:00, 107.78it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 14.683617165143385 / 25 (58.7%)\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.73 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 1'].\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.93 / 25 (63.7%): 100%|██████████| 25/25 [00:00<00:00, 106.66it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.934088959267559 / 25 (63.7%)\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.74 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch 
scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.52 / 25 (62.1%): 100%|██████████| 25/25 [00:00<00:00, 100.22it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.52144781700213 / 25 (62.1%)\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.09 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11', 'Predictor 0: Few-Shot Set 13'].\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.54 / 25 (62.2%): 100%|██████████| 25/25 [00:00<00:00, 104.70it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:29 INFO dspy.evaluate.evaluate: Average Metric: 15.541098318140321 / 25 (62.2%)\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.16 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 4'].\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", - "\n", - "\n", - "2024/11/23 22:13:29 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 12.33 / 25 (49.3%): 100%|██████████| 25/25 [00:00<00:00, 72.31it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:30 INFO dspy.evaluate.evaluate: Average Metric: 12.332086462618921 / 25 (49.3%)\n", - "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 49.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14', 'Predictor 0: Few-Shot Set 1'].\n", - "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33]\n", - "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63]\n", - "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 55.63\n", - 
"2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n", - "2024/11/23 22:13:30 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 63.74) from minibatch trials...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 98.06 / 160 (61.3%): 100%|██████████| 160/160 [00:01<00:00, 139.10it/s]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 98.06249092576995 / 160 (61.3%)\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mNew best full eval score!\u001b[0m Score: 61.29\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average Metric: 15.61 / 25 (62.5%): 100%|██████████| 25/25 [00:00<00:00, 105.23it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 15.612633878081091 / 25 (62.5%)\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.45 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 12'].\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45]\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.03 / 25 (60.1%): 100%|██████████| 25/25 [00:00<00:00, 100.46it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:31 INFO dspy.evaluate.evaluate: Average Metric: 15.03300812819276 / 25 (60.1%)\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.13 on minibatch of size 25 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 18'].\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13]\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n" - ] - }, - 
{ - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.43 / 25 (57.7%): 100%|██████████| 25/25 [00:00<00:00, 112.91it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.430989267101385 / 25 (57.7%)\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.72 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 12'].\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.68 / 25 (58.7%): 100%|██████████| 25/25 [00:00<00:00, 95.62it/s] " - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.681540371022235 / 25 (58.7%)\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.73 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 5'].\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.54 / 25 (58.2%): 100%|██████████| 25/25 [00:00<00:00, 100.56it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.53865209268966 / 25 (58.2%)\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.15 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 14'].\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.83 / 25 (59.3%): 100%|██████████| 25/25 
[00:00<00:00, 108.11it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:32 INFO dspy.evaluate.evaluate: Average Metric: 14.832026371762414 / 25 (59.3%)\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.33 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 18'].\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 17.22 / 25 (68.9%): 100%|██████████| 25/25 [00:00<00:00, 105.12it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 17.216978671345192 / 25 (68.9%)\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.87 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87]\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.12 / 25 (60.5%): 100%|██████████| 25/25 [00:00<00:00, 97.80it/s] " - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 15.123535939830598 / 25 (60.5%)\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 60.49 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6'].\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49]\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.26 / 25 (61.0%): 100%|██████████| 25/25 [00:00<00:00, 99.12it/s] " - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "2024/11/23 22:13:33 INFO dspy.evaluate.evaluate: Average Metric: 15.256960301954985 / 25 (61.0%)\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 61.03 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 14'].\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03]\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.61 / 25 (58.4%): 100%|██████████| 25/25 [00:00<00:00, 102.38it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:34 INFO dspy.evaluate.evaluate: Average Metric: 14.607005004992326 / 25 (58.4%)\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.43 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 9'].\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43]\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29]\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n", - "2024/11/23 22:13:34 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 68.87) from minibatch trials...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 97.23 / 160 (60.8%): 100%|██████████| 160/160 [00:11<00:00, 14.01it/s] " - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 97.22622109571304 / 160 (60.8%)\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 16.54 / 25 (66.2%): 100%|██████████| 25/25 [00:00<00:00, 112.10it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 16.54482901646923 / 25 (66.2%)\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 66.18 on 
minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18]\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.84 / 25 (59.4%): 100%|██████████| 25/25 [00:00<00:00, 113.00it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:45 INFO dspy.evaluate.evaluate: Average Metric: 14.837814582612035 / 25 (59.4%)\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 59.35 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 6'].\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35]\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:45 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.71 / 25 (58.8%): 100%|██████████| 25/25 [00:00<00:00, 105.76it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 14.711485027993763 / 25 (58.8%)\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 58.85 on minibatch of size 25 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 15'].\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85]\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 15.14 / 25 (60.6%): 100%|██████████| 25/25 [00:00<00:00, 95.66it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 15.144601379869599 / 25 (60.6%)\n", - "2024/11/23 22:13:46 INFO 
dspy.teleprompt.mipro_optimizer_v2: Score: 60.58 on minibatch of size 25 with parameters ['Predictor 0: Instruction 18', 'Predictor 0: Few-Shot Set 8'].\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85, 60.58]\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 14.26 / 25 (57.0%): 100%|██████████| 25/25 [00:00<00:00, 103.69it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:46 INFO dspy.evaluate.evaluate: Average Metric: 14.257718170019547 / 25 (57.0%)\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 57.03 on minibatch of size 25 with parameters ['Predictor 0: Instruction 16', 'Predictor 0: Few-Shot Set 0'].\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [58.88, 55.51, 62.25, 58.17, 62.98, 58.73, 63.74, 62.09, 62.16, 49.33, 62.45, 60.13, 57.72, 58.73, 58.15, 59.33, 68.87, 60.49, 61.03, 58.43, 66.18, 59.35, 58.85, 60.58, 57.03]\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77]\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", - "\n", - "\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n", - "2024/11/23 22:13:46 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 62.45) from minibatch trials...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 95.14 / 160 (59.5%): 100%|██████████| 160/160 [00:01<00:00, 143.17it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/23 22:13:47 INFO dspy.evaluate.evaluate: Average Metric: 95.13659459156446 / 160 (59.5%)\n", - "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [55.63, 61.29, 60.77, 59.46]\n", - "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 61.29\n", - "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", - "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "\n", - "2024/11/23 22:13:47 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 61.29!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "tp = dspy.MIPROv2(metric=metric, auto=\"medium\", num_threads=24) # use fewer threads if your rate limit is small\n", "\n", From 48fe4707ed5292a0c0395b342b3256b164c5091f Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 22:32:07 -0800 Subject: [PATCH 11/19] Small tutorial adjustment --- .../tutorials/entity_extraction/index.ipynb | 323 
+++--------------- 1 file changed, 40 insertions(+), 283 deletions(-) diff --git a/docs/docs/tutorials/entity_extraction/index.ipynb b/docs/docs/tutorials/entity_extraction/index.ipynb index 8ec298196..f298add18 100644 --- a/docs/docs/tutorials/entity_extraction/index.ipynb +++ b/docs/docs/tutorials/entity_extraction/index.ipynb @@ -239,7 +239,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 172.00 / 200 (86.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 200/200 [00:16<00:00, 11.94it/s]" + "Average Metric: 172.00 / 200 (86.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:16<00:00, 11.94it/s]" ] }, { @@ -299,7 +299,7 @@ " [Nadim, Ladki]\n", " We extracted the tokens \"Nadim\" and \"Ladki\" as they refer to speci...\n", " [Nadim, Ladki]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 2\n", @@ -307,7 +307,7 @@ " []\n", " There are no tokens referring to specific people in the provided l...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 3\n", @@ -315,7 +315,7 @@ " []\n", " We did not find any tokens referring to specific people in the pro...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 4\n", @@ -339,7 +339,7 @@ " [David, Campese]\n", " The extracted_people includes \"David Campese\" as it refers to a sp...\n", " [David, Campese]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 196\n", @@ -347,7 +347,7 @@ " []\n", " The extracted_people includes \"Wallabies\" as it refers to a specif...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 197\n", @@ -355,7 +355,7 @@ " [Campese, Rob, Andrew]\n", " The extracted tokens refer to specific people mentioned in the tex...\n", " [Campese, Rob, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 198\n", @@ -363,7 +363,7 @@ " [Campo, Andrew]\n", " The extracted tokens referring to specific people include \"Campo\" ...\n", " [Campo, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 199\n", @@ -371,11 +371,11 @@ " []\n", " We extracted the names of specific people from the tokenized text....\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", "\n", - "

200 rows \u00d7 5 columns

\n", + "

200 rows × 5 columns

\n", "" ], "text/plain": [ @@ -420,16 +420,16 @@ "\n", " extracted_people extraction_correctness_metric \n", "0 [JAPAN, CHINA] \n", - "1 [Nadim, Ladki] \u2714\ufe0f [True] \n", - "2 [] \u2714\ufe0f [True] \n", - "3 [] \u2714\ufe0f [True] \n", + "1 [Nadim, Ladki] ✔️ [True] \n", + "2 [] ✔️ [True] \n", + "3 [] ✔️ [True] \n", "4 [China, Uzbekistan] \n", ".. ... ... \n", - "195 [David, Campese] \u2714\ufe0f [True] \n", - "196 [] \u2714\ufe0f [True] \n", - "197 [Campese, Rob, Andrew] \u2714\ufe0f [True] \n", - "198 [Campo, Andrew] \u2714\ufe0f [True] \n", - "199 [] \u2714\ufe0f [True] \n", + "195 [David, Campese] ✔️ [True] \n", + "196 [] ✔️ [True] \n", + "197 [Campese, Rob, Andrew] ✔️ [True] \n", + "198 [Campo, Andrew] ✔️ [True] \n", + "199 [] ✔️ [True] \n", "\n", "[200 rows x 5 columns]" ] @@ -469,252 +469,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n", - "num_trials: 25\n", - "minibatch: False\n", - "num_candidates: 19\n", - "valset size: 40\n", - "\n", - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n", - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n", - "\n", - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapping set 1/19\n", - "Bootstrapping set 2/19\n", - "Bootstrapping set 3/19\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n...\n", - "...\n", - "...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Bootstrapping set 19/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 40%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u258a | 4/10 [00:00<00:00, 995.21it/s]\n", - "2024/11/18 21:08:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n", - "2024/11/18 21:08:17 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:08:21 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "Proposing instructions...\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extract contiguous tokens referring to specific people, if any, from a list of string 
tokens.\n", - "Output a list of tokens. In other words, do not combine multiple tokens into a single value.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given a list of tokenized text, identify and extract all contiguous tokens that refer to specific individuals. Ensure that the output is a list of these tokens without combining them into single values. Provide a clear rationale explaining the reasoning behind each extraction.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 2: In a high-stakes scenario where accurate identification of EU officials is crucial for compliance with new health regulations affecting livestock, extract contiguous tokens from the provided list that refer to specific individuals. Ensure that your output is a comprehensive list of these tokens, as any oversight could lead to significant regulatory implications. Remember, do not combine multiple tokens into a single value; each name must be clearly delineated.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Given a list of tokenized text strings, identify and extract any contiguous tokens that refer to specific individuals. Provide a rationale for your extraction process, explaining the reasoning step by step. Output the extracted names as a list of tokens, ensuring that multiple tokens are not combined into a single value.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a Named Entity Recognition expert. Your task is to extract contiguous tokens that refer to specific people from the provided list of string tokens. If there are no specific individuals mentioned, return an empty list. Ensure that you do not combine multiple tokens into a single value; output them as a list.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 5: Given the tokenized text, extract contiguous tokens that refer to specific individuals. If there are no references to identifiable people, indicate that no people have been extracted. Provide a rationale for your reasoning process along with the list of extracted names.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 6: In a critical situation where accurate identification of EU officials is essential for compliance with new regulations, extract contiguous tokens from the provided list of string tokens that specifically refer to individuals. Ensure that your output is a list of distinct tokens without combining them into single values. This task is vital for ensuring clear communication in health communications regarding livestock, particularly in the context of sheep and mad cow disease.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 7: In a high-stakes situation where accurate identification of individuals is critical for regulatory compliance and public health communication, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that you output each identified individual as separate tokens without combining multiple tokens into a single value. This task is essential for ensuring clarity and accountability in communications pertaining to EU regulations and health matters.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 8: Given a list of tokenized text, identify and extract any contiguous sequences of tokens that refer specifically to individuals. 
Ensure that the output is a list of tokens representing those names, and do not merge multiple tokens into a single value. Provide reasoning for your extraction process, clearly stating if specific individuals were found or if the tokens did not contain any references to people.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 9: In a high-stakes scenario where accurate identification of EU officials is critical for public health communications regarding livestock diseases, extract contiguous tokens that refer to specific people from the provided list of string tokens. Ensure that the output is a list of tokens, without combining multiple tokens into a single value. Provide a clear rationale explaining the reasoning behind the identification of these tokens as referring to specific individuals.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 10: Identify and extract contiguous tokens from the provided list that specifically refer to individuals. Ensure that the output consists of distinct tokens representing the names, without merging them into single values.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 11: In a critical situation where accurate identification of key individuals is essential for effective communication regarding EU regulations and health communications, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that the output is a list of tokens without combining them into a single value. This task is crucial for clarity in reporting and decision-making processes.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 12: In a critical situation where accurate identification of key individuals is essential for public health communications regarding EU regulations on livestock, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that the output is a list of individual tokens, maintaining their separation to facilitate precise recognition of each person mentioned in the context.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 13: In a high-stakes situation where accurate identification of individuals is critical for regulatory compliance and public health communication, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that you output each identified individual as separate tokens without combining multiple tokens into a single value. This task is essential for ensuring clarity and accountability in communications pertaining to EU regulations and health matters.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 14: You are a Named Entity Recognition expert. Your task is to extract contiguous tokens referring to specific people from a list of string tokens. Please ensure that you output a list of tokens without combining them into a single value. Provide a rationale for your extraction, explaining why the identified tokens refer to a specific person.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Given a list of tokenized words, identify and extract contiguous tokens that refer to specific individuals. 
Provide a rationale explaining the reasoning behind the extraction process, and output a list of the identified tokens without combining them into single values.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 16: You are an AI text analyzer. Your task is to extract contiguous tokens that refer to specific individuals from a list of string tokens. Carefully examine the tokens and output a list of those that represent people. If no tokens refer to individuals, return an empty list. Remember to provide a rationale explaining your extraction process.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 17: In a critical situation where EU regulations regarding livestock health are being discussed, it is essential to accurately identify and extract the names of officials involved in these discussions. Given a list of tokenized text, extract contiguous tokens that refer to specific individuals. Ensure that each name is output as separate tokens, as combining them could lead to confusion. This information is vital for understanding the key players in the regulatory landscape and their statements on issues like mad cow disease and sheep health.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 18: In a high-stakes situation where accurate identification of key individuals is crucial for regulatory compliance and public health communication, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that your output is a list of tokens representing individuals, without combining multiple tokens into a single value. This extraction is vital for understanding the roles and actions of officials in EU regulations related to livestock health.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average Metric: 34.00 / 40 (85.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:10<00:00, 3.69it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:10:16 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 85.0\n", - "\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n", - "\n", - "/Users/corey.zumar/miniconda3/envs/default/lib/python3.10/site-packages/optuna/samplers/_tpe/sampler.py:319: 
ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n", - " warnings.warn(\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 34.00 / 40 (85.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:17<00:00, 2.31it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:10:34 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0]\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: ========================\n", - "\n", - "\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 36.00 / 40 (90.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:09<00:00, 4.16it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:10:43 INFO dspy.evaluate.evaluate: Average Metric: 36 / 40 (90.0%)\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mBest full score so far!\u001b[0m Score: 90.0\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 90.0 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0, 90.0]\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 90.0\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: ========================\n", - "\n", - "\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 39.00 / 40 (97.5%): 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:10<00:00, 3.68it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n...\n", - "...\n", - "...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:14:37 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0, 90.0, 97.5, 95.0, 97.5, 82.5, 92.5, 85.0, 77.5, 85.0, 97.5, 97.5, 97.5, 95.0, 95.0, 97.5, 85.0, 90.0, 97.5, 92.5, 95.0, 95.0, 95.0, 85.0]\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: =========================\n", - "\n", - "\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 39.00 / 40 (97.5%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:00<00:00, 2609.25it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:14:37 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0, 90.0, 97.5, 95.0, 97.5, 82.5, 92.5, 85.0, 77.5, 85.0, 97.5, 97.5, 97.5, 95.0, 95.0, 97.5, 85.0, 90.0, 97.5, 92.5, 95.0, 95.0, 95.0, 85.0, 97.5]\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: =========================\n", - "\n", - "\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 97.5!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "mipro_optimizer = dspy.MIPROv2(\n", " metric=extraction_correctness_metric,\n", 
@@ -751,7 +508,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 186.00 / 200 (93.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 200/200 [00:23<00:00, 8.58it/s]" + "Average Metric: 186.00 / 200 (93.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:23<00:00, 8.58it/s]" ] }, { @@ -811,7 +568,7 @@ " [Nadim, Ladki]\n", " The tokens \"Nadim Ladki\" refer to a specific individual. Both toke...\n", " [Nadim, Ladki]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 2\n", @@ -819,7 +576,7 @@ " []\n", " There are no tokens referring to specific people in the provided l...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 3\n", @@ -827,7 +584,7 @@ " []\n", " There are no specific people mentioned in the provided tokens. The...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 4\n", @@ -835,7 +592,7 @@ " []\n", " There are no tokens referring to specific people in the provided l...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " ...\n", @@ -851,7 +608,7 @@ " [David, Campese]\n", " The extracted tokens refer to a specific person mentioned in the t...\n", " [David, Campese]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 196\n", @@ -859,7 +616,7 @@ " []\n", " There are no specific individuals mentioned in the provided tokens...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 197\n", @@ -867,7 +624,7 @@ " [Campese, Rob, Andrew]\n", " The tokens include the names \"Campese\" and \"Rob Andrew,\" both of w...\n", " [Campese, Rob, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 198\n", @@ -875,7 +632,7 @@ " [Campo, Andrew]\n", " The extracted tokens refer to specific people mentioned in the tex...\n", " [Campo, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 199\n", @@ -883,11 +640,11 @@ " []\n", " There are no specific people mentioned in the provided tokens. The...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", "\n", - "

200 rows \u00d7 5 columns

\n", + "

200 rows × 5 columns

\n", "" ], "text/plain": [ @@ -932,16 +689,16 @@ "\n", " extracted_people extraction_correctness_metric \n", "0 [] \n", - "1 [Nadim, Ladki] \u2714\ufe0f [True] \n", - "2 [] \u2714\ufe0f [True] \n", - "3 [] \u2714\ufe0f [True] \n", - "4 [] \u2714\ufe0f [True] \n", + "1 [Nadim, Ladki] ✔️ [True] \n", + "2 [] ✔️ [True] \n", + "3 [] ✔️ [True] \n", + "4 [] ✔️ [True] \n", ".. ... ... \n", - "195 [David, Campese] \u2714\ufe0f [True] \n", - "196 [] \u2714\ufe0f [True] \n", - "197 [Campese, Rob, Andrew] \u2714\ufe0f [True] \n", - "198 [Campo, Andrew] \u2714\ufe0f [True] \n", - "199 [] \u2714\ufe0f [True] \n", + "195 [David, Campese] ✔️ [True] \n", + "196 [] ✔️ [True] \n", + "197 [Campese, Rob, Andrew] ✔️ [True] \n", + "198 [Campo, Andrew] ✔️ [True] \n", + "199 [] ✔️ [True] \n", "\n", "[200 rows x 5 columns]" ] From 0eb1e04bfc131897199748fba621222905bca220 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sat, 23 Nov 2024 23:17:58 -0800 Subject: [PATCH 12/19] tutorial adjustment --- docs/docs/tutorials/rag/index.ipynb | 61 +++++++++++++++++------------ 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb index 79cb6a070..45ce46b7c 100644 --- a/docs/docs/tutorials/rag/index.ipynb +++ b/docs/docs/tutorials/rag/index.ipynb @@ -94,7 +94,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-23T22:12:48.901453]\u001b[0m\n", + "\u001b[34m[2024-11-23T23:16:35.966534]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -404,7 +404,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-23T22:12:49.329836]\u001b[0m\n", + "\u001b[34m[2024-11-23T23:16:36.149518]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -527,14 +527,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 125.68 / 300 (41.9%): 100%|██████████| 300/300 [00:00<00:00, 598.18it/s]" + "Average Metric: 125.68 / 300 (41.9%): 100%|██████████| 300/300 [00:00<00:00, 666.96it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024/11/23 22:12:49 INFO dspy.evaluate.evaluate: Average Metric: 125.68228336477591 / 300 (41.9%)\n" + "2024/11/23 23:16:36 INFO dspy.evaluate.evaluate: Average Metric: 125.68228336477591 / 300 (41.9%)\n" ] }, { @@ -699,13 +699,24 @@ "source": [ "## Set up your system's retriever.\n", "\n", - "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Here, we'll just use OpenAI Embeddings and do top-K search locally, just for convenience." + "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Here, we'll just use OpenAI Embeddings and do top-K search locally, just for convenience.\n", + "\n", + "**Note:** The step below will require that you either do `pip install -U faiss-cpu` or pass `brute_force_threshold=30_000` to `dspy.retrievers.Embeddings` to avoid faiss." 
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, + "outputs": [], + "source": [ + "# %pip install -U faiss-cpu # or faiss-gpu if you have a GPU" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -744,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -767,7 +778,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -779,7 +790,7 @@ ")" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -802,7 +813,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-23T22:13:02.348625]\u001b[0m\n", + "\u001b[34m[2024-11-23T23:16:49.175612]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -879,21 +890,21 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 166.39 / 300 (55.5%): 100%|██████████| 300/300 [00:14<00:00, 20.29it/s]" + "Average Metric: 166.54 / 300 (55.5%): 100%|██████████| 300/300 [00:04<00:00, 61.40it/s] " ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024/11/23 22:13:17 INFO dspy.evaluate.evaluate: Average Metric: 166.39410892098812 / 300 (55.5%)\n" + "2024/11/23 23:16:54 INFO dspy.evaluate.evaluate: Average Metric: 166.53601368289284 / 300 (55.5%)\n" ] }, { @@ -1008,10 +1019,10 @@ { "data": { "text/plain": [ - "55.46" + "55.51" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1057,7 +1068,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1075,7 +1086,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1111,21 +1122,21 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 183.28 / 300 (61.1%): 100%|██████████| 300/300 [00:13<00:00, 22.20it/s] " + "Average Metric: 183.32 / 300 (61.1%): 100%|██████████| 300/300 [00:02<00:00, 104.48it/s]" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2024/11/23 22:14:01 INFO dspy.evaluate.evaluate: Average Metric: 183.27658621624977 / 300 (61.1%)\n" + "2024/11/23 23:17:21 INFO dspy.evaluate.evaluate: Average Metric: 183.3194433591069 / 300 (61.1%)\n" ] }, { @@ -1240,10 +1251,10 @@ { "data": { "text/plain": [ - "61.09" + "61.11" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1263,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -1283,7 +1294,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1295,7 +1306,7 @@ ")" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } From 5c2ee6ac67ee6babf57f4081a0c78a4bf6a29a84 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sun, 24 Nov 2024 08:29:56 -0800 Subject: [PATCH 13/19] Convert dspy.settings to a ContextVar, improve ParallelExecutor (isolate even if 1 thread), and permit 
user-launched threads (#1852) * Convert dspy.settings to a ContextVar, improve ParallelExecutor (isolate even if 1 thread), and permit user-launched threads * Fixes --- dsp/utils/settings.py | 123 ++++++++++++++++++++----------------- dspy/utils/asyncify.py | 21 +------ dspy/utils/parallelizer.py | 98 ++++++++++++++++------------- 3 files changed, 124 insertions(+), 118 deletions(-) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index 00f01eeaf..118a61fdc 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -1,7 +1,8 @@ +import copy import threading -from contextlib import contextmanager -from copy import deepcopy +from contextlib import contextmanager +from contextvars import ContextVar from dsp.utils.utils import dotdict DEFAULT_CONFIG = dotdict( @@ -27,85 +28,95 @@ async_max_workers=8, ) +# Global base configuration +main_thread_config = copy.deepcopy(DEFAULT_CONFIG) + +# Initialize the context variable with an empty dict as default +dspy_ctx_overrides = ContextVar('dspy_ctx_overrides', default=dotdict()) + class Settings: - """DSP configuration settings.""" + """ + A singleton class for DSPy configuration settings. + + This is thread-safe. User threads are supported both through ParallelExecutor and native threading. + - If native threading is used, the thread inherits the initial config from the main thread. + - If ParallelExecutor is used, the thread inherits the initial config from its parent thread. + """ _instance = None def __new__(cls): - """ - Singleton Pattern. See https://python-patterns.guide/gang-of-four/singleton/ - """ - if cls._instance is None: cls._instance = super().__new__(cls) - cls._instance.lock = threading.Lock() - cls._instance.main_tid = threading.get_ident() - cls._instance.main_stack = [] - cls._instance.stack_by_thread = {} - cls._instance.stack_by_thread[threading.get_ident()] = cls._instance.main_stack + cls._instance.lock = threading.Lock() # maintained here for assertions + return cls._instance - # TODO: remove first-class support for re-ranker and potentially combine with RM to form a pipeline of sorts - # eg: RetrieveThenRerankPipeline(RetrievalModel, Reranker) - # downstream operations like dsp.retrieve would use configs from the defined pipeline. 
+ def __getattr__(self, name): + overrides = dspy_ctx_overrides.get() + if name in overrides: + return overrides[name] + elif name in main_thread_config: + return main_thread_config[name] + else: + raise AttributeError(f"'Settings' object has no attribute '{name}'") - # make a deepcopy of the default config to avoid modifying the default config - cls._instance.__append(deepcopy(DEFAULT_CONFIG)) + def __setattr__(self, name, value): + if name in ('_instance',): + super().__setattr__(name, value) + else: + self.configure(**{name: value}) - return cls._instance + # Dictionary-like access - @property - def config(self): - thread_id = threading.get_ident() - if thread_id not in self.stack_by_thread: - self.stack_by_thread[thread_id] = [self.main_stack[-1].copy()] - return self.stack_by_thread[thread_id][-1] + def __getitem__(self, key): + return self.__getattr__(key) - def __getattr__(self, name): - if hasattr(self.config, name): - return getattr(self.config, name) + def __setitem__(self, key, value): + self.__setattr__(key, value) - if name in self.config: - return self.config[name] + def __contains__(self, key): + overrides = dspy_ctx_overrides.get() + return key in overrides or key in main_thread_config - super().__getattr__(name) + def get(self, key, default=None): + try: + return self[key] + except AttributeError: + return default - def __append(self, config): - thread_id = threading.get_ident() - if thread_id not in self.stack_by_thread: - self.stack_by_thread[thread_id] = [self.main_stack[-1].copy()] - self.stack_by_thread[thread_id].append(config) + def copy(self): + overrides = dspy_ctx_overrides.get() + return dotdict({**main_thread_config, **overrides}) - def __pop(self): - thread_id = threading.get_ident() - if thread_id in self.stack_by_thread: - self.stack_by_thread[thread_id].pop() + # Configuration methods - def configure(self, inherit_config: bool = True, **kwargs): - """Set configuration settings. + def configure(self, return_token=False, **kwargs): + global main_thread_config + overrides = dspy_ctx_overrides.get() + new_overrides = dotdict({**main_thread_config, **overrides, **kwargs}) + token = dspy_ctx_overrides.set(new_overrides) - Args: - inherit_config (bool, optional): Set configurations for the given, and use existing configurations for the rest. Defaults to True. 
- """ - if inherit_config: - config = {**self.config, **kwargs} - else: - config = {**kwargs} + # Update main_thread_config, in the main thread only + if threading.current_thread() is threading.main_thread(): + main_thread_config = new_overrides - self.__append(config) + if return_token: + return token @contextmanager - def context(self, inherit_config=True, **kwargs): - self.configure(inherit_config=inherit_config, **kwargs) - + def context(self, **kwargs): + """Context manager for temporary configuration changes.""" + token = self.configure(return_token=True, **kwargs) try: yield finally: - self.__pop() + dspy_ctx_overrides.reset(token) - def __repr__(self) -> str: - return repr(self.config) + def __repr__(self): + overrides = dspy_ctx_overrides.get() + combined_config = {**main_thread_config, **overrides} + return repr(combined_config) -settings = Settings() \ No newline at end of file +settings = Settings() diff --git a/dspy/utils/asyncify.py b/dspy/utils/asyncify.py index ca801e12a..03bd9a7e9 100644 --- a/dspy/utils/asyncify.py +++ b/dspy/utils/asyncify.py @@ -24,22 +24,7 @@ def get_limiter(): def asyncify(program): - import dspy import threading - - assert threading.get_ident() == dspy.settings.main_tid, "asyncify can only be called from the main thread" - - def wrapped(*args, **kwargs): - thread_stacks = dspy.settings.stack_by_thread - current_thread_id = threading.get_ident() - creating_new_thread = current_thread_id not in thread_stacks - - assert creating_new_thread - thread_stacks[current_thread_id] = list(dspy.settings.main_stack) - - try: - return program(*args, **kwargs) - finally: - del thread_stacks[threading.get_ident()] - - return asyncer.asyncify(wrapped, abandon_on_cancel=True, limiter=get_limiter()) + assert threading.current_thread() is threading.main_thread(), "asyncify can only be called from the main thread" + # NOTE: To allow this to be nested, we'd need behavior with contextvars like parallelizer.py + return asyncer.asyncify(program, abandon_on_cancel=True, limiter=get_limiter()) diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 27983632b..c6b5f3d5f 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -1,16 +1,15 @@ -import logging import sys import tqdm -import dspy import signal +import logging import threading import traceback import contextlib +from contextvars import copy_context from tqdm.contrib.logging import logging_redirect_tqdm from concurrent.futures import ThreadPoolExecutor, as_completed - logger = logging.getLogger(__name__) @@ -23,6 +22,8 @@ def __init__( provide_traceback=False, compare_results=False, ): + """Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1.""" + self.num_threads = num_threads self.disable_progress_bar = disable_progress_bar self.max_errors = max_errors @@ -33,34 +34,18 @@ def __init__( self.error_lock = threading.Lock() self.cancel_jobs = threading.Event() - def execute(self, function, data): wrapped_function = self._wrap_function(function) if self.num_threads == 1: - return self._execute_single_thread(wrapped_function, data) + return self._execute_isolated_single_thread(wrapped_function, data) else: return self._execute_multi_thread(wrapped_function, data) - def _wrap_function(self, function): - # Wrap the function with threading context and error handling - def wrapped(item, parent_id=None): - thread_stacks = dspy.settings.stack_by_thread - current_thread_id = threading.get_ident() - creating_new_thread = current_thread_id not in 
thread_stacks - - assert creating_new_thread or threading.get_ident() == dspy.settings.main_tid - - if creating_new_thread: - # If we have a parent thread ID, copy its stack. TODO: Should the caller just pass a copy of the stack? - if parent_id and parent_id in thread_stacks: - thread_stacks[current_thread_id] = list(thread_stacks[parent_id]) - else: - thread_stacks[current_thread_id] = list(dspy.settings.main_stack) - - # TODO: Consider the behavior below. - # import copy; thread_stacks[current_thread_id].append(copy.deepcopy(thread_stacks[current_thread_id][-1])) - + # Wrap the function with error handling + def wrapped(item): + if self.cancel_jobs.is_set(): + return None try: return function(item) except Exception as e: @@ -79,45 +64,53 @@ def wrapped(item, parent_id=None): f"Error processing item {item}: {e}. Set `provide_traceback=True` to see the stack trace." ) return None - finally: - if creating_new_thread: - del thread_stacks[threading.get_ident()] return wrapped - - def _execute_single_thread(self, function, data): + def _execute_isolated_single_thread(self, function, data): results = [] pbar = tqdm.tqdm( total=len(data), dynamic_ncols=True, disable=self.disable_progress_bar, - file=sys.stdout, + file=sys.stdout ) + for item in data: with logging_redirect_tqdm(): if self.cancel_jobs.is_set(): break - result = function(item) + + # Create an isolated context for each task + task_ctx = copy_context() + result = task_ctx.run(function, item) results.append(result) + if self.compare_results: # Assumes score is the last element of the result tuple - self._update_progress(pbar, sum([r[-1] for r in results if r is not None]), len([r for r in data if r is not None])) + self._update_progress( + pbar, + sum([r[-1] for r in results if r is not None]), + len([r for r in data if r is not None]), + ) else: self._update_progress(pbar, len(results), len(data)) + pbar.close() + if self.cancel_jobs.is_set(): logger.warning("Execution was cancelled due to errors.") raise Exception("Execution was cancelled due to errors.") - return results + return results def _update_progress(self, pbar, nresults, ntotal): if self.compare_results: - pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({round(100 * nresults / ntotal, 1) if ntotal > 0 else 0}%)") + percentage = round(100 * nresults / ntotal, 1) if ntotal > 0 else 0 + pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({percentage}%)") else: pbar.set_description(f"Processed {nresults} / {ntotal} examples") - pbar.update() + pbar.update() def _execute_multi_thread(self, function, data): results = [None] * len(data) # Pre-allocate results list to maintain order @@ -132,6 +125,7 @@ def interrupt_handler_manager(): def interrupt_handler(sig, frame): self.cancel_jobs.set() logger.warning("Received SIGINT. 
Cancelling execution.") + # Re-raise the signal to allow default behavior default_handler(sig, frame) signal.signal(signal.SIGINT, interrupt_handler) @@ -143,37 +137,53 @@ def interrupt_handler(sig, frame): # If not in the main thread, skip setting signal handlers yield - def cancellable_function(index_item, parent_id=None): + def cancellable_function(index_item): index, item = index_item if self.cancel_jobs.is_set(): return index, job_cancelled - return index, function(item, parent_id) - - parent_id = threading.get_ident() if threading.current_thread() is not threading.main_thread() else None + return index, function(item) with ThreadPoolExecutor(max_workers=self.num_threads) as executor, interrupt_handler_manager(): - futures = {executor.submit(cancellable_function, pair, parent_id): pair for pair in enumerate(data)} + futures = {} + for pair in enumerate(data): + # Capture the context for each task + task_ctx = copy_context() + future = executor.submit(task_ctx.run, cancellable_function, pair) + futures[future] = pair + pbar = tqdm.tqdm( total=len(data), dynamic_ncols=True, disable=self.disable_progress_bar, - file=sys.stdout, + file=sys.stdout ) for future in as_completed(futures): index, result = future.result() - + if result is job_cancelled: continue + results[index] = result if self.compare_results: # Assumes score is the last element of the result tuple - self._update_progress(pbar, sum([r[-1] for r in results if r is not None]), len([r for r in results if r is not None])) + self._update_progress( + pbar, + sum([r[-1] for r in results if r is not None]), + len([r for r in results if r is not None]), + ) else: - self._update_progress(pbar, len([r for r in results if r is not None]), len(data)) + self._update_progress( + pbar, + len([r for r in results if r is not None]), + len(data), + ) + pbar.close() + if self.cancel_jobs.is_set(): logger.warning("Execution was cancelled due to errors.") raise Exception("Execution was cancelled due to errors.") + return results From 21ab3bb2b54376020c5245d182b5ad9a21f28fd5 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sun, 24 Nov 2024 08:54:07 -0800 Subject: [PATCH 14/19] Fix settings.context finally block: main_thread_config --- dsp/utils/settings.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index 118a61fdc..8edceddbb 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -49,7 +49,7 @@ class Settings: def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) - cls._instance.lock = threading.Lock() # maintained here for assertions + # No need for a lock since we're only updating main_thread_config in the main thread return cls._instance def __getattr__(self, name): @@ -113,6 +113,10 @@ def context(self, **kwargs): finally: dspy_ctx_overrides.reset(token) + if threading.current_thread() is threading.main_thread(): + global main_thread_config + main_thread_config = dspy_ctx_overrides.get() + def __repr__(self): overrides = dspy_ctx_overrides.get() combined_config = {**main_thread_config, **overrides} From 56456e0810919e94efba6f81eb8493854be26ce6 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sun, 24 Nov 2024 08:58:05 -0800 Subject: [PATCH 15/19] fix settings.py --- dsp/utils/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index 8edceddbb..ffea8259e 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -49,7 +49,7 @@ class Settings: def 
__new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) - # No need for a lock since we're only updating main_thread_config in the main thread + cls._instance.lock = threading.Lock() # maintained here for DSPy assertions.py return cls._instance def __getattr__(self, name): From ff6d77fcba599b595262385c91ef451b4ed933ea Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sun, 24 Nov 2024 10:02:08 -0800 Subject: [PATCH 16/19] add back settings.config --- dsp/utils/settings.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index ffea8259e..3081eb693 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -89,6 +89,12 @@ def copy(self): overrides = dspy_ctx_overrides.get() return dotdict({**main_thread_config, **overrides}) + @property + def config(self): + config = self.copy() + del config['lock'] + return config + # Configuration methods def configure(self, return_token=False, **kwargs): From 2aa6f0172eff8d488b4d88e58168c97e641b6da0 Mon Sep 17 00:00:00 2001 From: Omar Khattab Date: Sun, 24 Nov 2024 13:36:05 -0800 Subject: [PATCH 17/19] fix settings contextmanager --- dsp/utils/settings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index 3081eb693..4ffbd23d9 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -100,7 +100,7 @@ def config(self): def configure(self, return_token=False, **kwargs): global main_thread_config overrides = dspy_ctx_overrides.get() - new_overrides = dotdict({**main_thread_config, **overrides, **kwargs}) + new_overrides = dotdict({**copy.deepcopy(DEFAULT_CONFIG), **main_thread_config, **overrides, **kwargs}) token = dspy_ctx_overrides.set(new_overrides) # Update main_thread_config, in the main thread only @@ -121,7 +121,7 @@ def context(self, **kwargs): if threading.current_thread() is threading.main_thread(): global main_thread_config - main_thread_config = dspy_ctx_overrides.get() + main_thread_config = dotdict({**copy.deepcopy(DEFAULT_CONFIG), **dspy_ctx_overrides.get()}) def __repr__(self): overrides = dspy_ctx_overrides.get() From ff6f5a873e46a7cc59e530d795f8d9b7a79a4bbd Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Mon, 25 Nov 2024 15:23:21 +0000 Subject: [PATCH 18/19] Bugfix/signature replace and pydantic 2.10 (#1855) * feat(dspy): add datamodel-code-generator to dev reqs * fix(dspy): fix signature replace for pydantic v2.10 * fix(dspy): fix signature replace for pydantic v2.10 --- dspy/signatures/signature.py | 45 ++++++++++++++++++++++++++---------- requirements-dev.txt | 5 ++-- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/dspy/signatures/signature.py b/dspy/signatures/signature.py index 88812be0c..387b9d771 100644 --- a/dspy/signatures/signature.py +++ b/dspy/signatures/signature.py @@ -1,5 +1,6 @@ import ast import inspect +import logging import re import types import typing @@ -11,8 +12,9 @@ from pydantic.fields import FieldInfo import dsp -from dspy.signatures.field import InputField, OutputField, new_to_old_field from dspy.adapters.image_utils import Image +from dspy.signatures.field import InputField, OutputField, new_to_old_field + def signature_to_template(signature, adapter=None) -> dsp.Template: """Convert from new to legacy format.""" @@ -242,8 +244,8 @@ class Signature(BaseModel, metaclass=SignatureMeta): @classmethod @contextmanager def replace( - cls: "Signature", - new_signature: "Signature", + cls, + new_signature: "Type[Signature]", 
validate_new_signature: bool = True,
     ) -> typing.Generator[None, None, None]:
         """Replace the signature with an updated version.
@@ -262,16 +264,35 @@ def replace(
                     f"Field '{field}' is missing from the updated signature '{new_signature.__class__}.",
                 )
 
-        class OldSignature(cls, Signature):
+        class OldSignature(cls):
             pass
 
-        replace_fields = ["__doc__", "model_fields", "model_extra", "model_config"]
-        for field in replace_fields:
-            setattr(cls, field, getattr(new_signature, field))
+        def swap_attributes(source: Type[Signature]):
+            unhandled = {}
+
+            for attr in ["__doc__", "__pydantic_fields__", "model_fields", "model_extra", "model_config"]:
+                try:
+                    setattr(cls, attr, getattr(source, attr))
+                except AttributeError as exc:
+                    if attr in ("__pydantic_fields__", "model_fields"):
+                        version = "< 2.10" if attr == "__pydantic_fields__" else ">= 2.10"
+                        logging.debug(f"Model attribute {attr} not replaced, expected with pydantic {version}")
+                        unhandled[attr] = exc
+                    else:
+                        raise exc
+
+            # if neither of the attributes were replaced, raise an error to prevent silent failures
+            if set(unhandled.keys()) >= {"model_fields", "__pydantic_fields__"}:
+                raise ValueError("Failed to replace either model_fields or __pydantic_fields__") from (
+                    unhandled.get("model_fields") or unhandled.get("__pydantic_fields__")
+                )
+
+        swap_attributes(new_signature)
         cls.model_rebuild(force=True)
+
         yield
+
-        for field in replace_fields:
-            setattr(cls, field, getattr(OldSignature, field))
+        swap_attributes(OldSignature)
         cls.model_rebuild(force=True)
 
@@ -383,7 +404,7 @@ def _parse_type_node(node, names=None) -> Any:
 
     without using structural pattern matching introduced in Python 3.10.
     """
-    
+
     if names is None:
         names = typing.__dict__
 
@@ -401,7 +422,7 @@ def _parse_type_node(node, names=None) -> Any:
         id_ = node.id
         if id_ in names:
             return names[id_]
-    
+
     for type_ in [int, str, float, bool, list, tuple, dict, Image]:
         if type_.__name__ == id_:
             return type_
@@ -420,7 +441,7 @@ def _parse_type_node(node, names=None) -> Any:
         keys = [kw.arg for kw in node.keywords]
         values = [kw.value.value for kw in node.keywords]
         return Field(**dict(zip(keys, values)))
-    
+
     if isinstance(node, ast.Attribute) and node.attr == "Image":
         return Image
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 98d89e732..23984fa07 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,4 +1,7 @@
 black==24.2.0
+datamodel-code-generator==0.26.3
+litellm[proxy]==1.51.0
+pillow==10.4.0
 pre-commit==3.7.0
 pytest==8.3.3
 pytest-env==1.1.3
@@ -6,5 +9,3 @@ pytest-mock==3.12.0
 ruff==0.3.0
 torch==2.2.1
 transformers==4.38.2
-pillow==10.4.0
-litellm[proxy]==1.51.0

From 1b10e234f2a52457b1b6504db4754ac73ae19077 Mon Sep 17 00:00:00 2001
From: Tim Kellogg
Date: Mon, 25 Nov 2024 11:17:45 -0500
Subject: [PATCH 19/19] Allow react.Tool to wrap methods (#1856)

The big reason for this is to pass parameters out-of-band, e.g. a
user_id, to ensure the LLM doesn't get the wrong data.

The unit test includes a usage example; you can't use it as a decorator
this way, but it works.

The alternative, of course, is to have one very long function with all
the tools as nested functions. That works, but it can lead to some very
long functions. I prefer long classes over long functions.
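
To make the out-of-band pattern concrete, here is a minimal sketch; it
assumes only the `Tool` behavior shown in the diff below, and `UserTools`,
`fetch_orders`, and `lookup_orders` are hypothetical names, not part of
this patch:

    from dspy.predict import react

    def lookup_orders(user_id: str, limit: int) -> list:
        # Hypothetical stand-in for real data access.
        return [f"order-{i}-for-{user_id}" for i in range(limit)]

    class UserTools:
        def __init__(self, user_id: str):
            # Bound at construction time; the LLM never sees or supplies it.
            self.user_id = user_id

        def fetch_orders(self, limit: int) -> list:
            """Fetch this user's most recent orders."""
            return lookup_orders(self.user_id, limit)

    tool = react.Tool(UserTools("user-123").fetch_orders)
    # Because bound methods are introspected directly, `self` is excluded:
    # tool.args should be {"limit": "int"}, so the model can only vary `limit`.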
--- dspy/predict/react.py | 2 +- tests/predict/test_react.py | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/dspy/predict/react.py b/dspy/predict/react.py index ce8edaa46..28640d5d1 100644 --- a/dspy/predict/react.py +++ b/dspy/predict/react.py @@ -9,7 +9,7 @@ class Tool: def __init__(self, func: Callable, name: str = None, desc: str = None, args: dict[str, Any] = None): - annotations_func = func if inspect.isfunction(func) else func.__call__ + annotations_func = func if inspect.isfunction(func) or inspect.ismethod(func) else func.__call__ self.func = func self.name = name or getattr(func, '__name__', type(func).__name__) self.desc = desc or getattr(func, '__doc__', None) or getattr(annotations_func, '__doc__', "") diff --git a/tests/predict/test_react.py b/tests/predict/test_react.py index 8435f86a9..4c6a150db 100644 --- a/tests/predict/test_react.py +++ b/tests/predict/test_react.py @@ -2,6 +2,7 @@ import dspy from dspy.utils.dummies import DummyLM, dummy_rm +from dspy.predict import react # def test_example_no_tools(): @@ -121,4 +122,28 @@ # react = dspy.ReAct(ExampleSignature) # assert react.react[0].signature.instructions is not None -# assert react.react[0].signature.instructions.startswith("You are going to generate output based on input.") \ No newline at end of file +# assert react.react[0].signature.instructions.startswith("You are going to generate output based on input.") + +def test_tool_from_function(): + def foo(a: int, b: int) -> int: + """Add two numbers.""" + return a + b + + tool = react.Tool(foo) + assert tool.name == "foo" + assert tool.desc == "Add two numbers." + assert tool.args == {"a": "int", "b": "int"} + +def test_tool_from_class(): + class Foo: + def __init__(self, user_id: str): + self.user_id = user_id + + def foo(self, a: int, b: int) -> int: + """Add two numbers.""" + return a + b + + tool = react.Tool(Foo("123").foo) + assert tool.name == "foo" + assert tool.desc == "Add two numbers." + assert tool.args == {"a": "int", "b": "int"}
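
The `annotations_func` fallback above (`func.__call__`) suggests that `Tool`
can also wrap callable objects, not just functions and bound methods. A
minimal sketch of that path, assuming the fallback behaves as the code
implies (`Multiplier` is a hypothetical name, and the expected values are
inferred, not taken from the tests):

    from dspy.predict import react

    class Multiplier:
        def __call__(self, a: int, b: int) -> int:
            """Multiply two numbers."""
            return a * b

    tool = react.Tool(Multiplier())
    # The instance has no __name__, so Tool should fall back to the type name,
    # while desc and args come from __call__'s docstring and annotations:
    # tool.name == "Multiplier", tool.desc == "Multiply two numbers.",
    # tool.args == {"a": "int", "b": "int"}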