diff --git a/docs/docs/index.md b/docs/docs/index.md index cb16c7c69..81b0b7f28 100644 --- a/docs/docs/index.md +++ b/docs/docs/index.md @@ -368,7 +368,7 @@ BootstrapFS on MATH with a tiny LM like Llama-3.2 with Ollama (maybe with a big ## 3) **DSPy's Ecosystem** advances open-source AI research. -Compared to working on or with monolithic LMs, DSPy's modular paradigm aims to enable a large community to improve the compositional architectures, inference-time strategies, and optimizers for LM programs in an open, distributed way. It gives you more control, helps you iterate much faster, and allows your programs to get better over time by applying the latest optimizers or modules. +Compared to monolithic LMs, DSPy's modular paradigm enables a large community to improve the compositional architectures, inference-time strategies, and optimizers for LM programs in an open, distributed way. This gives DSPy users more control, helps them iterate much faster, and allows their programs to get better over time by applying the latest optimizers or modules. The DSPy research effort started at Stanford NLP in Feb 2022, building on what we learned from developing early [compound LM systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/) like [ColBERT-QA](https://arxiv.org/abs/2007.00814), [Baleen](https://arxiv.org/abs/2101.00436), and [Hindsight](https://arxiv.org/abs/2110.07752). The first version was released as [DSP](https://arxiv.org/abs/2212.14024) in Dec 2022 and evolved by Oct 2023 into [DSPy](https://arxiv.org/abs/2310.03714). Thanks to [250 contributors](https://github.com/stanfordnlp/dspy/graphs/contributors), DSPy has introduced tens of thousands of people to building and optimizing modular LM programs. diff --git a/docs/docs/learn/programming/signatures.md b/docs/docs/learn/programming/signatures.md index af400b11a..450f69f64 100644 --- a/docs/docs/learn/programming/signatures.md +++ b/docs/docs/learn/programming/signatures.md @@ -64,7 +64,6 @@ The 21-year-old Lee made seven appearances and scored one goal for West Ham last Many DSPy modules (except `dspy.Predict`) return auxiliary information by expanding your signature under the hood. -For example, `dspy.ChainOfThought` also adds a `reasoning` field that includes the LM's reasoning before it generates the output `summary`. For example, `dspy.ChainOfThought` also adds a `reasoning` field that includes the LM's reasoning before it generates the output `summary`. 
```python diff --git a/docs/docs/tutorials/entity_extraction/index.ipynb b/docs/docs/tutorials/entity_extraction/index.ipynb index 8ec298196..f298add18 100644 --- a/docs/docs/tutorials/entity_extraction/index.ipynb +++ b/docs/docs/tutorials/entity_extraction/index.ipynb @@ -239,7 +239,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 172.00 / 200 (86.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 200/200 [00:16<00:00, 11.94it/s]" + "Average Metric: 172.00 / 200 (86.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:16<00:00, 11.94it/s]" ] }, { @@ -299,7 +299,7 @@ " [Nadim, Ladki]\n", " We extracted the tokens \"Nadim\" and \"Ladki\" as they refer to speci...\n", " [Nadim, Ladki]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 2\n", @@ -307,7 +307,7 @@ " []\n", " There are no tokens referring to specific people in the provided l...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 3\n", @@ -315,7 +315,7 @@ " []\n", " We did not find any tokens referring to specific people in the pro...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 4\n", @@ -339,7 +339,7 @@ " [David, Campese]\n", " The extracted_people includes \"David Campese\" as it refers to a sp...\n", " [David, Campese]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 196\n", @@ -347,7 +347,7 @@ " []\n", " The extracted_people includes \"Wallabies\" as it refers to a specif...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 197\n", @@ -355,7 +355,7 @@ " [Campese, Rob, Andrew]\n", " The extracted tokens refer to specific people mentioned in the tex...\n", " [Campese, Rob, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 198\n", @@ -363,7 +363,7 @@ " [Campo, Andrew]\n", " The extracted tokens referring to specific people include \"Campo\" ...\n", " [Campo, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 199\n", @@ -371,11 +371,11 @@ " []\n", " We extracted the names of specific people from the tokenized text....\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", "\n", - "

200 rows \u00d7 5 columns\n", + "200 rows × 5 columns
\n", "" ], "text/plain": [ @@ -420,16 +420,16 @@ "\n", " extracted_people extraction_correctness_metric \n", "0 [JAPAN, CHINA] \n", - "1 [Nadim, Ladki] \u2714\ufe0f [True] \n", - "2 [] \u2714\ufe0f [True] \n", - "3 [] \u2714\ufe0f [True] \n", + "1 [Nadim, Ladki] ✔️ [True] \n", + "2 [] ✔️ [True] \n", + "3 [] ✔️ [True] \n", "4 [China, Uzbekistan] \n", ".. ... ... \n", - "195 [David, Campese] \u2714\ufe0f [True] \n", - "196 [] \u2714\ufe0f [True] \n", - "197 [Campese, Rob, Andrew] \u2714\ufe0f [True] \n", - "198 [Campo, Andrew] \u2714\ufe0f [True] \n", - "199 [] \u2714\ufe0f [True] \n", + "195 [David, Campese] ✔️ [True] \n", + "196 [] ✔️ [True] \n", + "197 [Campese, Rob, Andrew] ✔️ [True] \n", + "198 [Campo, Andrew] ✔️ [True] \n", + "199 [] ✔️ [True] \n", "\n", "[200 rows x 5 columns]" ] @@ -469,252 +469,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n", - "num_trials: 25\n", - "minibatch: False\n", - "num_candidates: 19\n", - "valset size: 40\n", - "\n", - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n", - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.\n", - "\n", - "2024/11/18 21:08:04 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=19 sets of demonstrations...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapping set 1/19\n", - "Bootstrapping set 2/19\n", - "Bootstrapping set 3/19\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n...\n", - "...\n", - "...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", - "Bootstrapping set 19/19\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 40%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u258a | 4/10 [00:00<00:00, 995.21it/s]\n", - "2024/11/18 21:08:17 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n", - "2024/11/18 21:08:17 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:08:21 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "Proposing instructions...\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Extract contiguous tokens referring to specific people, if any, from a list of string 
tokens.\n", - "Output a list of tokens. In other words, do not combine multiple tokens into a single value.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Given a list of tokenized text, identify and extract all contiguous tokens that refer to specific individuals. Ensure that the output is a list of these tokens without combining them into single values. Provide a clear rationale explaining the reasoning behind each extraction.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 2: In a high-stakes scenario where accurate identification of EU officials is crucial for compliance with new health regulations affecting livestock, extract contiguous tokens from the provided list that refer to specific individuals. Ensure that your output is a comprehensive list of these tokens, as any oversight could lead to significant regulatory implications. Remember, do not combine multiple tokens into a single value; each name must be clearly delineated.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Given a list of tokenized text strings, identify and extract any contiguous tokens that refer to specific individuals. Provide a rationale for your extraction process, explaining the reasoning step by step. Output the extracted names as a list of tokens, ensuring that multiple tokens are not combined into a single value.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 4: You are a Named Entity Recognition expert. Your task is to extract contiguous tokens that refer to specific people from the provided list of string tokens. If there are no specific individuals mentioned, return an empty list. Ensure that you do not combine multiple tokens into a single value; output them as a list.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 5: Given the tokenized text, extract contiguous tokens that refer to specific individuals. If there are no references to identifiable people, indicate that no people have been extracted. Provide a rationale for your reasoning process along with the list of extracted names.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 6: In a critical situation where accurate identification of EU officials is essential for compliance with new regulations, extract contiguous tokens from the provided list of string tokens that specifically refer to individuals. Ensure that your output is a list of distinct tokens without combining them into single values. This task is vital for ensuring clear communication in health communications regarding livestock, particularly in the context of sheep and mad cow disease.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 7: In a high-stakes situation where accurate identification of individuals is critical for regulatory compliance and public health communication, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that you output each identified individual as separate tokens without combining multiple tokens into a single value. This task is essential for ensuring clarity and accountability in communications pertaining to EU regulations and health matters.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 8: Given a list of tokenized text, identify and extract any contiguous sequences of tokens that refer specifically to individuals. 
Ensure that the output is a list of tokens representing those names, and do not merge multiple tokens into a single value. Provide reasoning for your extraction process, clearly stating if specific individuals were found or if the tokens did not contain any references to people.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 9: In a high-stakes scenario where accurate identification of EU officials is critical for public health communications regarding livestock diseases, extract contiguous tokens that refer to specific people from the provided list of string tokens. Ensure that the output is a list of tokens, without combining multiple tokens into a single value. Provide a clear rationale explaining the reasoning behind the identification of these tokens as referring to specific individuals.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 10: Identify and extract contiguous tokens from the provided list that specifically refer to individuals. Ensure that the output consists of distinct tokens representing the names, without merging them into single values.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 11: In a critical situation where accurate identification of key individuals is essential for effective communication regarding EU regulations and health communications, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that the output is a list of tokens without combining them into a single value. This task is crucial for clarity in reporting and decision-making processes.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 12: In a critical situation where accurate identification of key individuals is essential for public health communications regarding EU regulations on livestock, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that the output is a list of individual tokens, maintaining their separation to facilitate precise recognition of each person mentioned in the context.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 13: In a high-stakes situation where accurate identification of individuals is critical for regulatory compliance and public health communication, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that you output each identified individual as separate tokens without combining multiple tokens into a single value. This task is essential for ensuring clarity and accountability in communications pertaining to EU regulations and health matters.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 14: You are a Named Entity Recognition expert. Your task is to extract contiguous tokens referring to specific people from a list of string tokens. Please ensure that you output a list of tokens without combining them into a single value. Provide a rationale for your extraction, explaining why the identified tokens refer to a specific person.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 15: Given a list of tokenized words, identify and extract contiguous tokens that refer to specific individuals. 
Provide a rationale explaining the reasoning behind the extraction process, and output a list of the identified tokens without combining them into single values.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 16: You are an AI text analyzer. Your task is to extract contiguous tokens that refer to specific individuals from a list of string tokens. Carefully examine the tokens and output a list of those that represent people. If no tokens refer to individuals, return an empty list. Remember to provide a rationale explaining your extraction process.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 17: In a critical situation where EU regulations regarding livestock health are being discussed, it is essential to accurately identify and extract the names of officials involved in these discussions. Given a list of tokenized text, extract contiguous tokens that refer to specific individuals. Ensure that each name is output as separate tokens, as combining them could lead to confusion. This information is vital for understanding the key players in the regulatory landscape and their statements on issues like mad cow disease and sheep health.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: 18: In a high-stakes situation where accurate identification of key individuals is crucial for regulatory compliance and public health communication, extract contiguous tokens referring to specific people from the provided list of string tokens. Ensure that your output is a list of tokens representing individuals, without combining multiple tokens into a single value. This extraction is vital for understanding the roles and actions of officials in EU regulations related to livestock health.\n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: \n", - "\n", - "2024/11/18 21:10:06 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average Metric: 34.00 / 40 (85.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:10<00:00, 3.69it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:10:16 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 85.0\n", - "\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n", - "\n", - "/Users/corey.zumar/miniconda3/envs/default/lib/python3.10/site-packages/optuna/samplers/_tpe/sampler.py:319: 
ExperimentalWarning: ``multivariate`` option is an experimental feature. The interface can change in the future.\n", - " warnings.warn(\n", - "2024/11/18 21:10:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 1 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 34.00 / 40 (85.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:17<00:00, 2.31it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:10:34 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 12', 'Predictor 0: Few-Shot Set 7'].\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0]\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 85.0\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: ========================\n", - "\n", - "\n", - "2024/11/18 21:10:34 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 36.00 / 40 (90.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:09<00:00, 4.16it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:10:43 INFO dspy.evaluate.evaluate: Average Metric: 36 / 40 (90.0%)\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: \u001b[92mBest full score so far!\u001b[0m Score: 90.0\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 90.0 with parameters ['Predictor 0: Instruction 10', 'Predictor 0: Few-Shot Set 7'].\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0, 90.0]\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 90.0\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: ========================\n", - "\n", - "\n", - "2024/11/18 21:10:43 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 39.00 / 40 (97.5%): 
100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:10<00:00, 3.68it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n...\n", - "...\n", - "...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:14:37 INFO dspy.evaluate.evaluate: Average Metric: 34 / 40 (85.0%)\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 85.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 0'].\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0, 90.0, 97.5, 95.0, 97.5, 82.5, 92.5, 85.0, 77.5, 85.0, 97.5, 97.5, 97.5, 95.0, 95.0, 97.5, 85.0, 90.0, 97.5, 92.5, 95.0, 95.0, 95.0, 85.0]\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: =========================\n", - "\n", - "\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 25 =====\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Average Metric: 39.00 / 40 (97.5%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 40/40 [00:00<00:00, 2609.25it/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/11/18 21:14:37 INFO dspy.evaluate.evaluate: Average Metric: 39 / 40 (97.5%)\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 97.5 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 18'].\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [85.0, 85.0, 90.0, 97.5, 95.0, 97.5, 82.5, 92.5, 85.0, 77.5, 85.0, 97.5, 97.5, 97.5, 95.0, 95.0, 97.5, 85.0, 90.0, 97.5, 92.5, 95.0, 95.0, 95.0, 85.0, 97.5]\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 97.5\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: =========================\n", - "\n", - "\n", - "2024/11/18 21:14:37 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 97.5!\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], + "outputs": [], "source": [ "mipro_optimizer = dspy.MIPROv2(\n", " metric=extraction_correctness_metric,\n", 
@@ -751,7 +508,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Average Metric: 186.00 / 200 (93.0%): 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 200/200 [00:23<00:00, 8.58it/s]" + "Average Metric: 186.00 / 200 (93.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:23<00:00, 8.58it/s]" ] }, { @@ -811,7 +568,7 @@ " [Nadim, Ladki]\n", " The tokens \"Nadim Ladki\" refer to a specific individual. Both toke...\n", " [Nadim, Ladki]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 2\n", @@ -819,7 +576,7 @@ " []\n", " There are no tokens referring to specific people in the provided l...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 3\n", @@ -827,7 +584,7 @@ " []\n", " There are no specific people mentioned in the provided tokens. The...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 4\n", @@ -835,7 +592,7 @@ " []\n", " There are no tokens referring to specific people in the provided l...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " ...\n", @@ -851,7 +608,7 @@ " [David, Campese]\n", " The extracted tokens refer to a specific person mentioned in the t...\n", " [David, Campese]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 196\n", @@ -859,7 +616,7 @@ " []\n", " There are no specific individuals mentioned in the provided tokens...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 197\n", @@ -867,7 +624,7 @@ " [Campese, Rob, Andrew]\n", " The tokens include the names \"Campese\" and \"Rob Andrew,\" both of w...\n", " [Campese, Rob, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 198\n", @@ -875,7 +632,7 @@ " [Campo, Andrew]\n", " The extracted tokens refer to specific people mentioned in the tex...\n", " [Campo, Andrew]\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", " 199\n", @@ -883,11 +640,11 @@ " []\n", " There are no specific people mentioned in the provided tokens. The...\n", " []\n", - " \u2714\ufe0f [True]\n", + " ✔️ [True]\n", " \n", " \n", "\n", - "

200 rows \u00d7 5 columns\n", + "200 rows × 5 columns
\n", "" ], "text/plain": [ @@ -932,16 +689,16 @@ "\n", " extracted_people extraction_correctness_metric \n", "0 [] \n", - "1 [Nadim, Ladki] \u2714\ufe0f [True] \n", - "2 [] \u2714\ufe0f [True] \n", - "3 [] \u2714\ufe0f [True] \n", - "4 [] \u2714\ufe0f [True] \n", + "1 [Nadim, Ladki] ✔️ [True] \n", + "2 [] ✔️ [True] \n", + "3 [] ✔️ [True] \n", + "4 [] ✔️ [True] \n", ".. ... ... \n", - "195 [David, Campese] \u2714\ufe0f [True] \n", - "196 [] \u2714\ufe0f [True] \n", - "197 [Campese, Rob, Andrew] \u2714\ufe0f [True] \n", - "198 [Campo, Andrew] \u2714\ufe0f [True] \n", - "199 [] \u2714\ufe0f [True] \n", + "195 [David, Campese] ✔️ [True] \n", + "196 [] ✔️ [True] \n", + "197 [Campese, Rob, Andrew] ✔️ [True] \n", + "198 [Campo, Andrew] ✔️ [True] \n", + "199 [] ✔️ [True] \n", "\n", "[200 rows x 5 columns]" ] diff --git a/docs/docs/tutorials/rag/index.ipynb b/docs/docs/tutorials/rag/index.ipynb index 59f3b07d3..45ce46b7c 100644 --- a/docs/docs/tutorials/rag/index.ipynb +++ b/docs/docs/tutorials/rag/index.ipynb @@ -55,11 +55,11 @@ "text": [ "In Linux, \"high memory\" and \"low memory\" refer to different regions of the system's memory address space, particularly in the context of 32-bit architectures.\n", "\n", - "- **Low Memory**: This typically refers to the first 896 MB of memory in a 32-bit system. The kernel can directly access this memory without any special handling. It is used for kernel data structures and for user processes. The low memory region is crucial for the kernel's operation, as it allows for efficient memory management and access.\n", + "- **Low Memory**: This typically refers to the memory that is directly accessible by the kernel. In a 32-bit system, this is usually the first 896 MB of RAM (from 0 to 896 MB). The kernel can directly map this memory, making it faster for the kernel to access and manage. Low memory is used for kernel data structures and for user processes that require direct access to memory.\n", "\n", - "- **High Memory**: This refers to memory above the 896 MB threshold in a 32-bit system. The kernel cannot directly access this memory; instead, it must use special mechanisms to map it into the kernel's address space when needed. High memory is often used for user processes and can be allocated dynamically, but it requires additional overhead for the kernel to manage.\n", + "- **High Memory**: This refers to the memory above the low memory limit, which is not directly accessible by the kernel in a 32-bit system. This area is typically above 896 MB. The kernel cannot directly access this memory without using special mechanisms, such as mapping it into the kernel's address space when needed. High memory is used for user processes that require more memory than what is available in low memory.\n", "\n", - "In 64-bit systems, the distinction between high and low memory is less relevant, as the addressable memory space is significantly larger, and the kernel can access most of the memory directly.\n" + "In summary, low memory is directly accessible by the kernel, while high memory requires additional steps for the kernel to access it, especially in 32-bit systems. 
In 64-bit systems, this distinction is less significant as the kernel can address a much larger memory space directly.\n" ] } ], @@ -94,7 +94,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-10T12:39:19.458514]\u001b[0m\n", + "\u001b[34m[2024-11-23T23:16:35.966534]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -131,11 +131,11 @@ "\u001b[32m[[ ## response ## ]]\n", "In Linux, \"high memory\" and \"low memory\" refer to different regions of the system's memory address space, particularly in the context of 32-bit architectures.\n", "\n", - "- **Low Memory**: This typically refers to the first 896 MB of memory in a 32-bit system. The kernel can directly access this memory without any special handling. It is used for kernel data structures and for user processes. The low memory region is crucial for the kernel's operation, as it allows for efficient memory management and access.\n", + "- **Low Memory**: This typically refers to the memory that is directly accessible by the kernel. In a 32-bit system, this is usually the first 896 MB of RAM (from 0 to 896 MB). The kernel can directly map this memory, making it faster for the kernel to access and manage. Low memory is used for kernel data structures and for user processes that require direct access to memory.\n", "\n", - "- **High Memory**: This refers to memory above the 896 MB threshold in a 32-bit system. The kernel cannot directly access this memory; instead, it must use special mechanisms to map it into the kernel's address space when needed. High memory is often used for user processes and can be allocated dynamically, but it requires additional overhead for the kernel to manage.\n", + "- **High Memory**: This refers to the memory above the low memory limit, which is not directly accessible by the kernel in a 32-bit system. This area is typically above 896 MB. The kernel cannot directly access this memory without using special mechanisms, such as mapping it into the kernel's address space when needed. High memory is used for user processes that require more memory than what is available in low memory.\n", "\n", - "In 64-bit systems, the distinction between high and low memory is less relevant, as the addressable memory space is significantly larger, and the kernel can access most of the memory directly.\n", + "In summary, low memory is directly accessible by the kernel, while high memory requires additional steps for the kernel to access it, especially in 32-bit systems. In 64-bit systems, this distinction is less significant as the kernel can address a much larger memory space directly.\n", "\n", "[[ ## completed ## ]]\u001b[0m\n", "\n", @@ -170,8 +170,8 @@ "data": { "text/plain": [ "Prediction(\n", - " reasoning=\"The placement of curly braces on their own line is largely a matter of coding style and conventions. In some programming languages and style guides, such as those used in C, C++, and Java, it is common to place opening curly braces on the same line as the control statement (like `if`, `for`, etc.) and closing braces on a new line. However, other styles, such as the Allman style, advocate for placing both opening and closing braces on their own lines. Ultimately, the decision should be based on the team's coding standards or personal preference, as long as it maintains readability and consistency.\",\n", - " response=\"Curly braces can either appear on their own line or not, depending on the coding style you choose to follow. 
It's important to adhere to a consistent style throughout your codebase.\"\n", + " reasoning='The placement of curly braces on their own line depends on the coding style and conventions being followed. In some programming languages and style guides, such as the Allman style, curly braces are placed on their own line to enhance readability. In contrast, other styles, like K&R style, place the opening brace on the same line as the control statement. Ultimately, it is a matter of personal or team preference, and consistency within a project is key.',\n", + " response='Curly braces can appear on their own line depending on the coding style you are following. If you prefer a style that enhances readability, such as the Allman style, then yes, they should be on their own line. However, if you are following a different style, like K&R, they may not need to be. Consistency is important, so choose a style and stick with it.'\n", ")" ] }, @@ -191,7 +191,7 @@ "source": [ "\n", "\n", - "Interestingly, asking for reasoning made the output `response` shorter in this case. Is this a good thing or a bad thing? It depends on what you need: there's no free lunch, but DSPy gives you the tools to experiment with different strategies extremely quickly.\n", + "Interestingly, asking for reasoning can make the output `response` shorter in this case. Is this a good thing or a bad thing? It depends on what you need: there's no free lunch, but DSPy gives you the tools to experiment with different strategies extremely quickly.\n", "\n", "By the way, `dspy.ChainOfThought` is implemented in DSPy, using `dspy.Predict`. This is a good place to `dspy.inspect_history` if you're curious.\n" ] @@ -220,25 +220,14 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", "import ujson\n", - "import requests\n", + "from dspy.utils import download\n", "\n", - "def download(url):\n", - " filename = os.path.basename(url)\n", - " remote_size = int(requests.head(url, allow_redirects=True).headers.get('Content-Length', 0))\n", - " local_size = os.path.getsize(filename) if os.path.exists(filename) else 0\n", + "# Download question--answer pairs from the RAG-QA Arena \"Tech\" dataset.\n", + "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_examples.jsonl\")\n", "\n", - " if local_size != remote_size:\n", - " print(f\"Downloading '{filename}'...\")\n", - " with requests.get(url, stream=True) as r, open(filename, 'wb') as f:\n", - " for chunk in r.iter_content(chunk_size=8192): f.write(chunk)\n", - "\n", - "# Download 500 question--answer pairs from the RAG-QA Arena \"Tech\" dataset.\n", - "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_500.json\")\n", - "\n", - "with open('ragqa_arena_tech_500.json') as f:\n", - " data = ujson.load(f)" + "with open(\"ragqa_arena_tech_examples.jsonl\") as f:\n", + " data = [ujson.loads(line) for line in f]" ] }, { @@ -249,8 +238,9 @@ { "data": { "text/plain": [ - "{'question': 'how to transfer whatsapp voice message to computer?',\n", - " 'response': 'To transfer voice notes from WhatsApp on your device to your computer, you have the option to select the \"Share\" feature within the app and send the files via Email, Gmail, Bluetooth, or other available services. \\nYou can also move the files onto your phone\\'s SD card, connect your phone to your computer via a USB cable, then find and transfer the files via File Explorer on your PC. 
\\nAlternatively, you can choose to attach all the desired voice notes to an email and, from your phone, send them to your own email address. \\nUpon receiving the email on your computer, you can then download the voice note attachments.'}" + "{'question': 'why igp is used in mpls?',\n", + " 'response': \"An IGP exchanges routing prefixes between gateways/routers. \\nWithout a routing protocol, you'd have to configure each route on every router and you'd have no dynamic updates when routes change because of link failures. \\nFuthermore, within an MPLS network, an IGP is vital for advertising the internal topology and ensuring connectivity for MP-BGP inside the network.\",\n", + " 'gold_doc_ids': [2822, 2823]}" ] }, "execution_count": 6, @@ -282,7 +272,7 @@ { "data": { "text/plain": [ - "Example({'question': 'what are high memory and low memory on linux?', 'response': '\"High Memory\" refers to the application or user space, the memory that user programs can use and which isn\\'t permanently mapped in the kernel\\'s space, while \"Low Memory\" is the kernel\\'s space, which the kernel can address directly and is permanently mapped. \\nThe user cannot access the Low Memory as it is set aside for the required kernel programs.'}) (input_keys={'question'})" + "Example({'question': 'why are my text messages coming up as maybe?', 'response': 'This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \\n\\nHowever, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.', 'gold_doc_ids': [3956, 3957, 8034]}) (input_keys={'question'})" ] }, "execution_count": 7, @@ -305,11 +295,12 @@ "\n", "Now, let's divide the data into:\n", "\n", - "- Training and Validation sets:\n", + "- Training (and with it Validation) set:\n", " - These are the splits you typically give to DSPy optimizers.\n", " - Optimizers typically learn directly from the training examples and check their progress using the validation examples.\n", " - It's good to have 30--300 examples for training and validation each.\n", " - For prompt optimizers in particular, it's often better to pass _more_ validation than training.\n", + " - Below, we'll use 200 in total. MIPROv2 will split them into 20% training and 80% validation if you don't pass a valset.\n", "\n", "- Development and Test sets: The rest, typically on the order of 30--1000, can be used for:\n", " - development (i.e., you can inspect them as you iterate on your system) and\n", @@ -324,7 +315,7 @@ { "data": { "text/plain": [ - "(50, 100, 150, 200)" + "(200, 300, 500)" ] }, "execution_count": 8, @@ -333,9 +324,12 @@ } ], "source": [ - "trainset, valset, devset, testset = data[:50], data[50:150], data[150:300], data[300:500]\n", + "import random\n", + "\n", + "random.Random(0).shuffle(data)\n", + "trainset, devset, testset = data[:200], data[200:500], data[500:1000]\n", "\n", - "len(trainset), len(valset), len(devset), len(testset)" + "len(trainset), len(devset), len(testset)" ] }, { @@ -346,8 +340,7 @@ "\n", "What kind of metric can suit our question-answering task? There are many choices, but since the answers are long, we may ask: How well does the system response _cover_ all key facts in the gold response? 
And the other way around, how well is the system response _not saying things_ that aren't in the gold response?\n", "\n", - "That metric is essentially a **semantic F1**, so let's load a `SemanticF1` metric from DSPy. This metric is actually implemented as a [very simple DSPy module](https://github.com/stanfordnlp/dspy/blob/77c2e1cceba427c7f91edb2ed5653276fb0c6de7/dspy/evaluate/auto_evaluation.py#L21) using whatever LM we're working with.\n", - "\n" + "That metric is essentially a **semantic F1**, so let's load a `SemanticF1` metric from DSPy. This metric is actually implemented as a [very simple DSPy module](https://github.com/stanfordnlp/dspy/blob/main/dspy/evaluate/auto_evaluation.py#L21) using whatever LM we're working with." ] }, { @@ -359,14 +352,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Question: \t what are high memory and low memory on linux?\n", + "Question: \t why are my text messages coming up as maybe?\n", + "\n", + "Gold Response: \t This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \n", "\n", - "Gold Response: \t \"High Memory\" refers to the application or user space, the memory that user programs can use and which isn't permanently mapped in the kernel's space, while \"Low Memory\" is the kernel's space, which the kernel can address directly and is permanently mapped. \n", - "The user cannot access the Low Memory as it is set aside for the required kernel programs.\n", + "However, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.\n", "\n", - "Predicted Response: \t In Linux, \"low memory\" refers to the first 896 MB of RAM, which is directly accessible by the kernel and used for kernel operations and user processes. \"High memory\" refers to memory above this limit, which is not directly accessible by the kernel in 32-bit systems and is used for user processes, requiring special handling to access. This distinction is crucial for effective memory management in Linux.\n", + "Predicted Response: \t Your text messages are showing up as \"maybe\" because your messaging app is uncertain about the sender's identity. This typically occurs when the sender's number is not saved in your contacts or if the message is from an unknown number. To resolve this, you can save the contact in your address book or check the message settings in your app.\n", "\n", - "Semantic F1 Score: 0.87\n" + "Semantic F1 Score: 0.33\n" ] } ], @@ -374,7 +368,7 @@ "from dspy.evaluate import SemanticF1\n", "\n", "# Instantiate the metric.\n", - "metric = SemanticF1()\n", + "metric = SemanticF1(decompositional=True)\n", "\n", "# Produce a prediction from our `cot` module, using the `example` above as input.\n", "pred = cot(**example.inputs())\n", @@ -410,7 +404,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-10T12:39:19.701005]\u001b[0m\n", + "\u001b[34m[2024-11-23T23:16:36.149518]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -421,8 +415,11 @@ "\n", "Your output fields are:\n", "1. `reasoning` (str)\n", - "2. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n", - "3. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n", + "2. `ground_truth_key_ideas` (str): enumeration of key ideas in the ground truth\n", + "3. 
`system_response_key_ideas` (str): enumeration of key ideas in the system response\n", + "4. `discussion` (str): discussion of the overlap between ground truth and system response\n", + "5. `recall` (float): fraction (out of 1.0) of ground truth covered by the system response\n", + "6. `precision` (float): fraction (out of 1.0) of system response covered by the ground truth\n", "\n", "All interactions will be structured in the following way, with the appropriate values filled in.\n", "\n", @@ -438,6 +435,15 @@ "[[ ## reasoning ## ]]\n", "{reasoning}\n", "\n", + "[[ ## ground_truth_key_ideas ## ]]\n", + "{ground_truth_key_ideas}\n", + "\n", + "[[ ## system_response_key_ideas ## ]]\n", + "{system_response_key_ideas}\n", + "\n", + "[[ ## discussion ## ]]\n", + "{discussion}\n", + "\n", "[[ ## recall ## ]]\n", "{recall} # note: the value you produce must be a single float value\n", "\n", @@ -447,35 +453,50 @@ "[[ ## completed ## ]]\n", "\n", "In adhering to this structure, your objective is: \n", - " Compare a system's response to the ground truth to compute its recall and precision.\n", - " If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.\n", + " Compare a system's response to the ground truth to compute recall and precision of key ideas.\n", + " You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.\n", "\n", "\n", "\u001b[31mUser message:\u001b[0m\n", "\n", "[[ ## question ## ]]\n", - "what are high memory and low memory on linux?\n", + "why are my text messages coming up as maybe?\n", "\n", "[[ ## ground_truth ## ]]\n", - "\"High Memory\" refers to the application or user space, the memory that user programs can use and which isn't permanently mapped in the kernel's space, while \"Low Memory\" is the kernel's space, which the kernel can address directly and is permanently mapped. \n", - "The user cannot access the Low Memory as it is set aside for the required kernel programs.\n", + "This is part of the Proactivity features new with iOS 9: It looks at info in emails to see if anyone with this number sent you an email and if it finds the phone number associated with a contact from your email, it will show you \"Maybe\". \n", + "\n", + "However, it has been suggested there is a bug in iOS 11.2 that can result in \"Maybe\" being displayed even when \"Find Contacts in Other Apps\" is disabled.\n", "\n", "[[ ## system_response ## ]]\n", - "In Linux, \"low memory\" refers to the first 896 MB of RAM, which is directly accessible by the kernel and used for kernel operations and user processes. \"High memory\" refers to memory above this limit, which is not directly accessible by the kernel in 32-bit systems and is used for user processes, requiring special handling to access. This distinction is crucial for effective memory management in Linux.\n", + "Your text messages are showing up as \"maybe\" because your messaging app is uncertain about the sender's identity. This typically occurs when the sender's number is not saved in your contacts or if the message is from an unknown number. 
To resolve this, you can save the contact in your address book or check the message settings in your app.\n", "\n", - "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n", + "Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## ground_truth_key_ideas ## ]]`, then `[[ ## system_response_key_ideas ## ]]`, then `[[ ## discussion ## ]]`, then `[[ ## recall ## ]]` (must be formatted as a valid Python float), then `[[ ## precision ## ]]` (must be formatted as a valid Python float), and then ending with the marker for `[[ ## completed ## ]]`.\n", "\n", "\n", "\u001b[31mResponse:\u001b[0m\n", "\n", "\u001b[32m[[ ## reasoning ## ]]\n", - "The system response correctly identifies \"Low Memory\" as the portion of RAM directly accessible by the kernel and used for kernel operations, which aligns with the ground truth. It also mentions that \"High Memory\" refers to memory above a certain limit that is not directly accessible by the kernel, which is consistent with the ground truth's description of user space. However, the ground truth emphasizes that Low Memory is set aside for kernel programs and that users cannot access it, which is not explicitly stated in the system response. Overall, the key ideas are present, but the system response lacks the explicit mention of user access limitations for Low Memory.\n", + "The system response provides a general explanation for why text messages may show up as \"maybe,\" focusing on the uncertainty of the sender's identity. However, it does not mention the specific iOS features or the potential bug in iOS 11.2 that are highlighted in the ground truth. The ground truth explains that this feature is part of iOS 9's Proactivity features and mentions a specific bug in iOS 11.2, which is not addressed in the system response.\n", + "\n", + "[[ ## ground_truth_key_ideas ## ]]\n", + "1. Proactivity features introduced in iOS 9.\n", + "2. The system checks emails to associate phone numbers with contacts.\n", + "3. \"Maybe\" is displayed if the number is not saved in contacts.\n", + "4. Mention of a bug in iOS 11.2 causing \"Maybe\" to appear incorrectly.\n", + "\n", + "[[ ## system_response_key_ideas ## ]]\n", + "1. Text messages show up as \"maybe\" due to uncertainty about the sender's identity.\n", + "2. Occurs when the sender's number is not saved in contacts or is from an unknown number.\n", + "3. Suggests saving the contact or checking message settings.\n", + "\n", + "[[ ## discussion ## ]]\n", + "There is some overlap between the ground truth and the system response regarding the uncertainty of the sender's identity and the suggestion to save the contact. However, the system response lacks specific details about the iOS features and the bug mentioned in the ground truth. 
The ground truth provides a more comprehensive explanation of the \"maybe\" feature, while the system response is more general and does not address the iOS version specifics.\n", "\n", "[[ ## recall ## ]]\n", - "0.85\n", + "0.25\n", "\n", "[[ ## precision ## ]]\n", - "0.90\n", + "0.5\n", "\n", "[[ ## completed ## ]]\u001b[0m\n", "\n", @@ -502,12 +523,25 @@ "execution_count": 11, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 125.68 / 300 (41.9%): 100%|██████████| 300/300 [00:00<00:00, 666.96it/s]" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Average Metric: 55.380830691218016 / 150 (36.9): 100%|██████████| 150/150 [00:00<00:00, 513.51it/s]\n", - "2024/11/10 12:39:20 INFO dspy.evaluate.evaluate: Average Metric: 55.380830691218016 / 150 (36.9%)\n" + "2024/11/23 23:16:36 INFO dspy.evaluate.evaluate: Average Metric: 125.68228336477591 / 300 (41.9%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" ] }, { @@ -533,6 +567,7 @@ " \n", " question\n", " example_response\n", + " gold_doc_ids\n", " reasoning\n", " pred_response\n", " SemanticF1\n", @@ -541,44 +576,50 @@ " \n", " \n", " 0\n", - " why is mercurial considered to be easier than git?\n", - " Mercurial's syntax is considered more familiar, especially for tho...\n", - " Mercurial is often considered easier than Git for several reasons....\n", - " Mercurial is considered easier than Git primarily due to its simpl...\n", - " ✔️ [0.545]\n", + " when to use c over c++, and c++ over c?\n", + " If you are equally familiar with both C++ and C, it's advisable to...\n", + " [733]\n", + " C and C++ are both powerful programming languages, but they serve ...\n", + " Use C when you need low-level access to memory, require high perfo...\n", + " \n", " \n", " \n", " 1\n", - " open finder window from current terminal location?\n", - " If you type 'open .' in Terminal, it will open the current directo...\n", - " To open a Finder window from the current terminal location on a Ma...\n", - " You can open a Finder window from your current terminal location b...\n", - " ✔️ [0.667]\n", + " should images be stored in a git repository?\n", + " One viewpoint expresses that there is no significant downside, esp...\n", + " [6253, 6254, 6275, 6278, 8215]\n", + " Storing images in a Git repository can be beneficial for version c...\n", + " Images can be stored in a Git repository, but it's important to co...\n", + " ✔️ [0.444]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " question \\\n", - "0 why is mercurial considered to be easier than git? \n", - "1 open finder window from current terminal location? \n", + " question \\\n", + "0 when to use c over c++, and c++ over c? \n", + "1 should images be stored in a git repository? \n", "\n", " example_response \\\n", - "0 Mercurial's syntax is considered more familiar, especially for tho... \n", - "1 If you type 'open .' in Terminal, it will open the current directo... \n", + "0 If you are equally familiar with both C++ and C, it's advisable to... \n", + "1 One viewpoint expresses that there is no significant downside, esp... \n", + "\n", + " gold_doc_ids \\\n", + "0 [733] \n", + "1 [6253, 6254, 6275, 6278, 8215] \n", "\n", " reasoning \\\n", - "0 Mercurial is often considered easier than Git for several reasons.... \n", - "1 To open a Finder window from the current terminal location on a Ma... \n", + "0 C and C++ are both powerful programming languages, but they serve ... 
\n", + "1 Storing images in a Git repository can be beneficial for version c... \n", "\n", " pred_response \\\n", - "0 Mercurial is considered easier than Git primarily due to its simpl... \n", - "1 You can open a Finder window from your current terminal location b... \n", + "0 Use C when you need low-level access to memory, require high perfo... \n", + "1 Images can be stored in a Git repository, but it's important to co... \n", "\n", " SemanticF1 \n", - "0 ✔️ [0.545] \n", - "1 ✔️ [0.667] " + "0 \n", + "1 ✔️ [0.444] " ] }, "metadata": {}, @@ -594,7 +635,7 @@ " font-weight: bold;\n", " color: #555;\n", " margin: 10px 0;'>\n", - " ... 148 more rows not displayed ...\n", + " ... 298 more rows not displayed ...\n", " \n", " " ], @@ -608,7 +649,7 @@ { "data": { "text/plain": [ - "36.92" + "41.89" ] }, "execution_count": 11, @@ -640,7 +681,7 @@ "source": [ "## Basic Retrieval-Augmented Generation (RAG).\n", "\n", - "First, let's download the corpus data that we will use for RAG search. The next cell will seek to download 4 GBs, so it may take a few minutes. A future version of this notebook will come with a cache that allows you to skip downloads and the PyTorch installation." + "First, let's download the corpus data that we will use for RAG search. An older version of this tutorial used the full (650,000 document) corpus. To make this very fast and cheap to run, we've downsampled the corpus to just 28,000 documents." ] }, { @@ -649,8 +690,7 @@ "metadata": {}, "outputs": [], "source": [ - "download('https://huggingface.co/datasets/colbertv2/lotte_passages/resolve/main/technology/test_collection.jsonl')\n", - "download('https://huggingface.co/dspy/cache/resolve/main/index.pt')" + "download(\"https://huggingface.co/dspy/cache/resolve/main/ragqa_arena_tech_corpus.jsonl\")" ] }, { @@ -659,7 +699,9 @@ "source": [ "## Set up your system's retriever.\n", "\n", - "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Hence, for our RAG system, we can plug any tools for the search step. Here, we'll just use OpenAI Embeddings and PyTorch for top-K search, but this is not a special choice, just a convenient one." + "As far as DSPy is concerned, you can plug in any Python code for calling tools or retrievers. Here, we'll just use OpenAI Embeddings and do top-K search locally, just for convenience.\n", + "\n", + "**Note:** The step below will require that you either do `pip install -U faiss-cpu` or pass `brute_force_threshold=30_000` to `dspy.retrievers.Embeddings` to avoid faiss." 
] }, { @@ -668,22 +710,33 @@ "metadata": {}, "outputs": [], "source": [ - "import torch\n", - "import functools\n", - "from litellm import embedding as Embed\n", - "\n", - "with open(\"test_collection.jsonl\") as f:\n", - " corpus = [ujson.loads(line) for line in f]\n", - "\n", - "index = torch.load('index.pt', weights_only=True)\n", - "max_characters = 4000 # >98th percentile of document lengths\n", - "\n", - "@functools.lru_cache(maxsize=None)\n", - "def search(query, k=5):\n", - " query_embedding = torch.tensor(Embed(input=query, model=\"text-embedding-3-small\").data[0]['embedding'])\n", - " topk_scores, topk_indices = torch.matmul(index, query_embedding).topk(k)\n", - " topK = [dict(score=score.item(), **corpus[idx]) for idx, score in zip(topk_indices, topk_scores)]\n", - " return [doc['text'][:max_characters] for doc in topK]" + "# %pip install -U faiss-cpu # or faiss-gpu if you have a GPU" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 28436 documents. Will encode them below.\n", + "Training a 32-byte FAISS index with 337 partitions, based on 28436 x 512-dim embeddings\n" + ] + } + ], + "source": [ + "max_characters = 6000 # for truncating >99th percentile of documents\n", + "topk_docs_to_retrieve = 5 # number of documents to retrieve per search query\n", + "\n", + "with open(\"ragqa_arena_tech_corpus.jsonl\") as f:\n", + " corpus = [ujson.loads(line)['text'][:max_characters] for line in f]\n", + " print(f\"Loaded {len(corpus)} documents. Will encode them below.\")\n", + "\n", + "embedder = dspy.Embedder('openai/text-embedding-3-small', dimensions=512)\n", + "search = dspy.retrievers.Embeddings(embedder=embedder, corpus=corpus, k=topk_docs_to_retrieve)" ] }, { @@ -702,17 +755,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "class RAG(dspy.Module):\n", - " def __init__(self, num_docs=5):\n", - " self.num_docs = num_docs\n", + " def __init__(self):\n", " self.respond = dspy.ChainOfThought('context, question -> response')\n", "\n", " def forward(self, question):\n", - " context = search(question, k=self.num_docs)\n", + " context = search(question).passages\n", " return self.respond(context=context, question=question)" ] }, @@ -726,19 +778,19 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Prediction(\n", - " reasoning=\"High memory and low memory in Linux refer to the way the operating system organizes and manages memory for user-space applications and the kernel. Low memory is the portion of memory that is directly accessible by the kernel, while high memory is the part that is not directly mapped by the kernel's page tables. In a typical 32-bit architecture, low memory usually consists of the lower 3 GB of virtual memory, which is accessible to user-space applications, while high memory comprises the upper 1 GB, which is reserved for the kernel. The distinction is important for memory management, especially in systems with large amounts of RAM, as it affects how the kernel accesses and manages memory resources.\",\n", - " response=\"In Linux, high memory refers to the portion of memory that is not directly mapped by the kernel's page tables, meaning the kernel cannot access it directly without mapping it into its address space first. 
Low memory, on the other hand, is the segment of memory that the kernel can access directly. In a typical 32-bit system, low memory consists of the lower 3 GB of virtual memory, while high memory comprises the upper 1 GB. This organization helps manage memory more efficiently, especially in systems with large physical memory.\"\n", + " reasoning=\"High Memory and Low Memory in Linux refer to two segments of the kernel's memory space. Low Memory is the portion of memory that the kernel can access directly and is statically mapped at boot time. This area is typically used for kernel data structures and is always accessible to the kernel. High Memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel cannot access it directly without first mapping it into its address space. High Memory is used for user-space applications and temporary data buffers. The distinction allows for better memory management and security, as user-space applications cannot directly access kernel-space memory.\",\n", + " response=\"In Linux, High Memory refers to the segment of memory that is not permanently mapped in the kernel's address space, which means the kernel must map it temporarily to access it. This area is typically used for user-space applications and temporary data buffers. Low Memory, in contrast, is the portion of memory that the kernel can access directly and is statically mapped at boot time. It is used for kernel data structures and is always accessible to the kernel. This separation enhances security by preventing user-space applications from accessing kernel-space memory directly.\"\n", ")" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -750,7 +802,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -761,7 +813,7 @@ "\n", "\n", "\n", - "\u001b[34m[2024-11-10T12:39:22.802994]\u001b[0m\n", + "\u001b[34m[2024-11-23T23:16:49.175612]\u001b[0m\n", "\n", "\u001b[31mSystem message:\u001b[0m\n", "\n", @@ -797,10 +849,10 @@ "\n", "[[ ## context ## ]]\n", "[1] «As far as I remember, High Memory is used for application space and Low Memory for the kernel. Advantage is that (user-space) applications cant access kernel-space memory.»\n", - "[2] «For the people looking for an explanation in the context of Linux kernel memory space, beware that there are two conflicting definitions of the high/low memory split (unfortunately there is no standard, one has to interpret that in context): High memory defined as the totality of kernel space in VIRTUAL memory. This is a region that only the kernel can access and comprises all virtual addresses greater or equal than PAGE_OFFSET. Low memory refers therefore to the region of the remaining addresses, which correspond to the user-space memory accessible from each user process. For example: on 32-bit x86 with a default PAGE_OFFSET, this means that high memory is any address ADDR with ADDR ≥ 0xC0000000 = PAGE_OFFSET (i.e. higher 1 GB). This is the reason why in Linux 32-bit processes are typically limited to 3 GB. Note that PAGE_OFFSET cannot be configured directly, it depends on the configurable VMSPLIT_x options (source). To summarize: in 32-bit archs, virtual memory is by default split into lower 3 GB (user space) and higher 1 GB (kernel space). For 64 bit, PAGE_OFFSET is not configurable and depends on architectural details that are sometimes detected at runtime during kernel load. 
On x86_64, PAGE_OFFSET is 0xffff888000000000 for 4-level paging (typical) and 0xff11000000000000 for 5-level paging (source). For ARM64 this is usually 0x8000000000000000. Note though, if KASLR is enabled, this value is intentionally unpredictable. High memory defined as the portion of PHYSICAL memory that cannot be mapped contiguously with the rest of the kernel virtual memory. A portion of the kernel virtual address space can be mapped as a single contiguous chunk into the so-called physical low memory. To fully understand what this means, a deeper knowledge of the Linux virtual memory space is required. I would recommend going through these slides. From the slides: This kind of high/low memory split is only applicable to 32-bit architectures where the installed physical RAM size is relatively high (more than ~1 GB). Otherwise, i.e. when the physical address space is small (<1 GB) or when the virtual memory space is large (64 bits), the whole physical space can be accessed from the kernel virtual memory space. In that case, all physical memory is considered low memory. It is preferable that high memory does not exist at all because the whole physical space can be accessed directly from the kernel, which makes memory management a lot simpler and efficient. This is especially important when dealing with DMAs (which typically require physically contiguous memory). See also the answer by @gilles»\n", - "[3] «Low and High do not refer to whether there is a lot of usage or not. They represent the way it is organized by the system. According to Wikipedia: High Memory is the part of physical memory in a computer which is not directly mapped by the page tables of its operating system kernel. There is no duration for the free command which simply computes a snapshot of the information available. Most people, including programmers, do not need to understand it more clearly as it is managed in a much simpler form through system calls and compiler/interpreter operations.»\n", - "[4] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory»\n", - "[5] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x00000000-0xbfffffff: user process (3GB) 0xc0000000-0xffffffff: kernel space (1GB) (CPU-specific vector and whatsoever are ignored here). Linux splits the 1GB kernel space into 2 pieces, LOWMEM and HIGHMEM. The split varies from installation to installation. If an installation chooses, say, 512MB-512MB for LOW and HIGH mems, the 512MB LOWMEM (0xc0000000-0xdfffffff) is statically mapped at the kernel boot time; usually the first so many bytes of the physical memory is used for this so that virtual and physical addresses in this range have a constant offset of, say, 0xc0000000. 
On the other hand, the latter 512MB (HIGHMEM) has no static mapping (although you could leave pages semi-permanently mapped there, but you must do so explicitly in your driver code). Instead, pages are temporarily mapped and unmapped here so that virtual and physical addresses in this range have no consistent mapping. Typical uses of HIGHMEM include single-time data buffers.»\n", + "[2] «HIGHMEM is a range of kernels memory space, but it is NOT memory you access but its a place where you put what you want to access. A typical 32bit Linux virtual memory map is like: 0x00000000-0xbfffffff: user process (3GB) 0xc0000000-0xffffffff: kernel space (1GB) (CPU-specific vector and whatsoever are ignored here). Linux splits the 1GB kernel space into 2 pieces, LOWMEM and HIGHMEM. The split varies from installation to installation. If an installation chooses, say, 512MB-512MB for LOW and HIGH mems, the 512MB LOWMEM (0xc0000000-0xdfffffff) is statically mapped at the kernel boot time; usually the first so many bytes of the physical memory is used for this so that virtual and physical addresses in this range have a constant offset of, say, 0xc0000000. On the other hand, the latter 512MB (HIGHMEM) has no static mapping (although you could leave pages semi-permanently mapped there, but you must do so explicitly in your driver code). Instead, pages are temporarily mapped and unmapped here so that virtual and physical addresses in this range have no consistent mapping. Typical uses of HIGHMEM include single-time data buffers.»\n", + "[3] «This is relevant to the Linux kernel; Im not sure how any Unix kernel handles this. The High Memory is the segment of memory that user-space programs can address. It cannot touch Low Memory. Low Memory is the segment of memory that the Linux kernel can address directly. If the kernel must access High Memory, it has to map it into its own address space first. There was a patch introduced recently that lets you control where the segment is. The tradeoff is that you can take addressable memory away from user space so that the kernel can have more memory that it does not have to map before using. Additional resources: http://tldp.org/HOWTO/KernelAnalysis-HOWTO-7.html http://linux-mm.org/HighMemory»\n", + "[4] «The first reference to turn to is Linux Device Drivers (available both online and in book form), particularly chapter 15 which has a section on the topic. In an ideal world, every system component would be able to map all the memory it ever needs to access. And this is the case for processes on Linux and most operating systems: a 32-bit process can only access a little less than 2^32 bytes of virtual memory (in fact about 3GB on a typical Linux 32-bit architecture). It gets difficult for the kernel, which needs to be able to map the full memory of the process whose system call its executing, plus the whole physical memory, plus any other memory-mapped hardware device. So when a 32-bit kernel needs to map more than 4GB of memory, it must be compiled with high memory support. High memory is memory which is not permanently mapped in the kernels address space. (Low memory is the opposite: it is always mapped, so you can access it in the kernel simply by dereferencing a pointer.) When you access high memory from kernel code, you need to call kmap first, to obtain a pointer from a page data structure (struct page). Calling kmap works whether the page is in high or low memory. 
There is also kmap_atomic which has added constraints but is more efficient on multiprocessor machines because it uses finer-grained locking. The pointer obtained through kmap is a resource: it uses up address space. Once youve finished with it, you must call kunmap (or kunmap_atomic) to free that resource; then the pointer is no longer valid, and the contents of the page cant be accessed until you call kmap again.»\n", + "[5] «/proc/meminfo will tell you how free works, but /proc/kcore can tell you what the kernel uses. From the same page: /proc/kcore This file represents the physical memory of the system and is stored in the ELF core file format. With this pseudo-file, and an unstripped kernel (/usr/src/linux/vmlinux) binary, GDB can be used to examine the current state of any kernel data structures. The total length of the file is the size of physical memory (RAM) plus 4KB. /proc/meminfo This file reports statistics about memory usage on the system. It is used by free(1) to report the amount of free and used memory (both physical and swap) on the system as well as the shared memory and buffers used by the kernel. Each line of the file consists of a parameter name, followed by a colon, the value of the parameter, and an option unit of measurement (e.g., kB). The list below describes the parameter names and the format specifier required to read the field value. Except as noted below, all of the fields have been present since at least Linux 2.6.0. Some fileds are displayed only if the kernel was configured with various options; those dependencies are noted in the list. MemTotal %lu Total usable RAM (i.e., physical RAM minus a few reserved bits and the kernel binary code). MemFree %lu The sum of LowFree+HighFree. Buffers %lu Relatively temporary storage for raw disk blocks that shouldnt get tremendously large (20MB or so). Cached %lu In-memory cache for files read from the disk (the page cache). Doesnt include SwapCached. SwapCached %lu Memory that once was swapped out, is swapped back in but still also is in the swap file. (If memory pressure is high, these pages dont need to be swapped out again because they are already in the swap file. This saves I/O.) Active %lu Memory that has been used more recently and usually not reclaimed unless absolutely necessary. Inactive %lu Memory which has been less recently used. It is more eligible to be reclaimed for other purposes. Active(anon) %lu (since Linux 2.6.28) [To be documented.] Inactive(anon) %lu (since Linux 2.6.28) [To be documented.] Active(file) %lu (since Linux 2.6.28) [To be documented.] Inactive(file) %lu (since Linux 2.6.28) [To be documented.] Unevictable %lu (since Linux 2.6.28) (From Linux 2.6.28 to 2.6.30, CONFIG_UNEVICTABLE_LRU was required.) [To be documented.] Mlocked %lu (since Linux 2.6.28) (From Linux 2.6.28 to 2.6.30, CONFIG_UNEVICTABLE_LRU was required.) [To be documented.] HighTotal %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Total amount of highmem. Highmem is all memory above ~860MB of physical memory. Highmem areas are for use by user-space programs, or for the page cache. The kernel must use tricks to access this memory, making it slower to access than lowmem. HighFree %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Amount of free highmem. LowTotal %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Total amount of lowmem. Lowmem is memory which can be used for everything that highmem can be used for, but it is also available for the kernels use for its own data structures. 
Among many other things, it is where everything from Slab is allocated. Bad things happen when youre out of lowmem. LowFree %lu (Starting with Linux 2.6.19, CONFIG_HIGHMEM is required.) Amount of free lowmem. MmapCopy %lu (since Linux 2.6.29) (CONFIG_MMU is required.) [To be documented.] SwapTotal %lu Total amount of swap space available. SwapFree %lu Amount of swap space that is currently unused. Dirty %lu Memory which is waiting to get written back to the disk. Writeback %lu Memory which is actively being written back to the disk. AnonPages %lu (since Linux 2.6.18) Non-file backed pages mapped into user-space page tables. Mapped %lu Files which have been mmaped, such as libraries. Shmem %lu (since Linux 2.6.32) [To be documented.] Slab %lu In-kernel data structures cache. SReclaimable %lu (since Linux 2.6.19) Part of Slab, that might be reclaimed, such as caches. SUnreclaim %lu (since Linux 2.6.19) Part of Slab, that cannot be reclaimed on memory pressure. KernelStack %lu (since Linux 2.6.32) Amount of memory allocated to kernel stacks. PageTables %lu (since Linux 2.6.18) Amount of memory dedicated to the lowest level of page tables. Quicklists %lu (since Linux 2.6.27) (CONFIG_QUICKLIST is required.) [To be documented.] NFS_Unstable %lu (since Linux 2.6.18) NFS pages sent to the server, but not yet committed to stable storage. Bounce %lu (since Linux 2.6.18) Memory used for block device bounce buffers. WritebackTmp %lu (since Linux 2.6.26) Memory used by FUSE for temporary writeback buffers. CommitLimit %lu (since Linux 2.6.10) Based on the overcommit ratio (vm.overcommit_ratio), this is the total amount of memory currently available to be allocated on the system. This limit is adhered to only if strict overcommit accounting is enabled (mode 2 in /proc/sys/vm/overcommit_ratio). The CommitLimit is calculated using the following formula: CommitLimit = ([total RAM pages] - [total huge TLB pages]) * overcommit_ratio / 100 + [total swap pages] For example, on a system with 1GB of physical RAM and 7GB of swap with a overcommit_ratio of 30, this formula yields a CommitLimit of 7.3GB. For more details, see the memory overcommit documentation in the kernel source file Documentation/vm/overcommit-accounting. Committed_AS %lu The amount of memory presently allocated on the system. The committed memory is a sum of all of the memory which has been allocated by processes, even if it has not been used by them as of yet. A process which allocates 1GB of memory (using malloc(3) or similar), but touches only 300MB of that memory will show up as using only 300MB of memory even if it has the address space allocated for the entire 1GB. This 1GB is memory which has been committed to by the VM and can be used at any time by the allocating application. With strict overcommit enabled on the system (mode 2 /proc/sys/vm/overcommit_memory), allocations w»\n", "\n", "[[ ## question ## ]]\n", "what are high memory and low memory on linux?\n", @@ -811,10 +863,10 @@ "\u001b[31mResponse:\u001b[0m\n", "\n", "\u001b[32m[[ ## reasoning ## ]]\n", - "High memory and low memory in Linux refer to the way the operating system organizes and manages memory for user-space applications and the kernel. Low memory is the portion of memory that is directly accessible by the kernel, while high memory is the part that is not directly mapped by the kernel's page tables. 
In a typical 32-bit architecture, low memory usually consists of the lower 3 GB of virtual memory, which is accessible to user-space applications, while high memory comprises the upper 1 GB, which is reserved for the kernel. The distinction is important for memory management, especially in systems with large amounts of RAM, as it affects how the kernel accesses and manages memory resources.\n", + "High Memory and Low Memory in Linux refer to two segments of the kernel's memory space. Low Memory is the portion of memory that the kernel can access directly and is statically mapped at boot time. This area is typically used for kernel data structures and is always accessible to the kernel. High Memory, on the other hand, is not permanently mapped in the kernel's address space, meaning that the kernel cannot access it directly without first mapping it into its address space. High Memory is used for user-space applications and temporary data buffers. The distinction allows for better memory management and security, as user-space applications cannot directly access kernel-space memory.\n", "\n", "[[ ## response ## ]]\n", - "In Linux, high memory refers to the portion of memory that is not directly mapped by the kernel's page tables, meaning the kernel cannot access it directly without mapping it into its address space first. Low memory, on the other hand, is the segment of memory that the kernel can access directly. In a typical 32-bit system, low memory consists of the lower 3 GB of virtual memory, while high memory comprises the upper 1 GB. This organization helps manage memory more efficiently, especially in systems with large physical memory.\n", + "In Linux, High Memory refers to the segment of memory that is not permanently mapped in the kernel's address space, which means the kernel must map it temporarily to access it. This area is typically used for user-space applications and temporary data buffers. Low Memory, in contrast, is the portion of memory that the kernel can access directly and is statically mapped at boot time. It is used for kernel data structures and is always accessible to the kernel. This separation enhances security by preventing user-space applications from accessing kernel-space memory directly.\n", "\n", "[[ ## completed ## ]]\u001b[0m\n", "\n", @@ -833,20 +885,33 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Earlier with a CoT module, we got nearly 37% in terms of semantic F1 on our `devset`. Would this `RAG` module score better?" + "Earlier with a CoT module, we got around 40% in terms of semantic F1 on our `devset`. Would this `RAG` module score better?" 
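Before running the full batch evaluation in the next cell, it can help to spot-check a single dev example by hand. The sketch below assumes the `devset` and the `SemanticF1`-based `metric` defined in earlier cells of this notebook, and that gold answers live in the `response` field (as the `example_response` column above suggests); a DSPy metric is simply called with the gold example and the prediction.

```python
# Quick sanity check on one example (assumes `devset` and `metric` from earlier cells).
rag = RAG()
example = devset[0]
pred = rag(question=example.question)

print(example.response)        # the gold reference answer
print(pred.response)           # the RAG module's answer
print(metric(example, pred))   # SemanticF1 score for this single prediction
```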
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 166.54 / 300 (55.5%): 100%|██████████| 300/300 [00:04<00:00, 61.40it/s] " + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Average Metric: 74.61311832900337 / 150 (49.7): 100%|██████████| 150/150 [00:05<00:00, 27.92it/s] \n", - "2024/11/10 12:39:28 INFO dspy.evaluate.evaluate: Average Metric: 74.61311832900337 / 150 (49.7%)\n" + "2024/11/23 23:16:54 INFO dspy.evaluate.evaluate: Average Metric: 166.53601368289284 / 300 (55.5%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" ] }, { @@ -872,6 +937,7 @@ " \n", " question\n", " example_response\n", + " gold_doc_ids\n", " reasoning\n", " pred_response\n", " SemanticF1\n", @@ -880,44 +946,50 @@ " \n", " \n", " 0\n", - " why is mercurial considered to be easier than git?\n", - " Mercurial's syntax is considered more familiar, especially for tho...\n", - " Mercurial is considered easier than Git for several reasons. First...\n", - " Mercurial is considered easier than Git because it has a more fami...\n", - " ✔️ [0.797]\n", + " when to use c over c++, and c++ over c?\n", + " If you are equally familiar with both C++ and C, it's advisable to...\n", + " [733]\n", + " C should be used over C++ primarily in scenarios where simplicity ...\n", + " Use C over C++ when working on embedded systems, requiring low-lev...\n", + " ✔️ [0.500]\n", " \n", " \n", " 1\n", - " open finder window from current terminal location?\n", - " If you type 'open .' in Terminal, it will open the current directo...\n", - " To open a Finder window from the current terminal location, you ca...\n", - " You can open a Finder window from your current terminal location b...\n", - " ✔️ [0.667]\n", + " should images be stored in a git repository?\n", + " One viewpoint expresses that there is no significant downside, esp...\n", + " [6253, 6254, 6275, 6278, 8215]\n", + " Storing images in a Git repository is generally not recommended du...\n", + " While it is technically possible to store images in a Git reposito...\n", + " ✔️ [0.444]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " question \\\n", - "0 why is mercurial considered to be easier than git? \n", - "1 open finder window from current terminal location? \n", + " question \\\n", + "0 when to use c over c++, and c++ over c? \n", + "1 should images be stored in a git repository? \n", "\n", " example_response \\\n", - "0 Mercurial's syntax is considered more familiar, especially for tho... \n", - "1 If you type 'open .' in Terminal, it will open the current directo... \n", + "0 If you are equally familiar with both C++ and C, it's advisable to... \n", + "1 One viewpoint expresses that there is no significant downside, esp... \n", + "\n", + " gold_doc_ids \\\n", + "0 [733] \n", + "1 [6253, 6254, 6275, 6278, 8215] \n", "\n", " reasoning \\\n", - "0 Mercurial is considered easier than Git for several reasons. First... \n", - "1 To open a Finder window from the current terminal location, you ca... \n", + "0 C should be used over C++ primarily in scenarios where simplicity ... \n", + "1 Storing images in a Git repository is generally not recommended du... \n", "\n", " pred_response \\\n", - "0 Mercurial is considered easier than Git because it has a more fami... \n", - "1 You can open a Finder window from your current terminal location b... 
\n", + "0 Use C over C++ when working on embedded systems, requiring low-lev... \n", + "1 While it is technically possible to store images in a Git reposito... \n", "\n", " SemanticF1 \n", - "0 ✔️ [0.797] \n", - "1 ✔️ [0.667] " + "0 ✔️ [0.500] \n", + "1 ✔️ [0.444] " ] }, "metadata": {}, @@ -933,7 +1005,7 @@ " font-weight: bold;\n", " color: #555;\n", " margin: 10px 0;'>\n", - " ... 148 more rows not displayed ...\n", + " ... 298 more rows not displayed ...\n", " \n", " " ], @@ -947,10 +1019,10 @@ { "data": { "text/plain": [ - "49.74" + "55.51" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -965,7 +1037,7 @@ "source": [ "## Using a DSPy Optimizer to improve your RAG prompt.\n", "\n", - "Off the shelf, our `RAG` module scores nearly 50%. What are our options to make it stronger? One of the various choices DSPy offers is optimizing the prompts in our pipeline.\n", + "Off the shelf, our `RAG` module scores 55%. What are our options to make it stronger? One of the various choices DSPy offers is optimizing the prompts in our pipeline.\n", "\n", "If there are many sub-modules in your program, all of them will be optimized together. In this case, there's only one: `self.respond = dspy.ChainOfThought('context, question -> response')`\n", "\n", @@ -980,7 +1052,7 @@ "source": [ "tp = dspy.MIPROv2(metric=metric, auto=\"medium\", num_threads=24) # use fewer threads if your rate limit is small\n", "\n", - "optimized_rag = tp.compile(RAG(), trainset=trainset, valset=valset,\n", + "optimized_rag = tp.compile(RAG(), trainset=trainset,\n", " max_bootstrapped_demos=2, max_labeled_demos=2,\n", " requires_permission_to_run=False)" ] @@ -996,14 +1068,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "You are correct; cmd+Tab does not work on hidden or minimized windows in macOS. It is designed to switch between applications and will only show non-minimized windows of the active application. To access minimized windows, you need to click on them directly or use other shortcuts.\n" + "You are correct that cmd+tab does not work on hidden or minimized windows. To switch back to a minimized app, you must first switch to another application and let it take focus before returning to the minimized one.\n" ] } ], @@ -1014,31 +1086,14 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "In macOS, the Command+Tab shortcut is specifically designed to switch between applications, not individual windows. This means that if an application is minimized or hidden, it will not be activated using Command+Tab. Here are some alternative methods to manage minimized or hidden windows:\n", - "\n", - "1. **Click on the Minimized Window:**\n", - " - You can directly click on the minimized window in the Dock to restore it.\n", - "\n", - "2. **Use Command+M:**\n", - " - If you want to minimize the current window, you can use Command+M. To restore it, you will need to click on it in the Dock.\n", - "\n", - "3. **Use Mission Control:**\n", - " - You can activate Mission Control (F3 or Control+Up Arrow) to see all open windows and select the one you want to bring to the front.\n", - "\n", - "4. 
**Third-Party Applications:**\n", - " - Consider using third-party applications like HyperSwitch or Witch, which can provide enhanced window management features, including switching between windows of the same application.\n", - "\n", - "5. **Keyboard Shortcuts for Specific Applications:**\n", - " - Some applications may have their own shortcuts for managing windows. Check the preferences or documentation for the specific application you are using.\n", - "\n", - "By using these methods, you can effectively manage and restore minimized or hidden windows in macOS.\n" + "The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.\n" ] } ], @@ -1053,7 +1108,7 @@ "source": [ "You can use `dspy.inspect_history(n=2)` to view the RAG prompt [before optimization](https://gist.github.com/okhat/5d04648f2226e72e66e26a8cb1456ee4) and [after optimization](https://gist.github.com/okhat/79405b8889b4b07da577ee19f1a3479a).\n", "\n", - "Concretely, in of run of this notebook, the optimized prompt:\n", + "Concretely, in one of the runs of this notebook, the optimized prompt does the following (note that it may be different on a later rerun).\n", "\n", "1. Constructs the following instruction,\n", "```text\n", @@ -1067,15 +1122,28 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 183.32 / 300 (61.1%): 100%|██████████| 300/300 [00:02<00:00, 104.48it/s]" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "Average Metric: 89.78303512426604 / 150 (59.9): 100%|██████████| 150/150 [00:00<00:00, 424.18it/s]\n", - "2024/11/10 12:39:36 INFO dspy.evaluate.evaluate: Average Metric: 89.78303512426604 / 150 (59.9%)\n" + "2024/11/23 23:17:21 INFO dspy.evaluate.evaluate: Average Metric: 183.3194433591069 / 300 (61.1%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" ] }, { @@ -1101,6 +1169,7 @@ " \n", " question\n", " example_response\n", + " gold_doc_ids\n", " reasoning\n", " pred_response\n", " SemanticF1\n", @@ -1109,44 +1178,50 @@ " \n", " \n", " 0\n", - " why is mercurial considered to be easier than git?\n", - " Mercurial's syntax is considered more familiar, especially for tho...\n", - " Mercurial is often considered easier than Git for several reasons,...\n", - " Mercurial is considered easier than Git for several key reasons: 1...\n", - " ✔️ [0.874]\n", + " when to use c over c++, and c++ over c?\n", + " If you are equally familiar with both C++ and C, it's advisable to...\n", + " [733]\n", + " The context provides insights into the strengths and weaknesses of...\n", + " You should consider using C over C++ in scenarios where simplicity...\n", + " ✔️ [0.333]\n", " \n", " \n", " 1\n", - " open finder window from current terminal location?\n", - " If you type 'open .' 
in Terminal, it will open the current directo...\n", - " To open a Finder window from the current terminal location in macO...\n", - " To open a Finder window from your current terminal location in mac...\n", - " ✔️ [0.600]\n", + " should images be stored in a git repository?\n", + " One viewpoint expresses that there is no significant downside, esp...\n", + " [6253, 6254, 6275, 6278, 8215]\n", + " The context discusses the challenges and considerations of storing...\n", + " Storing images in a Git repository is generally considered bad pra...\n", + " ✔️ [0.500]\n", " \n", " \n", "\n", "" ], "text/plain": [ - " question \\\n", - "0 why is mercurial considered to be easier than git? \n", - "1 open finder window from current terminal location? \n", + " question \\\n", + "0 when to use c over c++, and c++ over c? \n", + "1 should images be stored in a git repository? \n", "\n", " example_response \\\n", - "0 Mercurial's syntax is considered more familiar, especially for tho... \n", - "1 If you type 'open .' in Terminal, it will open the current directo... \n", + "0 If you are equally familiar with both C++ and C, it's advisable to... \n", + "1 One viewpoint expresses that there is no significant downside, esp... \n", + "\n", + " gold_doc_ids \\\n", + "0 [733] \n", + "1 [6253, 6254, 6275, 6278, 8215] \n", "\n", " reasoning \\\n", - "0 Mercurial is often considered easier than Git for several reasons,... \n", - "1 To open a Finder window from the current terminal location in macO... \n", + "0 The context provides insights into the strengths and weaknesses of... \n", + "1 The context discusses the challenges and considerations of storing... \n", "\n", " pred_response \\\n", - "0 Mercurial is considered easier than Git for several key reasons: 1... \n", - "1 To open a Finder window from your current terminal location in mac... \n", + "0 You should consider using C over C++ in scenarios where simplicity... \n", + "1 Storing images in a Git repository is generally considered bad pra... \n", "\n", " SemanticF1 \n", - "0 ✔️ [0.874] \n", - "1 ✔️ [0.600] " + "0 ✔️ [0.333] \n", + "1 ✔️ [0.500] " ] }, "metadata": {}, @@ -1162,7 +1237,7 @@ " font-weight: bold;\n", " color: #555;\n", " margin: 10px 0;'>\n", - " ... 148 more rows not displayed ...\n", + " ... 298 more rows not displayed ...\n", " \n", " " ], @@ -1176,10 +1251,10 @@ { "data": { "text/plain": [ - "59.86" + "61.11" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1199,7 +1274,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -1219,19 +1294,19 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Prediction(\n", - " reasoning='The behavior of the Command+Tab shortcut in macOS is designed to switch between applications rather than individual windows. This means that if an application is minimized or hidden, it will not be brought to the forefront using Command+Tab. Instead, the shortcut will only cycle through applications that are currently open and not minimized. To manage minimized windows, users may need to use different shortcuts or methods to restore them.',\n", - " response='In macOS, the Command+Tab shortcut is specifically designed to switch between applications, not individual windows. This means that if an application is minimized or hidden, it will not be activated using Command+Tab. 
Here are some alternative methods to manage minimized or hidden windows:\\n\\n1. **Click on the Minimized Window:**\\n - You can directly click on the minimized window in the Dock to restore it.\\n\\n2. **Use Command+M:**\\n - If you want to minimize the current window, you can use Command+M. To restore it, you will need to click on it in the Dock.\\n\\n3. **Use Mission Control:**\\n - You can activate Mission Control (F3 or Control+Up Arrow) to see all open windows and select the one you want to bring to the front.\\n\\n4. **Third-Party Applications:**\\n - Consider using third-party applications like HyperSwitch or Witch, which can provide enhanced window management features, including switching between windows of the same application.\\n\\n5. **Keyboard Shortcuts for Specific Applications:**\\n - Some applications may have their own shortcuts for managing windows. Check the preferences or documentation for the specific application you are using.\\n\\nBy using these methods, you can effectively manage and restore minimized or hidden windows in macOS.'\n", + " reasoning='The context explains how the Command + Tab shortcut functions on macOS, particularly in relation to switching between applications. It notes that this shortcut does not bring back minimized or hidden windows directly. Instead, it cycles through applications that are currently open and visible. The information also suggests alternative methods for managing minimized windows and provides insights into how to navigate between applications effectively.',\n", + " response='The Command + Tab shortcut on macOS is designed to switch between currently open applications, but it does not directly restore minimized or hidden windows. When you use Command + Tab, it cycles through the applications that are actively running, and minimized windows do not count as active. To manage minimized windows, you can use other shortcuts or methods. For example, you can use Command + Option + H + M to hide all other applications and minimize the most recently used one. Alternatively, you can navigate to the application you want to restore using Command + Tab and then manually click on the minimized window in the Dock to bring it back to focus.'\n", ")" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1252,22 +1327,20 @@ "\n", "## What's next?\n", "\n", - "Improving from around 37% to approximately 60% on this task, in terms of `SemanticF1`, was pretty easy.\n", + "Improving from around 42% to approximately 61% on this task, in terms of `SemanticF1`, was pretty easy.\n", "\n", "But DSPy gives you paths to continue iterating on the quality of your system and we have barely scratched the surface.\n", "\n", "In general, you have the following tools:\n", "\n", - "1. Explore better system architectures for your program, e.g. what if we ask the LM to generate search queries for the retriever? See this [notebook](https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/intro.ipynb) or the [STORM pipeline](https://arxiv.org/abs/2402.14207) built in DSPy.\n", + "1. Explore better system architectures for your program, e.g. what if we ask the LM to generate search queries for the retriever? See, e.g., the [STORM pipeline](https://arxiv.org/abs/2402.14207) built in DSPy.\n", "2. Explore different [prompt optimizers](https://arxiv.org/abs/2406.11695) or [weight optimizers](https://arxiv.org/abs/2407.10930). See the **[Optimizers Docs](/building-blocks/6-optimizers)**.\n", "3. 
Scale inference time compute using DSPy Optimizers, e.g. this [notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/agents/multi_agent.ipynb).\n", - "4. Cut cost by distilling to a smaller LM, via prompt or weight optimization, e.g. [this notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb) or [this notebook](https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/examples/qa/hotpot/multihop_finetune.ipynb).\n", + "4. Cut cost by distilling to a smaller LM, via prompt or weight optimization, e.g. [this notebook](https://github.com/stanfordnlp/dspy/blob/main/examples/nli/scone/scone.ipynb).\n", "\n", "How do you decide which ones to proceed with first?\n", "\n", - "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users.\n", - "\n", - "Learn more about the [development cycle](/building-blocks/solving_your_task) in DSPy." + "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users." ] } ], diff --git a/dsp/utils/settings.py b/dsp/utils/settings.py index 00f01eeaf..4ffbd23d9 100644 --- a/dsp/utils/settings.py +++ b/dsp/utils/settings.py @@ -1,7 +1,8 @@ +import copy import threading -from contextlib import contextmanager -from copy import deepcopy +from contextlib import contextmanager +from contextvars import ContextVar from dsp.utils.utils import dotdict DEFAULT_CONFIG = dotdict( @@ -27,85 +28,105 @@ async_max_workers=8, ) +# Global base configuration +main_thread_config = copy.deepcopy(DEFAULT_CONFIG) + +# Initialize the context variable with an empty dict as default +dspy_ctx_overrides = ContextVar('dspy_ctx_overrides', default=dotdict()) + class Settings: - """DSP configuration settings.""" + """ + A singleton class for DSPy configuration settings. + + This is thread-safe. User threads are supported both through ParallelExecutor and native threading. + - If native threading is used, the thread inherits the initial config from the main thread. + - If ParallelExecutor is used, the thread inherits the initial config from its parent thread. + """ _instance = None def __new__(cls): - """ - Singleton Pattern. See https://python-patterns.guide/gang-of-four/singleton/ - """ - if cls._instance is None: cls._instance = super().__new__(cls) - cls._instance.lock = threading.Lock() - cls._instance.main_tid = threading.get_ident() - cls._instance.main_stack = [] - cls._instance.stack_by_thread = {} - cls._instance.stack_by_thread[threading.get_ident()] = cls._instance.main_stack + cls._instance.lock = threading.Lock() # maintained here for DSPy assertions.py + return cls._instance - # TODO: remove first-class support for re-ranker and potentially combine with RM to form a pipeline of sorts - # eg: RetrieveThenRerankPipeline(RetrievalModel, Reranker) - # downstream operations like dsp.retrieve would use configs from the defined pipeline. 
+ def __getattr__(self, name): + overrides = dspy_ctx_overrides.get() + if name in overrides: + return overrides[name] + elif name in main_thread_config: + return main_thread_config[name] + else: + raise AttributeError(f"'Settings' object has no attribute '{name}'") - # make a deepcopy of the default config to avoid modifying the default config - cls._instance.__append(deepcopy(DEFAULT_CONFIG)) + def __setattr__(self, name, value): + if name in ('_instance',): + super().__setattr__(name, value) + else: + self.configure(**{name: value}) - return cls._instance + # Dictionary-like access - @property - def config(self): - thread_id = threading.get_ident() - if thread_id not in self.stack_by_thread: - self.stack_by_thread[thread_id] = [self.main_stack[-1].copy()] - return self.stack_by_thread[thread_id][-1] + def __getitem__(self, key): + return self.__getattr__(key) - def __getattr__(self, name): - if hasattr(self.config, name): - return getattr(self.config, name) + def __setitem__(self, key, value): + self.__setattr__(key, value) - if name in self.config: - return self.config[name] + def __contains__(self, key): + overrides = dspy_ctx_overrides.get() + return key in overrides or key in main_thread_config + + def get(self, key, default=None): + try: + return self[key] + except AttributeError: + return default - super().__getattr__(name) + def copy(self): + overrides = dspy_ctx_overrides.get() + return dotdict({**main_thread_config, **overrides}) - def __append(self, config): - thread_id = threading.get_ident() - if thread_id not in self.stack_by_thread: - self.stack_by_thread[thread_id] = [self.main_stack[-1].copy()] - self.stack_by_thread[thread_id].append(config) + @property + def config(self): + config = self.copy() + del config['lock'] + return config - def __pop(self): - thread_id = threading.get_ident() - if thread_id in self.stack_by_thread: - self.stack_by_thread[thread_id].pop() + # Configuration methods - def configure(self, inherit_config: bool = True, **kwargs): - """Set configuration settings. + def configure(self, return_token=False, **kwargs): + global main_thread_config + overrides = dspy_ctx_overrides.get() + new_overrides = dotdict({**copy.deepcopy(DEFAULT_CONFIG), **main_thread_config, **overrides, **kwargs}) + token = dspy_ctx_overrides.set(new_overrides) - Args: - inherit_config (bool, optional): Set configurations for the given, and use existing configurations for the rest. Defaults to True. 
- """ - if inherit_config: - config = {**self.config, **kwargs} - else: - config = {**kwargs} + # Update main_thread_config, in the main thread only + if threading.current_thread() is threading.main_thread(): + main_thread_config = new_overrides - self.__append(config) + if return_token: + return token @contextmanager - def context(self, inherit_config=True, **kwargs): - self.configure(inherit_config=inherit_config, **kwargs) - + def context(self, **kwargs): + """Context manager for temporary configuration changes.""" + token = self.configure(return_token=True, **kwargs) try: yield finally: - self.__pop() + dspy_ctx_overrides.reset(token) + + if threading.current_thread() is threading.main_thread(): + global main_thread_config + main_thread_config = dotdict({**copy.deepcopy(DEFAULT_CONFIG), **dspy_ctx_overrides.get()}) - def __repr__(self) -> str: - return repr(self.config) + def __repr__(self): + overrides = dspy_ctx_overrides.get() + combined_config = {**main_thread_config, **overrides} + return repr(combined_config) -settings = Settings() \ No newline at end of file +settings = Settings() diff --git a/dspy/__init__.py b/dspy/__init__.py index 28e0a352b..9e3e85fd2 100644 --- a/dspy/__init__.py +++ b/dspy/__init__.py @@ -6,6 +6,8 @@ from .retrieve import * from .signatures import * +import dspy.retrievers + # Functional must be imported after primitives, predict and signatures from .functional import * # isort: skip from dspy.evaluate import Evaluate # isort: skip diff --git a/dspy/adapters/chat_adapter.py b/dspy/adapters/chat_adapter.py index edb5e1870..a8ae380cd 100644 --- a/dspy/adapters/chat_adapter.py +++ b/dspy/adapters/chat_adapter.py @@ -1,26 +1,23 @@ -import re -from typing import Any, Union -from dsp.adapters.base_template import Field -from dspy.signatures.signature import Signature -from .base import Adapter -from .image_utils import encode_image, Image - import ast -import json import enum import inspect -import pydantic +import json +import re import textwrap +from collections.abc import Mapping from itertools import chain +from typing import Any, Dict, List, Literal, NamedTuple, Union, get_args, get_origin + +import pydantic from pydantic import TypeAdapter -from collections.abc import Mapping from pydantic.fields import FieldInfo -from typing import Dict, KeysView, List, Literal, NamedTuple, get_args, get_origin +from dsp.adapters.base_template import Field from dspy.adapters.base import Adapter -from ..signatures.field import OutputField -from ..signatures.signature import SignatureMeta -from ..signatures.utils import get_dspy_field_type +from dspy.adapters.image_utils import Image, encode_image +from dspy.signatures.field import OutputField +from dspy.signatures.signature import Signature, SignatureMeta +from dspy.signatures.utils import get_dspy_field_type field_header_pattern = re.compile(r"\[\[ ## (\w+) ## \]\]") @@ -33,12 +30,15 @@ class FieldInfoWithName(NamedTuple): # Built-in field indicating that a chat turn has been completed. BuiltInCompletedOutputFieldInfo = FieldInfoWithName(name="completed", info=OutputField()) + class ChatAdapter(Adapter): def format(self, signature: Signature, demos: list[dict[str, Any]], inputs: dict[str, Any]) -> list[dict[str, Any]]: messages: list[dict[str, Any]] = [] # Extract demos where some of the output_fields are not filled in. 
- incomplete_demos = [demo for demo in demos if not all(k in demo and demo[k] is not None for k in signature.fields)] + incomplete_demos = [ + demo for demo in demos if not all(k in demo and demo[k] is not None for k in signature.fields) + ] complete_demos = [demo for demo in demos if demo not in incomplete_demos] # Filter out demos that don't have at least one input and one output field. incomplete_demos = [ @@ -99,6 +99,7 @@ def format_finetune_data(self, signature, demos, inputs, outputs): # Wrap the messages in a dictionary with a "messages" key return dict(messages=messages) + def format_turn(self, signature, values, role, incomplete=False): return format_turn(signature, values, role, incomplete) @@ -112,8 +113,7 @@ def format_fields(self, signature, values, role): } return format_fields(fields_with_values) - - + def format_blob(blob): if "\n" not in blob and "«" not in blob and "»" not in blob: @@ -139,6 +139,7 @@ def format_input_list_field_value(value: List[Any]) -> str: return "\n".join([f"[{idx+1}] {format_blob(txt)}" for idx, txt in enumerate(value)]) + def _serialize_for_json(value): if isinstance(value, pydantic.BaseModel): return value.model_dump() @@ -149,6 +150,7 @@ def _serialize_for_json(value): else: return value + def _format_field_value(field_info: FieldInfo, value: Any, assume_text=True) -> Union[str, dict]: """ Formats the value of the specified field according to the field's DSPy type (input or output), @@ -171,7 +173,7 @@ def _format_field_value(field_info: FieldInfo, value: Any, assume_text=True) -> if assume_text: return string_value - elif (isinstance(value, Image) or field_info.annotation == Image): + elif isinstance(value, Image) or field_info.annotation == Image: # This validation should happen somewhere else # Safe to import PIL here because it's only imported when an image is actually being formatted try: @@ -193,7 +195,6 @@ def _format_field_value(field_info: FieldInfo, value: Any, assume_text=True) -> return {"type": "text", "text": string_value} - def format_fields(fields_with_values: Dict[FieldInfoWithName, Any], assume_text=True) -> Union[str, List[dict]]: """ Formats the values of the specified fields according to the field's DSPy type (input or output), @@ -222,10 +223,11 @@ def format_fields(fields_with_values: Dict[FieldInfoWithName, Any], assume_text= else: return output + def parse_value(value, annotation): if annotation is str: return str(value) - + parsed_value = value if isinstance(annotation, enum.EnumMeta): @@ -238,70 +240,85 @@ def parse_value(value, annotation): parsed_value = ast.literal_eval(value) except (ValueError, SyntaxError): parsed_value = value - + return TypeAdapter(annotation).validate_python(parsed_value) -def format_turn(signature, values, role, incomplete=False): - fields_to_collapse = [] +def format_turn(signature, values, role, incomplete=False): """ Constructs a new message ("turn") to append to a chat thread. The message is carefully formatted so that it can instruct an LLM to generate responses conforming to the specified DSPy signature. Args: - signature: The DSPy signature to which future LLM responses should conform. - values: A dictionary mapping field names (from the DSPy signature) to corresponding values - that should be included in the message. - role: The role of the message, which can be either "user" or "assistant". - incomplete: If True, indicates that output field values are present in the set of specified - ``values``. If False, indicates that ``values`` only contains input field values. 
+ signature: The DSPy signature to which future LLM responses should conform. + values: A dictionary mapping field names (from the DSPy signature) to corresponding values + that should be included in the message. + role: The role of the message, which can be either "user" or "assistant". + incomplete: If True, indicates that output field values are present in the set of specified + ``values``. If False, indicates that ``values`` only contains input field values. + Returns: - A chat message that can be appended to a chat thread. The message contains two string fields: - ``role`` ("user" or "assistant") and ``content`` (the message text). + A chat message that can be appended to a chat thread. The message contains two string fields: + ``role`` ("user" or "assistant") and ``content`` (the message text). """ + fields_to_collapse = [] content = [] if role == "user": - fields: Dict[str, FieldInfo] = signature.input_fields + fields = signature.input_fields if incomplete: - fields_to_collapse.append({"type": "text", "text": "This is an example of the task, though some input or output fields are not supplied."}) + fields_to_collapse.append( + { + "type": "text", + "text": "This is an example of the task, though some input or output fields are not supplied.", + } + ) else: - fields: Dict[str, FieldInfo] = signature.output_fields + fields = signature.output_fields # Add the built-in field indicating that the chat turn has been completed fields[BuiltInCompletedOutputFieldInfo.name] = BuiltInCompletedOutputFieldInfo.info values = {**values, BuiltInCompletedOutputFieldInfo.name: ""} - field_names: KeysView = fields.keys() + field_names = fields.keys() if not incomplete: if not set(values).issuperset(set(field_names)): raise ValueError(f"Expected {field_names} but got {values.keys()}") - - fields_to_collapse.extend(format_fields( - fields_with_values={ - FieldInfoWithName(name=field_name, info=field_info): values.get( - field_name, "Not supplied for this particular example." - ) - for field_name, field_info in fields.items() - }, - assume_text=False - )) + + fields_to_collapse.extend( + format_fields( + fields_with_values={ + FieldInfoWithName(name=field_name, info=field_info): values.get( + field_name, "Not supplied for this particular example." + ) + for field_name, field_info in fields.items() + }, + assume_text=False, + ) + ) if role == "user": output_fields = list(signature.output_fields.keys()) + def type_info(v): - return f" (must be formatted as a valid Python {get_annotation_name(v.annotation)})" \ - if v.annotation is not str else "" + return ( + f" (must be formatted as a valid Python {get_annotation_name(v.annotation)})" + if v.annotation is not str + else "" + ) + if output_fields: - fields_to_collapse.append({ - "type": "text", - "text": "Respond with the corresponding output fields, starting with the field " - + ", then ".join(f"`[[ ## {f} ## ]]`{type_info(v)}" for f, v in signature.output_fields.items()) - + ", and then ending with the marker for `[[ ## completed ## ]]`." 
- }) - + fields_to_collapse.append( + { + "type": "text", + "text": "Respond with the corresponding output fields, starting with the field " + + ", then ".join(f"`[[ ## {f} ## ]]`{type_info(v)}" for f, v in signature.output_fields.items()) + + ", and then ending with the marker for `[[ ## completed ## ]]`.", + } + ) + # flatmap the list if any items are lists otherwise keep the item - flattened_list = list(chain.from_iterable( - item if isinstance(item, list) else [item] for item in fields_to_collapse - )) + flattened_list = list( + chain.from_iterable(item if isinstance(item, list) else [item] for item in fields_to_collapse) + ) if all(message.get("type", None) == "text" for message in flattened_list): content = "\n\n".join(message.get("text") for message in flattened_list) @@ -314,16 +331,16 @@ def type_info(v): if not collapsed_messages: collapsed_messages.append(item) continue - - # If current item is image, add to collapsed_messages + + # If the current item is image, add to collapsed_messages if item.get("type") == "image_url": if collapsed_messages[-1].get("type") == "text": collapsed_messages[-1]["text"] += "\n" collapsed_messages.append(item) - # If previous item is text and current item is text, append to previous item + # If the previous item is text and current item is text, append to the previous item elif collapsed_messages[-1].get("type") == "text": collapsed_messages[-1]["text"] += "\n\n" + item["text"] - # If previous item is not text(aka image), add current item as a new item + # If the previous item is not text(aka image), add the current item as a new item else: item["text"] = "\n\n" + item["text"] collapsed_messages.append(item) @@ -357,16 +374,18 @@ def enumerate_fields(fields: dict[str, Field]) -> str: def move_type_to_front(d): # Move the 'type' key to the front of the dictionary, recursively, for LLM readability/adherence. 
if isinstance(d, Mapping): - return {k: move_type_to_front(v) for k, v in sorted(d.items(), key=lambda item: (item[0] != 'type', item[0]))} + return {k: move_type_to_front(v) for k, v in sorted(d.items(), key=lambda item: (item[0] != "type", item[0]))} elif isinstance(d, list): return [move_type_to_front(item) for item in d] return d + def prepare_schema(type_): schema = pydantic.TypeAdapter(type_).json_schema() schema = move_type_to_front(schema) return schema + def prepare_instructions(signature: SignatureMeta): parts = [] parts.append("Your input fields are:\n" + enumerate_fields(signature.input_fields)) @@ -374,21 +393,21 @@ def prepare_instructions(signature: SignatureMeta): parts.append("All interactions will be structured in the following way, with the appropriate values filled in.") def field_metadata(field_name, field_info): - type_ = field_info.annotation + field_type = field_info.annotation - if get_dspy_field_type(field_info) == 'input' or type_ is str: + if get_dspy_field_type(field_info) == "input" or field_type is str: desc = "" - elif type_ is bool: + elif field_type is bool: desc = "must be True or False" - elif type_ in (int, float): - desc = f"must be a single {type_.__name__} value" - elif inspect.isclass(type_) and issubclass(type_, enum.Enum): - desc= f"must be one of: {'; '.join(type_.__members__)}" - elif hasattr(type_, '__origin__') and type_.__origin__ is Literal: - desc = f"must be one of: {'; '.join([str(x) for x in type_.__args__])}" + elif field_type in (int, float): + desc = f"must be a single {field_type.__name__} value" + elif inspect.isclass(field_type) and issubclass(field_type, enum.Enum): + desc = f"must be one of: {'; '.join(field_type.__members__)}" + elif hasattr(field_type, "__origin__") and field_type.__origin__ is Literal: + desc = f"must be one of: {'; '.join([str(x) for x in field_type.__args__])}" else: desc = "must be pareseable according to the following JSON schema: " - desc += json.dumps(prepare_schema(type_), ensure_ascii=False) + desc += json.dumps(prepare_schema(field_type), ensure_ascii=False) desc = (" " * 8) + f"# note: the value you produce {desc}" if desc else "" return f"{{{field_name}}}{desc}" @@ -399,7 +418,7 @@ def format_signature_fields_for_instructions(fields: Dict[str, FieldInfo]): FieldInfoWithName(name=field_name, info=field_info): field_metadata(field_name, field_info) for field_name, field_info in fields.items() }, - assume_text=True + assume_text=True, ) parts.append(format_signature_fields_for_instructions(signature.input_fields)) diff --git a/dspy/clients/__init__.py b/dspy/clients/__init__.py index dc10f865f..546a96c75 100644 --- a/dspy/clients/__init__.py +++ b/dspy/clients/__init__.py @@ -1,15 +1,24 @@ from .lm import LM from .provider import Provider, TrainingJob from .base_lm import BaseLM, inspect_history -from .embedding import Embedding +from .embedding import Embedder import litellm import os from pathlib import Path from litellm.caching import Cache DISK_CACHE_DIR = os.environ.get("DSPY_CACHEDIR") or os.path.join(Path.home(), ".dspy_cache") +DISK_CACHE_LIMIT = int(os.environ.get("DSPY_CACHE_LIMIT", 3e10)) # 30 GB default + +# TODO: There's probably value in getting litellm to support FanoutCache and to separate the limit for +# the LM cache from the embeddings cache. Then we can lower the default 30GB limit. 
litellm.cache = Cache(disk_cache_dir=DISK_CACHE_DIR, type="disk") + +if litellm.cache.cache.disk_cache.size_limit != DISK_CACHE_LIMIT: + litellm.cache.cache.disk_cache.reset('size_limit', DISK_CACHE_LIMIT) + litellm.telemetry = False + # Turn off by default to avoid LiteLLM logging during every LM call. litellm.suppress_debug_info = True diff --git a/dspy/clients/embedding.py b/dspy/clients/embedding.py index eec41c32b..ec7c1174e 100644 --- a/dspy/clients/embedding.py +++ b/dspy/clients/embedding.py @@ -2,7 +2,7 @@ import numpy as np -class Embedding: +class Embedder: """DSPy embedding class. The class for computing embeddings for text inputs. This class provides a unified interface for both: @@ -10,7 +10,7 @@ class Embedding: 1. Hosted embedding models (e.g. OpenAI's text-embedding-3-small) via litellm integration 2. Custom embedding functions that you provide - For hosted models, simply pass the model name as a string (e.g. "openai/text-embedding-3-small"). The class will use + For hosted models, simply pass the model name as a string (e.g., "openai/text-embedding-3-small"). The class will use litellm to handle the API calls and caching. For custom embedding models, pass a callable function that: @@ -24,6 +24,9 @@ class Embedding: model: The embedding model to use. This can be either a string (representing the name of the hosted embedding model, must be an embedding model supported by litellm) or a callable that represents a custom embedding model. + batch_size (int, optional): The default batch size for processing inputs in batches. Defaults to 200. + caching (bool, optional): Whether to cache the embedding response when using a hosted model. Defaults to True. + **kwargs: Additional default keyword arguments to pass to the embedding model. Examples: Example 1: Using a hosted model. @@ -31,7 +34,7 @@ class Embedding: ```python import dspy - embedder = dspy.Embedding("openai/text-embedding-3-small") + embedder = dspy.Embedder("openai/text-embedding-3-small", batch_size=100) embeddings = embedder(["hello", "world"]) assert embeddings.shape == (2, 1536) @@ -41,37 +44,78 @@ class Embedding: ```python import dspy + import numpy as np def my_embedder(texts): return np.random.rand(len(texts), 10) - embedder = dspy.Embedding(my_embedder) - embeddings = embedder(["hello", "world"]) + embedder = dspy.Embedder(my_embedder) + embeddings = embedder(["hello", "world"], batch_size=1) assert embeddings.shape == (2, 10) ``` """ - def __init__(self, model): + def __init__(self, model, batch_size=200, caching=True, **kwargs): self.model = model + self.batch_size = batch_size + self.caching = caching + self.default_kwargs = kwargs - def __call__(self, inputs, caching=True, **kwargs): + def __call__(self, inputs, batch_size=None, caching=None, **kwargs): """Compute embeddings for the given inputs. Args: inputs: The inputs to compute embeddings for, can be a single string or a list of strings. - caching: Whether to cache the embedding response, only valid when using a hosted embedding model. - kwargs: Additional keyword arguments to pass to the embedding model. + batch_size (int, optional): The batch size for processing inputs. If None, defaults to the batch_size set during initialization. + caching (bool, optional): Whether to cache the embedding response when using a hosted model. If None, defaults to the caching setting from initialization. + **kwargs: Additional keyword arguments to pass to the embedding model. These will override the default kwargs provided during initialization. 
Returns: - A 2-D numpy array of embeddings, one embedding per row. + numpy.ndarray: If the input is a single string, returns a 1D numpy array representing the embedding. + If the input is a list of strings, returns a 2D numpy array of embeddings, one embedding per row. """ + if isinstance(inputs, str): + is_single_input = True inputs = [inputs] - if isinstance(self.model, str): - embedding_response = litellm.embedding(model=self.model, input=inputs, caching=caching, **kwargs) - return np.array([data["embedding"] for data in embedding_response.data], dtype=np.float32) - elif callable(self.model): - return np.array(self.model(inputs, **kwargs), dtype=np.float32) else: - raise ValueError(f"`model` in `dspy.Embedding` must be a string or a callable, but got {type(self.model)}.") + is_single_input = False + + assert all(isinstance(inp, str) for inp in inputs), "All inputs must be strings." + + if batch_size is None: + batch_size = self.batch_size + if caching is None: + caching = self.caching + + merged_kwargs = self.default_kwargs.copy() + merged_kwargs.update(kwargs) + + embeddings_list = [] + + def chunk(inputs_list, size): + for i in range(0, len(inputs_list), size): + yield inputs_list[i : i + size] + + for batch_inputs in chunk(inputs, batch_size): + if isinstance(self.model, str): + embedding_response = litellm.embedding( + model=self.model, input=batch_inputs, caching=caching, **merged_kwargs + ) + batch_embeddings = [data["embedding"] for data in embedding_response.data] + elif callable(self.model): + batch_embeddings = self.model(batch_inputs, **merged_kwargs) + else: + raise ValueError( + f"`model` in `dspy.Embedder` must be a string or a callable, but got {type(self.model)}." + ) + + embeddings_list.extend(batch_embeddings) + + embeddings = np.array(embeddings_list, dtype=np.float32) + + if is_single_input: + return embeddings[0] + else: + return embeddings diff --git a/dspy/datasets/dataloader.py b/dspy/datasets/dataloader.py index 87d534f9d..351b84418 100644 --- a/dspy/datasets/dataloader.py +++ b/dspy/datasets/dataloader.py @@ -10,9 +10,7 @@ class DataLoader(Dataset): - def __init__( - self, - ): + def __init__(self): pass def from_huggingface( @@ -97,8 +95,7 @@ def from_parquet(self, file_path: str, fields: List[str] = None, input_keys: Tup return [dspy.Example({field: row[field] for field in fields}).with_inputs(input_keys) for row in dataset] - def from_rm(self, num_samples: int, - fields: List[str], input_keys: List[str]) -> List[dspy.Example]: + def from_rm(self, num_samples: int, fields: List[str], input_keys: List[str]) -> List[dspy.Example]: try: rm = dspy.settings.rm try: @@ -107,9 +104,13 @@ def from_rm(self, num_samples: int, for row in rm.get_objects(num_samples=num_samples, fields=fields) ] except AttributeError: - raise ValueError("Retrieval module does not support `get_objects`. Please use a different retrieval module.") + raise ValueError( + "Retrieval module does not support `get_objects`. Please use a different retrieval module." + ) except AttributeError: - raise ValueError("Retrieval module not found. Please set a retrieval module using `dspy.settings.configure`.") + raise ValueError( + "Retrieval module not found. Please set a retrieval module using `dspy.settings.configure`." + ) def sample( self, @@ -119,7 +120,9 @@ def sample( **kwargs, ) -> List[dspy.Example]: if not isinstance(dataset, list): - raise ValueError(f"Invalid dataset provided of type {type(dataset)}. 
Please provide a list of examples.") + raise ValueError( + f"Invalid dataset provided of type {type(dataset)}. Please provide a list of `dspy.Example`s." + ) return random.sample(dataset, n, *args, **kwargs) @@ -141,7 +144,11 @@ def train_test_split( elif train_size is not None and isinstance(train_size, int): train_end = train_size else: - raise ValueError("Invalid train_size. Please provide a float between 0 and 1 or an int.") + raise ValueError( + "Invalid `train_size`. Please provide a float between 0 and 1 to represent the proportion of the " + "dataset to include in the train split or an int to represent the absolute number of samples to " + f"include in the train split. Received `train_size`: {train_size}." + ) if test_size is not None: if isinstance(test_size, float) and (0 < test_size < 1): @@ -149,9 +156,16 @@ def train_test_split( elif isinstance(test_size, int): test_end = test_size else: - raise ValueError("Invalid test_size. Please provide a float between 0 and 1 or an int.") + raise ValueError( + "Invalid `test_size`. Please provide a float between 0 and 1 to represent the proportion of the " + "dataset to include in the test split or an int to represent the absolute number of samples to " + f"include in the test split. Received `test_size`: {test_size}." + ) if train_end + test_end > len(dataset_shuffled): - raise ValueError("train_size + test_size cannot exceed the total number of samples.") + raise ValueError( + "`train_size` + `test_size` cannot exceed the total number of samples. Received " + f"`train_size`: {train_end}, `test_size`: {test_end}, and `dataset_size`: {len(dataset_shuffled)}." + ) else: test_end = len(dataset_shuffled) - train_end diff --git a/dspy/evaluate/auto_evaluation.py b/dspy/evaluate/auto_evaluation.py index 38b02fe35..d98332143 100644 --- a/dspy/evaluate/auto_evaluation.py +++ b/dspy/evaluate/auto_evaluation.py @@ -14,14 +14,35 @@ class SemanticRecallPrecision(dspy.Signature): precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth") +class DecompositionalSemanticRecallPrecision(dspy.Signature): + """ + Compare a system's response to the ground truth to compute recall and precision of key ideas. + You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision. 
+ """ + + question: str = dspy.InputField() + ground_truth: str = dspy.InputField() + system_response: str = dspy.InputField() + ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth") + system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response") + discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response") + recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response") + precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth") + + def f1_score(precision, recall): + precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall)) return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall) class SemanticF1(dspy.Module): - def __init__(self, threshold=0.66): + def __init__(self, threshold=0.66, decompositional=False): self.threshold = threshold - self.module = dspy.ChainOfThought(SemanticRecallPrecision) + + if decompositional: + self.module = dspy.ChainOfThought(DecompositionalSemanticRecallPrecision) + else: + self.module = dspy.ChainOfThought(SemanticRecallPrecision) def forward(self, example, pred, trace=None): scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response) @@ -30,42 +51,92 @@ def forward(self, example, pred, trace=None): return score if trace is None else score >= self.threshold -""" -Soon-to-be deprecated Signatures & Modules Below. -""" + +########### + + +class AnswerCompleteness(dspy.Signature): + """ + Estimate the completeness of a system's responses, against the ground truth. + You will first enumerate key ideas in each response, discuss their overlap, and then report completeness. + """ + + question: str = dspy.InputField() + ground_truth: str = dspy.InputField() + system_response: str = dspy.InputField() + ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth") + system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response") + discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response") + completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response") + + + +class AnswerGroundedness(dspy.Signature): + """ + Estimate the groundedness of a system's responses, against real retrieved documents written by people. + You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then + discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense. 
+ """ + + question: str = dspy.InputField() + retrieved_context: str = dspy.InputField() + system_response: str = dspy.InputField() + system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response") + discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context") + groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context") + + +class CompleteAndGrounded(dspy.Module): + def __init__(self, threshold=0.66): + self.threshold = threshold + self.completeness_module = dspy.ChainOfThought(AnswerCompleteness) + self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness) + + def forward(self, example, pred, trace=None): + completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response) + groundedness = self.groundedness_module(question=example.question, retrieved_context=pred.context, system_response=pred.response) + score = f1_score(groundedness.groundedness, completeness.completeness) + + return score if trace is None else score >= self.threshold + + + +# """ +# Soon-to-be deprecated Signatures & Modules Below. +# """ -class AnswerCorrectnessSignature(dspy.Signature): - """Verify that the predicted answer matches the gold answer.""" +# class AnswerCorrectnessSignature(dspy.Signature): +# """Verify that the predicted answer matches the gold answer.""" - question = dspy.InputField() - gold_answer = dspy.InputField(desc="correct answer for question") - predicted_answer = dspy.InputField(desc="predicted answer for question") - is_correct = dspy.OutputField(desc="True or False") +# question = dspy.InputField() +# gold_answer = dspy.InputField(desc="correct answer for question") +# predicted_answer = dspy.InputField(desc="predicted answer for question") +# is_correct = dspy.OutputField(desc="True or False") -class AnswerCorrectness(dspy.Module): - def __init__(self): - super().__init__() - self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature) +# class AnswerCorrectness(dspy.Module): +# def __init__(self): +# super().__init__() +# self.evaluate_correctness = dspy.ChainOfThought(AnswerCorrectnessSignature) - def forward(self, question, gold_answer, predicted_answer): - return self.evaluate_correctness(question=question, gold_answer=gold_answer, predicted_answer=predicted_answer) +# def forward(self, question, gold_answer, predicted_answer): +# return self.evaluate_correctness(question=question, gold_answer=gold_answer, predicted_answer=predicted_answer) -class AnswerFaithfulnessSignature(dspy.Signature): - """Verify that the predicted answer is based on the provided context.""" +# class AnswerFaithfulnessSignature(dspy.Signature): +# """Verify that the predicted answer is based on the provided context.""" - context = dspy.InputField(desc="relevant facts for producing answer") - question = dspy.InputField() - answer = dspy.InputField(desc="often between 1 and 5 words") - is_faithful = dspy.OutputField(desc="True or False") +# context = dspy.InputField(desc="relevant facts for producing answer") +# question = dspy.InputField() +# answer = dspy.InputField(desc="often between 1 and 5 words") +# is_faithful = dspy.OutputField(desc="True or False") -class AnswerFaithfulness(dspy.Module): - def __init__(self): - super().__init__() - self.evaluate_faithfulness = dspy.ChainOfThought(AnswerFaithfulnessSignature) +# class 
AnswerFaithfulness(dspy.Module): +# def __init__(self): +# super().__init__() +# self.evaluate_faithfulness = dspy.ChainOfThought(AnswerFaithfulnessSignature) - def forward(self, context, question, answer): - return self.evaluate_faithfulness(context=context, question=question, answer=answer) +# def forward(self, context, question, answer): +# return self.evaluate_faithfulness(context=context, question=question, answer=answer) diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py index 04a135537..9ef0bf733 100644 --- a/dspy/evaluate/evaluate.py +++ b/dspy/evaluate/evaluate.py @@ -54,6 +54,7 @@ def __init__( return_all_scores=False, return_outputs=False, provide_traceback=False, + failure_score=0.0, **_kwargs, ): self.devset = devset @@ -65,6 +66,7 @@ def __init__( self.return_all_scores = return_all_scores self.return_outputs = return_outputs self.provide_traceback = provide_traceback + self.failure_score = failure_score def __call__( self, @@ -85,7 +87,6 @@ def __call__( return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores return_outputs = return_outputs if return_outputs is not None else self.return_outputs - devset = list(enumerate(devset)) tqdm.tqdm._instances.clear() executor = ParallelExecutor( @@ -96,39 +97,27 @@ def __call__( compare_results=True, ) - def process_item(item): - try: - example_idx, example = item - prediction = program(**example.inputs()) - score = metric(example, prediction) + def process_item(example): + prediction = program(**example.inputs()) + score = metric(example, prediction) - # Increment assert and suggest failures to program's attributes - if hasattr(program, "_assert_failures"): - program._assert_failures += dspy.settings.get("assert_failures") - if hasattr(program, "_suggest_failures"): - program._suggest_failures += dspy.settings.get("suggest_failures") + # Increment assert and suggest failures to program's attributes + if hasattr(program, "_assert_failures"): + program._assert_failures += dspy.settings.get("assert_failures") + if hasattr(program, "_suggest_failures"): + program._suggest_failures += dspy.settings.get("suggest_failures") - return example_idx, example, prediction, score - except Exception: - return example_idx, example, {}, 0.0 + return prediction, score results = executor.execute(process_item, devset) - reordered_devset = [r for r in results if r is not None] + assert len(devset) == len(results) - ncorrect = sum(score for _, _, _, score in reordered_devset) - ntotal = len(reordered_devset) - - if ntotal == 0: - logger.warning("No valid results to compute metrics.") - return 0.0 + results = [((dspy.Prediction(), self.failure_score) if r is None else r) for r in results] + results = [(example, prediction, score) for example, (prediction, score) in zip(devset, results)] + ncorrect, ntotal = sum(score for *_, score in results), len(devset) logger.info(f"Average Metric: {ncorrect} / {ntotal} ({round(100 * ncorrect / ntotal, 1)}%)") - - predicted_devset = sorted(reordered_devset) - - if return_outputs: # Handle the return_outputs logic - results = [(example, prediction, score) for _, example, prediction, score in predicted_devset] - + def prediction_is_dictlike(prediction): # Downstream logic for displaying dictionary-like predictions depends solely on the predictions # having a method called `items()` for iterating through key/value pairs @@ -140,12 +129,12 @@ def prediction_is_dictlike(prediction): if prediction_is_dictlike(prediction) else dict(example) | {"prediction": prediction, 
"correct": score} ) - for _, example, prediction, score in predicted_devset + for example, prediction, score in results ] - result_df = pd.DataFrame(data) # Truncate every cell in the DataFrame (DataFrame.applymap was renamed to DataFrame.map in Pandas 2.1.0) + result_df = pd.DataFrame(data) result_df = result_df.map(truncate_cell) if hasattr(result_df, "map") else result_df.applymap(truncate_cell) # Rename the 'correct' column to the name of the metric object @@ -179,9 +168,9 @@ def prediction_is_dictlike(prediction): display(HTML(message)) if return_all_scores and return_outputs: - return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in predicted_devset] + return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results] if return_all_scores: - return round(100 * ncorrect / ntotal, 2), [score for *_, score in predicted_devset] + return round(100 * ncorrect / ntotal, 2), [score for *_, score in results] if return_outputs: return round(100 * ncorrect / ntotal, 2), results diff --git a/dspy/predict/knn.py b/dspy/predict/knn.py index 434a07aaa..17a5a3fb7 100644 --- a/dspy/predict/knn.py +++ b/dspy/predict/knn.py @@ -13,7 +13,7 @@ def __init__(self, k: int, trainset: List[dsp.Example], vectorizer=None): Args: k: Number of nearest neighbors to retrieve trainset: List of training examples to search through - vectorizer: Optional dspy.Embedding for computing embeddings. If None, uses sentence-transformers. + vectorizer: Optional dspy.Embedder for computing embeddings. If None, uses sentence-transformers. Example: >>> trainset = [dsp.Example(input="hello", output="world"), ...] @@ -24,7 +24,7 @@ def __init__(self, k: int, trainset: List[dsp.Example], vectorizer=None): self.k = k self.trainset = trainset - self.embedding = vectorizer or dspy.Embedding(dsp.SentenceTransformersVectorizer()) + self.embedding = vectorizer or dspy.Embedder(dsp.SentenceTransformersVectorizer()) trainset_casted_to_vectorize = [ " | ".join([f"{key}: {value}" for key, value in example.items() if key in example._input_keys]) for example in self.trainset diff --git a/dspy/predict/react.py b/dspy/predict/react.py index ce8edaa46..28640d5d1 100644 --- a/dspy/predict/react.py +++ b/dspy/predict/react.py @@ -9,7 +9,7 @@ class Tool: def __init__(self, func: Callable, name: str = None, desc: str = None, args: dict[str, Any] = None): - annotations_func = func if inspect.isfunction(func) else func.__call__ + annotations_func = func if inspect.isfunction(func) or inspect.ismethod(func) else func.__call__ self.func = func self.name = name or getattr(func, '__name__', type(func).__name__) self.desc = desc or getattr(func, '__doc__', None) or getattr(annotations_func, '__doc__', "") diff --git a/dspy/retrievers/__init__.py b/dspy/retrievers/__init__.py new file mode 100644 index 000000000..3fdc977bb --- /dev/null +++ b/dspy/retrievers/__init__.py @@ -0,0 +1 @@ +from .embeddings import Embeddings \ No newline at end of file diff --git a/dspy/retrievers/embeddings.py b/dspy/retrievers/embeddings.py new file mode 100644 index 000000000..75e1ff1fb --- /dev/null +++ b/dspy/retrievers/embeddings.py @@ -0,0 +1,83 @@ +import numpy as np +from typing import Any, List, Optional +from dspy.utils.unbatchify import Unbatchify + +# TODO: Add .save and .load methods! 
+ + +class Embeddings: + def __init__( + self, + corpus: List[str], + embedder, + k: int = 5, + callbacks: Optional[List[Any]] = None, + cache: bool = False, + brute_force_threshold: int = 20_000, + normalize: bool = True + ): + assert cache is False, "Caching is not supported for embeddings-based retrievers" + + self.embedder = embedder + self.k = k + self.corpus = corpus + self.normalize = normalize + + self.corpus_embeddings = self.embedder(self.corpus) + self.corpus_embeddings = self._normalize(self.corpus_embeddings) if self.normalize else self.corpus_embeddings + + self.index = self._build_faiss() if len(corpus) >= brute_force_threshold else None + self.search_fn = Unbatchify(self._batch_forward) + + def __call__(self, query: str): + return self.forward(query) + + def forward(self, query: str): + import dspy + return dspy.Prediction(passages=self.search_fn(query)) + + def _batch_forward(self, queries: List[str]): + q_embeds = self.embedder(queries) + q_embeds = self._normalize(q_embeds) if self.normalize else q_embeds + + pids = self._faiss_search(q_embeds, self.k * 10) if self.index else None + pids = np.tile(np.arange(len(self.corpus)), (len(queries), 1)) if pids is None else pids + + return self._rerank_and_predict(q_embeds, pids) + + def _build_faiss(self): + nbytes = 32 + partitions = int(2 * np.sqrt(len(self.corpus))) + dim = self.corpus_embeddings.shape[1] + + try: + import faiss + except ImportError: + raise ImportError("Please `pip install faiss-cpu` or increase `brute_force_threshold` to avoid FAISS.") + + quantizer = faiss.IndexFlatL2(dim) + index = faiss.IndexIVFPQ(quantizer, dim, partitions, nbytes, 8) + + print(f"Training a {nbytes}-byte FAISS index with {partitions} partitions, based on " + f"{len(self.corpus)} x {dim}-dim embeddings") + index.train(self.corpus_embeddings) + index.add(self.corpus_embeddings) + index.nprobe = min(16, partitions) + + return index + + def _faiss_search(self, query_embeddings: np.ndarray, num_candidates: int): + return self.index.search(query_embeddings, num_candidates)[1] + + def _rerank_and_predict(self, q_embeds: np.ndarray, candidate_indices: np.ndarray): + candidate_embeddings = self.corpus_embeddings[candidate_indices] + scores = np.einsum('qd,qkd->qk', q_embeds, candidate_embeddings) + + top_k_indices = np.argsort(-scores, axis=1)[:, :self.k] + top_indices = candidate_indices[np.arange(len(q_embeds))[:, None], top_k_indices] + + return [[self.corpus[idx] for idx in indices] for indices in top_indices] + + def _normalize(self, embeddings: np.ndarray): + norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + return embeddings / np.maximum(norms, 1e-10) diff --git a/dspy/signatures/signature.py b/dspy/signatures/signature.py index 88812be0c..387b9d771 100644 --- a/dspy/signatures/signature.py +++ b/dspy/signatures/signature.py @@ -1,5 +1,6 @@ import ast import inspect +import logging import re import types import typing @@ -11,8 +12,9 @@ from pydantic.fields import FieldInfo import dsp -from dspy.signatures.field import InputField, OutputField, new_to_old_field from dspy.adapters.image_utils import Image +from dspy.signatures.field import InputField, OutputField, new_to_old_field + def signature_to_template(signature, adapter=None) -> dsp.Template: """Convert from new to legacy format.""" @@ -242,8 +244,8 @@ class Signature(BaseModel, metaclass=SignatureMeta): @classmethod @contextmanager def replace( - cls: "Signature", - new_signature: "Signature", + cls, + new_signature: "Type[Signature]", validate_new_signature: bool = True, ) 
-> typing.Generator[None, None, None]: """Replace the signature with an updated version. @@ -262,16 +264,35 @@ def replace( f"Field '{field}' is missing from the updated signature '{new_signature.__class__}.", ) - class OldSignature(cls, Signature): + class OldSignature(cls): pass - replace_fields = ["__doc__", "model_fields", "model_extra", "model_config"] - for field in replace_fields: - setattr(cls, field, getattr(new_signature, field)) + def swap_attributes(source: Type[Signature]): + unhandled = {} + + for attr in ["__doc__", "__pydantic_fields__", "model_fields", "model_extra", "model_config"]: + try: + setattr(cls, attr, getattr(source, attr)) + except AttributeError as exc: + if attr in ("__pydantic_fields__", "model_fields"): + version = "< 2.10" if attr == "__pydantic_fields__" else ">= 2.10" + logging.debug(f"Model attribute {attr} not replaced, expected with pydantic {version}") + unhandled[attr] = exc + else: + raise exc + + # if neither of the attributes were replaced, raise an error to prevent silent failures + if set(unhandled.keys()) >= {"model_fields", "__pydantic_fields__"}: + raise ValueError("Failed to replace either model_fields or __pydantic_fields__") from ( + unhandled.get("model_fields") or unhandled.get("__pydantic_fields__") + ) + + swap_attributes(new_signature) cls.model_rebuild(force=True) + yield - for field in replace_fields: - setattr(cls, field, getattr(OldSignature, field)) + + swap_attributes(OldSignature) cls.model_rebuild(force=True) @@ -383,7 +404,7 @@ def _parse_type_node(node, names=None) -> Any: without using structural pattern matching introduced in Python 3.10. """ - + if names is None: names = typing.__dict__ @@ -401,7 +422,7 @@ def _parse_type_node(node, names=None) -> Any: id_ = node.id if id_ in names: return names[id_] - + for type_ in [int, str, float, bool, list, tuple, dict, Image]: if type_.__name__ == id_: return type_ @@ -420,7 +441,7 @@ def _parse_type_node(node, names=None) -> Any: keys = [kw.arg for kw in node.keywords] values = [kw.value.value for kw in node.keywords] return Field(**dict(zip(keys, values))) - + if isinstance(node, ast.Attribute) and node.attr == "Image": return Image diff --git a/dspy/utils/__init__.py b/dspy/utils/__init__.py index f12b34b18..ba205504e 100644 --- a/dspy/utils/__init__.py +++ b/dspy/utils/__init__.py @@ -2,3 +2,17 @@ from dspy.utils.dummies import * from dspy.utils.caching import * from dspy.utils.logging_utils import * + +import os +import ujson +import requests + +def download(url): + filename = os.path.basename(url) + remote_size = int(requests.head(url, allow_redirects=True).headers.get('Content-Length', 0)) + local_size = os.path.getsize(filename) if os.path.exists(filename) else 0 + + if local_size != remote_size: + print(f"Downloading '{filename}'...") + with requests.get(url, stream=True) as r, open(filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): f.write(chunk) diff --git a/dspy/utils/asyncify.py b/dspy/utils/asyncify.py index ca801e12a..03bd9a7e9 100644 --- a/dspy/utils/asyncify.py +++ b/dspy/utils/asyncify.py @@ -24,22 +24,7 @@ def get_limiter(): def asyncify(program): - import dspy import threading - - assert threading.get_ident() == dspy.settings.main_tid, "asyncify can only be called from the main thread" - - def wrapped(*args, **kwargs): - thread_stacks = dspy.settings.stack_by_thread - current_thread_id = threading.get_ident() - creating_new_thread = current_thread_id not in thread_stacks - - assert creating_new_thread - thread_stacks[current_thread_id] = 
list(dspy.settings.main_stack) - - try: - return program(*args, **kwargs) - finally: - del thread_stacks[threading.get_ident()] - - return asyncer.asyncify(wrapped, abandon_on_cancel=True, limiter=get_limiter()) + assert threading.current_thread() is threading.main_thread(), "asyncify can only be called from the main thread" + # NOTE: To allow this to be nested, we'd need behavior with contextvars like parallelizer.py + return asyncer.asyncify(program, abandon_on_cancel=True, limiter=get_limiter()) diff --git a/dspy/utils/parallelizer.py b/dspy/utils/parallelizer.py index 27983632b..c6b5f3d5f 100644 --- a/dspy/utils/parallelizer.py +++ b/dspy/utils/parallelizer.py @@ -1,16 +1,15 @@ -import logging import sys import tqdm -import dspy import signal +import logging import threading import traceback import contextlib +from contextvars import copy_context from tqdm.contrib.logging import logging_redirect_tqdm from concurrent.futures import ThreadPoolExecutor, as_completed - logger = logging.getLogger(__name__) @@ -23,6 +22,8 @@ def __init__( provide_traceback=False, compare_results=False, ): + """Offers isolation between the tasks (dspy.settings) irrespective of whether num_threads == 1 or > 1.""" + self.num_threads = num_threads self.disable_progress_bar = disable_progress_bar self.max_errors = max_errors @@ -33,34 +34,18 @@ def __init__( self.error_lock = threading.Lock() self.cancel_jobs = threading.Event() - def execute(self, function, data): wrapped_function = self._wrap_function(function) if self.num_threads == 1: - return self._execute_single_thread(wrapped_function, data) + return self._execute_isolated_single_thread(wrapped_function, data) else: return self._execute_multi_thread(wrapped_function, data) - def _wrap_function(self, function): - # Wrap the function with threading context and error handling - def wrapped(item, parent_id=None): - thread_stacks = dspy.settings.stack_by_thread - current_thread_id = threading.get_ident() - creating_new_thread = current_thread_id not in thread_stacks - - assert creating_new_thread or threading.get_ident() == dspy.settings.main_tid - - if creating_new_thread: - # If we have a parent thread ID, copy its stack. TODO: Should the caller just pass a copy of the stack? - if parent_id and parent_id in thread_stacks: - thread_stacks[current_thread_id] = list(thread_stacks[parent_id]) - else: - thread_stacks[current_thread_id] = list(dspy.settings.main_stack) - - # TODO: Consider the behavior below. - # import copy; thread_stacks[current_thread_id].append(copy.deepcopy(thread_stacks[current_thread_id][-1])) - + # Wrap the function with error handling + def wrapped(item): + if self.cancel_jobs.is_set(): + return None try: return function(item) except Exception as e: @@ -79,45 +64,53 @@ def wrapped(item, parent_id=None): f"Error processing item {item}: {e}. Set `provide_traceback=True` to see the stack trace." 
) return None - finally: - if creating_new_thread: - del thread_stacks[threading.get_ident()] return wrapped - - def _execute_single_thread(self, function, data): + def _execute_isolated_single_thread(self, function, data): results = [] pbar = tqdm.tqdm( total=len(data), dynamic_ncols=True, disable=self.disable_progress_bar, - file=sys.stdout, + file=sys.stdout ) + for item in data: with logging_redirect_tqdm(): if self.cancel_jobs.is_set(): break - result = function(item) + + # Create an isolated context for each task + task_ctx = copy_context() + result = task_ctx.run(function, item) results.append(result) + if self.compare_results: # Assumes score is the last element of the result tuple - self._update_progress(pbar, sum([r[-1] for r in results if r is not None]), len([r for r in data if r is not None])) + self._update_progress( + pbar, + sum([r[-1] for r in results if r is not None]), + len([r for r in data if r is not None]), + ) else: self._update_progress(pbar, len(results), len(data)) + pbar.close() + if self.cancel_jobs.is_set(): logger.warning("Execution was cancelled due to errors.") raise Exception("Execution was cancelled due to errors.") - return results + return results def _update_progress(self, pbar, nresults, ntotal): if self.compare_results: - pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({round(100 * nresults / ntotal, 1) if ntotal > 0 else 0}%)") + percentage = round(100 * nresults / ntotal, 1) if ntotal > 0 else 0 + pbar.set_description(f"Average Metric: {nresults:.2f} / {ntotal} ({percentage}%)") else: pbar.set_description(f"Processed {nresults} / {ntotal} examples") - pbar.update() + pbar.update() def _execute_multi_thread(self, function, data): results = [None] * len(data) # Pre-allocate results list to maintain order @@ -132,6 +125,7 @@ def interrupt_handler_manager(): def interrupt_handler(sig, frame): self.cancel_jobs.set() logger.warning("Received SIGINT. 
Cancelling execution.") + # Re-raise the signal to allow default behavior default_handler(sig, frame) signal.signal(signal.SIGINT, interrupt_handler) @@ -143,37 +137,53 @@ def interrupt_handler(sig, frame): # If not in the main thread, skip setting signal handlers yield - def cancellable_function(index_item, parent_id=None): + def cancellable_function(index_item): index, item = index_item if self.cancel_jobs.is_set(): return index, job_cancelled - return index, function(item, parent_id) - - parent_id = threading.get_ident() if threading.current_thread() is not threading.main_thread() else None + return index, function(item) with ThreadPoolExecutor(max_workers=self.num_threads) as executor, interrupt_handler_manager(): - futures = {executor.submit(cancellable_function, pair, parent_id): pair for pair in enumerate(data)} + futures = {} + for pair in enumerate(data): + # Capture the context for each task + task_ctx = copy_context() + future = executor.submit(task_ctx.run, cancellable_function, pair) + futures[future] = pair + pbar = tqdm.tqdm( total=len(data), dynamic_ncols=True, disable=self.disable_progress_bar, - file=sys.stdout, + file=sys.stdout ) for future in as_completed(futures): index, result = future.result() - + if result is job_cancelled: continue + results[index] = result if self.compare_results: # Assumes score is the last element of the result tuple - self._update_progress(pbar, sum([r[-1] for r in results if r is not None]), len([r for r in results if r is not None])) + self._update_progress( + pbar, + sum([r[-1] for r in results if r is not None]), + len([r for r in results if r is not None]), + ) else: - self._update_progress(pbar, len([r for r in results if r is not None]), len(data)) + self._update_progress( + pbar, + len([r for r in results if r is not None]), + len(data), + ) + pbar.close() + if self.cancel_jobs.is_set(): logger.warning("Execution was cancelled due to errors.") raise Exception("Execution was cancelled due to errors.") + return results diff --git a/dspy/utils/unbatchify.py b/dspy/utils/unbatchify.py new file mode 100644 index 000000000..bafdc8cb3 --- /dev/null +++ b/dspy/utils/unbatchify.py @@ -0,0 +1,111 @@ +import time +import queue +import threading +from typing import Any, Callable, List +from concurrent.futures import Future + +class Unbatchify: + def __init__( + self, + batch_fn: Callable[[List[Any]], List[Any]], + max_batch_size: int = 32, + max_wait_time: float = 0.1 + ): + """ + Initializes the Unbatchify. + + Args: + batch_fn: The batch-processing function that accepts a list of inputs and returns a list of outputs. + max_batch_size: The maximum number of items to include in a batch. + max_wait_time: The maximum time (in seconds) to wait for batch to fill before processing. + """ + + self.batch_fn = batch_fn + self.max_batch_size = max_batch_size + self.max_wait_time = max_wait_time + self.input_queue = queue.Queue() + self.stop_event = threading.Event() + self.worker_thread = threading.Thread(target=self._worker) + self.worker_thread.daemon = True # Ensures thread exits when main program exits + self.worker_thread.start() + + def __call__(self, input_item: Any) -> Any: + """ + Thread-safe function that accepts a single input and returns the corresponding output. + + Args: + input_item: The single input item to process. + + Returns: + The output corresponding to the input_item after processing through batch_fn. 
+ """ + future = Future() + self.input_queue.put((input_item, future)) + try: + result = future.result() + except Exception as e: + raise e + return result + + def _worker(self): + """ + Worker thread that batches inputs and processes them using batch_fn. + """ + while not self.stop_event.is_set(): + batch = [] + futures = [] + start_time = time.time() + while len(batch) < self.max_batch_size and (time.time() - start_time) < self.max_wait_time: + try: + input_item, future = self.input_queue.get(timeout=self.max_wait_time) + batch.append(input_item) + futures.append(future) + except queue.Empty: + break + + if batch: + try: + outputs = self.batch_fn(batch) + for output, future in zip(outputs, futures): + future.set_result(output) + except Exception as e: + for future in futures: + future.set_exception(e) + else: + time.sleep(0.01) + + # Clean up remaining items when stopping + while True: + try: + _, future = self.input_queue.get_nowait() + future.set_exception(RuntimeError("Unbatchify is closed")) + except queue.Empty: + break + + print("Worker thread has been terminated.") + + def close(self): + """ + Stops the worker thread and cleans up resources. + """ + if not self.stop_event.is_set(): + self.stop_event.set() + self.worker_thread.join() + + def __enter__(self): + """ + Enables use as a context manager. + """ + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Ensures resources are cleaned up when exiting context. + """ + self.close() + + def __del__(self): + """ + Ensures the worker thread is terminated when the object is garbage collected. + """ + self.close() diff --git a/requirements-dev.txt b/requirements-dev.txt index 98d89e732..23984fa07 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,7 @@ black==24.2.0 +datamodel-code-generator==0.26.3 +litellm[proxy]==1.51.0 +pillow==10.4.0 pre-commit==3.7.0 pytest==8.3.3 pytest-env==1.1.3 @@ -6,5 +9,3 @@ pytest-mock==3.12.0 ruff==0.3.0 torch==2.2.1 transformers==4.38.2 -pillow==10.4.0 -litellm[proxy]==1.51.0 diff --git a/tests/clients/test_embedding.py b/tests/clients/test_embedding.py index d12850e52..0ac9e24ba 100644 --- a/tests/clients/test_embedding.py +++ b/tests/clients/test_embedding.py @@ -2,7 +2,7 @@ from unittest.mock import Mock, patch import numpy as np -from dspy.clients.embedding import Embedding +from dspy.clients.embedding import Embedder # Mock response format similar to litellm's embedding response. @@ -27,7 +27,7 @@ def test_litellm_embedding(): mock_litellm.return_value = MockEmbeddingResponse(mock_embeddings) # Create embedding instance and call it. - embedding = Embedding(model) + embedding = Embedder(model) result = embedding(inputs) # Verify litellm was called with correct parameters. 
@@ -51,7 +51,7 @@ def mock_embedding_fn(texts): return expected_embeddings # Create embedding instance with callable - embedding = Embedding(mock_embedding_fn) + embedding = Embedder(mock_embedding_fn) result = embedding(inputs) np.testing.assert_allclose(result, expected_embeddings) @@ -60,5 +60,5 @@ def mock_embedding_fn(texts): def test_invalid_model_type(): # Test that invalid model type raises ValueError with pytest.raises(ValueError): - embedding = Embedding(123) # Invalid model type + embedding = Embedder(123) # Invalid model type embedding(["test"]) diff --git a/tests/predict/test_react.py b/tests/predict/test_react.py index 8435f86a9..4c6a150db 100644 --- a/tests/predict/test_react.py +++ b/tests/predict/test_react.py @@ -2,6 +2,7 @@ import dspy from dspy.utils.dummies import DummyLM, dummy_rm +from dspy.predict import react # def test_example_no_tools(): @@ -121,4 +122,28 @@ # react = dspy.ReAct(ExampleSignature) # assert react.react[0].signature.instructions is not None -# assert react.react[0].signature.instructions.startswith("You are going to generate output based on input.") \ No newline at end of file +# assert react.react[0].signature.instructions.startswith("You are going to generate output based on input.") + +def test_tool_from_function(): + def foo(a: int, b: int) -> int: + """Add two numbers.""" + return a + b + + tool = react.Tool(foo) + assert tool.name == "foo" + assert tool.desc == "Add two numbers." + assert tool.args == {"a": "int", "b": "int"} + +def test_tool_from_class(): + class Foo: + def __init__(self, user_id: str): + self.user_id = user_id + + def foo(self, a: int, b: int) -> int: + """Add two numbers.""" + return a + b + + tool = react.Tool(Foo("123").foo) + assert tool.name == "foo" + assert tool.desc == "Add two numbers." + assert tool.args == {"a": "int", "b": "int"}
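
As a usage note on the `Embedding` → `Embedder` rename and the new batching parameters introduced above: the sketch below simply mirrors the docstring examples in `dspy/clients/embedding.py`. The hosted-model call assumes OpenAI credentials are configured for litellm; the custom callable just returns random vectors.

```python
import numpy as np
import dspy

# Hosted model (assumes OpenAI credentials are configured for litellm).
embedder = dspy.Embedder("openai/text-embedding-3-small", batch_size=100)
vec = embedder("hello")              # single string -> 1-D array
mat = embedder(["hello", "world"])   # list of strings -> 2-D array, one row per input

# Custom embedding function: any callable mapping a list of texts to vectors works.
def my_embedder(texts):
    return np.random.rand(len(texts), 10)

embedder = dspy.Embedder(my_embedder)
assert embedder(["hello", "world"], batch_size=1).shape == (2, 10)
```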
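
The new `Embeddings` retriever added in `dspy/retrievers/embeddings.py` can be exercised roughly as follows. The corpus and query are placeholders; with a corpus this far below `brute_force_threshold`, the brute-force path is used, so FAISS is not required.

```python
import dspy
from dspy.retrievers import Embeddings

corpus = [
    "DSPy optimizes the prompts and weights of modular LM programs.",
    "FAISS provides efficient similarity search over dense vectors.",
    "ColBERT is a late-interaction neural retrieval model.",
]

embedder = dspy.Embedder("openai/text-embedding-3-small")
search = Embeddings(corpus=corpus, embedder=embedder, k=2)

result = search("How does DSPy tune a program?")
print(result.passages)  # the two most similar corpus strings, best match first
```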
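
`dspy/utils/unbatchify.py` is the utility that lets the retriever above accept one query at a time while batching work behind the scenes. A minimal sketch with a toy batch function (the function and sizes are purely illustrative):

```python
from dspy.utils.unbatchify import Unbatchify

def embed_batch(texts):
    # Stand-in for an expensive batched call, e.g. a model forward pass.
    return [len(t) for t in texts]

with Unbatchify(embed_batch, max_batch_size=8, max_wait_time=0.05) as embed_one:
    print(embed_one("hello"))  # 5: the single input is routed through the background batching worker
```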
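
Finally, the new decompositional mode of `SemanticF1` and the `CompleteAndGrounded` metric are called like any other DSPy metric. The sketch below assumes an LM has been configured (the model name is a placeholder) and uses a toy example/prediction pair; `CompleteAndGrounded` additionally expects a `context` field on the prediction.

```python
import dspy
from dspy.evaluate.auto_evaluation import SemanticF1, CompleteAndGrounded

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))  # placeholder model name

example = dspy.Example(question="Who wrote Hamlet?",
                       response="William Shakespeare wrote Hamlet.")
pred = dspy.Prediction(response="Hamlet was written by Shakespeare.",
                       context="Hamlet is a tragedy written by William Shakespeare around 1600.")

metric = SemanticF1(decompositional=True)
print(metric(example, pred))    # harmonic mean of semantic recall and precision

grounded = CompleteAndGrounded()
print(grounded(example, pred))  # F1 of completeness and groundedness
```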