diff --git a/docs/evaluation/tutorials/rag.mdx b/docs/evaluation/tutorials/rag.mdx
index 7dec0ed5..123d6b23 100644
--- a/docs/evaluation/tutorials/rag.mdx
+++ b/docs/evaluation/tutorials/rag.mdx
@@ -2,482 +2,1422 @@
sidebar_position: 4
---
-# RAG Evaluations
-
-We will walk through the evaluation workflow for RAG (retrieval augmented generation).
-
-## Overview
-
-We will discuss each piece of the workflow below.
-
-![rag_overview.png](./static/rag_overview.png)
-
-## Dataset
-
-Here is a dataset of [LCEL (LangChain Expression Language)](https://python.langchain.com/v0.1/docs/expression_language/) related questions that we will use.
-
-This dataset was created using csv upload in the LangSmith UI:
-
-https://smith.langchain.com/public/730d833b-74da-43e2-a614-4e2ca2502606/d
-
-Here, we ensure that API keys for OpenAI as well as LangSmith are set.
-
-```python
-import getpass
-import os
-
-def _set_env(var: str):
- if not os.environ.get(var):
- os.environ[var] = getpass.getpass(f"{var}: ")
-
-_set_env("OPENAI_API_KEY")
-os.environ["LANGCHAIN_TRACING_V2"] = "true"
-os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" # Update appropriately for self-hosted installations or the EU region
-_set_env("LANGCHAIN_API_KEY")
-```
-
-```python
-### Dataset name
-
-# Clone dataset
-client = Client()
-dataset = client.clone_public_dataset(
- "https://smith.langchain.com/public/730d833b-74da-43e2-a614-4e2ca2502606/d"
-)
-
-dataset_name = "LCEL-QA"
-```
-
-## Task
-
-Here is a chain that will perform RAG on [LCEL (LangChain Expression Language)](https://python.langchain.com/v0.1/docs/expression_language/) docs.
-
-We will be using LangChain strictly for creating the retriever and retrieving the relevant documents.
-
-The overall pipeline does not use LangChain; LangSmith works regardless of whether or not your pipeline is built with LangChain.
+import {
+ CodeTabs,
+ python,
+ typescript,
+} from "@site/src/components/InstructionsWithCode";
-Here, we return the retrieved documents as part of the final answer.
-
-However, below we will show that this is not required (using evaluation of intermediate steps).
-
-See our [RAG-From-Scratch](https://github.com/langchain-ai/rag-from-scratch) repo and tutorial video series for more on this.
-
-```python
-### INDEX
-
-from bs4 import BeautifulSoup as Soup
-from langchain_community.vectorstores import Chroma
-from langchain_openai import OpenAIEmbeddings
-from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-
-# Load docs
-url = "https://python.langchain.com/v0.1/docs/expression_language/"
-loader = RecursiveUrlLoader(
- url=url, max_depth=20, extractor=lambda x: Soup(x, "html.parser").text
-)
-docs = loader.load()
-
-# Split into chunks
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=4500, chunk_overlap=200)
-splits = text_splitter.split_documents(docs)
-
-# Embed and store in Chroma
-vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
-
-# Index
-retriever = vectorstore.as_retriever()
-```
+# RAG Evaluations
-```python
-### RAG bot
+:::info Key concepts
+[RAG evaluation](/evaluation/concepts#retrieval-augmented-generation-rag) | [Evaluators](/evaluation/concepts#evaluators) | [LLM-as-judge evaluators](/evaluation/concepts#llm-as-judge)
+:::
-import openai
-from langsmith import traceable
-from langsmith.wrappers import wrap_openai
+Retrieval Augmented Generation (RAG) is a technique that enhances Large Language Models (LLMs) by providing them with relevant external knowledge. It has become one of the most widely used approaches for building LLM applications.
-class RagBot:
+This tutorial will show you how to evaluate your RAG applications using LangSmith. You'll learn:
- def __init__(self, retriever, model: str = "gpt-4-0125-preview"):
- self._retriever = retriever
- # Wrapping the client instruments the LLM
- self._client = wrap_openai(openai.Client())
- self._model = model
+1. How to create test datasets
+2. How to run your RAG application on those datasets
+3. How to measure your application's performance using different evaluation metrics
- @traceable()
- def retrieve_docs(self, question):
- return self._retriever.invoke(question)
+## Overview
- @traceable()
- def invoke_llm(self, question, docs):
- response = self._client.chat.completions.create(
- model=self._model,
- messages=[
- {
- "role": "system",
- "content": "You are a helpful AI code assistant with expertise in LCEL."
- " Use the following docs to produce a concise code solution to the user question.\n\n"
- f"## Docs\n\n{docs}",
- },
- {"role": "user", "content": question},
- ],
+A typical RAG evaluation workflow consists of three main steps:
+
+1. Creating a dataset with questions and their expected answers
+2. Running your RAG application on those questions
+3. Using evaluators to measure how well your application performed, looking at factors like:
+ - Answer relevance
+ - Answer accuracy
+ - Retrieval quality
+
+For this tutorial, we'll create and evaluate a bot that answers questions about a few of [Lilian Weng's](https://lilianweng.github.io/) insightful blog posts.
+
+## Setup
+
+### Environment
+
+First, let's set our environment variables:
+
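+For example, from your shell (the variable names below assume the standard LangSmith and OpenAI setup; adjust them if your configuration differs):
+
+```bash
+export LANGSMITH_TRACING="true"
+export LANGSMITH_API_KEY="<your-langsmith-api-key>"
+export OPENAI_API_KEY="<your-openai-api-key>"
+```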
+
+
+And install the dependencies we'll need:
+
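+One reasonable set of packages for the code in this tutorial is shown below; add any loader-specific dependencies you need for your own data sources:
+
+```bash
+# Python
+pip install -U langsmith langchain langchain-openai langchain-community langchain-text-splitters beautifulsoup4
+
+# TypeScript
+npm install langsmith langchain @langchain/openai @langchain/community @langchain/core @langchain/textsplitters zod
+```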
+
+
+### Application
+
+:::info Framework Flexibility
+While this tutorial uses LangChain, the evaluation techniques and LangSmith functionality demonstrated here work with any framework. Feel free to use your preferred tools and libraries.
+:::
+
+In this section, we'll build a basic Retrieval-Augmented Generation (RAG) application.
+
+We'll stick to a simple implementation that does three things:
+
+- Indexing: chunks and indexes a few of Lilian Weng's blog posts in a vector store
+- Retrieval: retrieves those chunks based on the user's question
+- Generation: passes the question and the retrieved docs to an LLM to generate an answer
+
+#### Indexing and retrieval
+
+First, let's load the blog posts we want to build a chatbot for and index them.
+
+
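+Below is a minimal indexing setup. The document loader, chunk sizes, and embedding model shown here are reasonable defaults rather than requirements; swap in whichever components you prefer.
+
+<CodeTabs
+  tabs={[
+    python`
+      from langchain_community.document_loaders import WebBaseLoader
+      from langchain_core.vectorstores import InMemoryVectorStore
+      from langchain_openai import OpenAIEmbeddings
+      from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+      # List of URLs to load documents from
+      urls = [
+          "https://lilianweng.github.io/posts/2023-06-23-agent/",
+          "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+          "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+      ]
+
+      # Load the blog posts (any document loader works here)
+      docs = [WebBaseLoader(url).load() for url in urls]
+      docs_list = [item for sublist in docs for item in sublist]
+
+      # Split the posts into smaller chunks
+      text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+      doc_splits = text_splitter.split_documents(docs_list)
+
+      # Embed the chunks and index them in an in-memory vector store
+      vectorstore = InMemoryVectorStore.from_documents(
+          documents=doc_splits,
+          embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
+      )
+      retriever = vectorstore.as_retriever()
+    `,
+    typescript`
+      import { OpenAIEmbeddings } from "@langchain/openai";
+      import { MemoryVectorStore } from "langchain/vectorstores/memory";
+      import { BrowserbaseLoader } from "@langchain/community/document_loaders/web/browserbase";
+      import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
+
+      // List of URLs to load documents from
+      const urls = [
+        "https://lilianweng.github.io/posts/2023-06-23-agent/",
+        "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+        "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+      ];
+
+      const loader = new BrowserbaseLoader(urls, {
+        textContent: true,
+      });
+      const docs = await loader.load();
+
+      const splitter = new RecursiveCharacterTextSplitter({
+        chunkSize: 1000,
+        chunkOverlap: 200,
+      });
+      const allSplits = await splitter.splitDocuments(docs);
+
+      const embeddings = new OpenAIEmbeddings({
+        model: "text-embedding-3-large",
+      });
+
+      const vectorStore = new MemoryVectorStore(embeddings);
+
+      // Index chunks
+      await vectorStore.addDocuments(allSplits);
+    `,
+  ]}
+/>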
+
+
+
+#### Generation
+
+We can now define the generative pipeline.
+
+<CodeTabs
+  tabs={[
+    python`
+      from langchain_openai import ChatOpenAI
+      from langsmith import traceable
+
+      # NOTE: gpt-4o here mirrors the TypeScript tab; any chat model works
+      llm = ChatOpenAI(model="gpt-4o", temperature=1)
+
+      # Add decorator so this function is traced in LangSmith
+      @traceable()
+      def rag_bot(question: str) -> dict:
+ # LangChain retriever will be automatically traced
+ docs = retriever.invoke(question)
+ docs_string = "\n\n".join(doc.page_content for doc in docs)
+
+ instructions = f"""You are a helpful assistant who is good at analyzing source information and answering questions. \
+ Use the following source documents to answer the user's questions. \
+ If you don't know the answer, just say that you don't know. \
+ Use three sentences maximum and keep the answer concise.
+
+ Documents:
+ {docs_string}"""
+
+ # langchain ChatModel will be automatically traced
+ ai_msg = llm.invoke([
+ {"role": "system", "content": instructions},
+ {"role": "user", "content": question},
+ ],
+ )
+
+ return {"answer": ai_msg.content, "documents": docs}
+ `,
+ typescript`
+ import { ChatOpenAI } from "@langchain/openai";
+ import { traceable } from "langsmith/traceable";
+
+ const llm = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 1,
+ })
+
+ // Add decorator so this function is traced in LangSmith
+ const ragBot = traceable(
+ async (question: string) => {
+ // LangChain retriever will be automatically traced
+ const retrievedDocs = await vectorStore.similaritySearch(question);
+ const docsContent = retrievedDocs.map((doc) => doc.pageContent).join("\n");
+
+ const instructions = \`You are a helpful assistant who is good at analyzing source information and answering questions
+ Use the following source documents to answer the user's questions.
+ If you don't know the answer, just say that you don't know.
+ Use three sentences maximum and keep the answer concise.
+ Documents:
+ \${docsContent}\`;
+
+ const aiMsg = await llm.invoke([
+ {
+ role: "system",
+ content: instructions
+ },
+ {
+ role: "user",
+ content: question
+ }
+ ])
+
+ return {"answer": aiMsg.content, "documents": retrievedDocs}
+ }
+ )
+
+ `
+ ]}
+/>
-```python
-def predict_rag_answer(example: dict):
- """Use this for answer evaluation"""
- response = rag_bot.get_answer(example["input_question"])
- return {"answer": response["answer"]}
+## Dataset
-def predict_rag_answer_with_context(example: dict):
- """Use this for evaluation of retrieved documents and hallucinations"""
- response = rag_bot.get_answer(example["input_question"])
- return {"answer": response["answer"], "contexts": response["contexts"]}
-```
+Now that we've got our application, let's build a dataset to evaluate it. Our dataset will be very simple in this case: we'll have example questions and reference answers.
+
+<CodeTabs
+  tabs={[
+    python`
+      from langsmith import Client
+
+      client = Client()
+
+      # Define the examples for the dataset
+      examples = [
+          (
+              "How does the ReAct agent use self-reflection? ",
+              "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.",
+          ),
+          (
+              "What are the types of biases that can arise with few-shot prompting?",
+              "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.",
+          ),
+          (
+              "What are five types of adversarial attacks?",
+              "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.",
+          ),
+      ]
+
+      # Create the dataset and examples in LangSmith
+      dataset_name = "Lilian Weng Blogs Q&A"
+      dataset = client.create_dataset(dataset_name=dataset_name)
+      client.create_examples(
+          inputs=[{"question": q} for q, _ in examples],
+          outputs=[{"answer": a} for _, a in examples],
+          dataset_id=dataset.id,
+      )
+    `,
+    typescript`
+      import { Client } from "langsmith";
+
+      const client = new Client();
+
+      // Define the examples for the dataset
+      const examples = [
+        [
+          "How does the ReAct agent use self-reflection? ",
+          "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.",
+        ],
+        [
+          "What are the types of biases that can arise with few-shot prompting?",
+          "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.",
+        ],
+        [
+          "What are five types of adversarial attacks?",
+          "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.",
+        ],
+      ];
+
+      const [inputs, outputs] = examples.reduce<
+        [Array<{ question: string }>, Array<{ answer: string }>]
+      >(
+        ([inputs, outputs], item) => [
+          [...inputs, { question: item[0] }],
+          [...outputs, { answer: item[1] }],
+        ],
+ ],
+ [[], []]
+ );
+
+ const datasetName = "Lilian Weng Blogs Q&A";
+ const dataset = await client.createDataset(datasetName);
+ await client.createExamples({ inputs, outputs, datasetId: dataset.id })
+ `,
+ ]}
+/>
-## Evaluator
+## Evaluators
-There are at least 4 types of RAG eval that users are typically interested in.
+One way to think about the different types of RAG evaluators is as a pair of what is being evaluated and what it's being evaluated against:
-1. **Response vs reference answer**
+1. **Correctness**: Response vs reference answer
- `Goal`: Measure "_how similar/correct is the RAG chain answer, relative to a ground-truth answer_"
-- `Mode`: Uses ground truth (reference) answer supplied through a dataset
-- `Judge`: Use LLM-as-judge to assess answer correctness.
+- `Mode`: Requires a ground truth (reference) answer supplied through a dataset
+- `Evaluator`: Use LLM-as-judge to assess answer correctness.
-2. **Response vs input**
+2. **Relevance**: Response vs input
- `Goal`: Measure "_how well does the generated response address the initial user input_"
-- `Mode`: Reference-free, because it will compare the answer to the input question
-- `Judge`: Use LLM-as-judge to assess answer relevance, helpfulness, etc.
+- `Mode`: Does not require reference answer, because it will compare the answer to the input question
+- `Evaluator`: Use LLM-as-judge to assess answer relevance, helpfulness, etc.
-3. **Response vs retrieved docs**
+3. **Groundedness**: Response vs retrieved docs
- `Goal`: Measure "_to what extent does the generated response agree with the retrieved context_"
-- `Mode`: Reference-free, because it will compare the answer to the retrieved context
-- `Judge`: Use LLM-as-judge to assess faithfulness, hallucinations, etc.
+- `Mode`: Does not require reference answer, because it will compare the answer to the retrieved context
+- `Evaluator`: Use LLM-as-judge to assess faithfulness, hallucinations, etc.
-4. **Retrieved docs vs input**
+4. **Retrieval relevance**: Retrieved docs vs input
-- `Goal`: Measure "_how good are my retrieved results for this query_"
-- `Mode`: Reference-free, because it will compare the question to the retrieved context
-- `Judge`: Use LLM-as-judge to assess relevance
+- `Goal`: Measure "_how relevant are my retrieved results for this query_"
+- `Mode`: Does not require reference answer, because it will compare the question to the retrieved context
+- `Evaluator`: Use LLM-as-judge to assess relevance
![](./static/rag_eval_overview.png)
-### **Response vs reference answer**
-
-Here is an example prompt that we can use:
-
-https://smith.langchain.com/hub/langchain-ai/rag-answer-vs-reference
-
-Here is the a video from our LangSmith evaluation series for reference:
-
-https://youtu.be/lTfhw_9cJqc?feature=shared
-
-Here is our evaluator function:
-
-- `run` is the invocation of `predict_rag_answer`, which has key `answer`
-- `example` is from our eval set, which has keys `input_question` and `output_answer`
-- We extract these values and pass them into our grader
-
-```python
-from langchain import hub
-from langchain_openai import ChatOpenAI
-
-# Grade prompt
-grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference")
-
-def answer_evaluator(run, example) -> dict:
- """
- A simple evaluator for RAG answer accuracy
- """
-
- # Get question, ground truth answer, RAG chain answer
- input_question = example.inputs["input_question"]
- reference = example.outputs["output_answer"]
- prediction = run.outputs["answer"]
-
- # LLM grader
- llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
-
- # Structured prompt
- answer_grader = grade_prompt_answer_accuracy | llm
-
- # Run evaluator
- score = answer_grader.invoke({"question": input_question,
- "correct_answer": reference,
- "student_answer": prediction})
- score = score["Score"]
-
- return {"key": "answer_v_reference_score", "score": score}
-```
-
-Now, we kick off evaluation:
-
-- `predict_rag_answer`: Takes an `example` from our eval set, extracts the question, passes to our RAG chain
-- `answer_evaluator`: Passes RAG chain answer, question, and ground truth answer to an evaluator
-
-```python
-from langsmith import evaluate
-
-experiment_results = evaluate(
- predict_rag_answer,
- data=dataset_name,
- evaluators=[answer_evaluator],
- experiment_prefix="rag-answer-v-reference",
- metadata={"version": "LCEL context, gpt-4-0125-preview"},
-)
-```
-
-### **Response vs input**
-
-Here is an example prompt that we can use:
-
-https://smith.langchain.com/hub/langchain-ai/rag-answer-helpfulness
-
-The information flow is similar to above, but we simply look at the `run` answer versus the `example` question.
-
-```python
-# Grade prompt
-grade_prompt_answer_helpfulness = prompt = hub.pull("langchain-ai/rag-answer-helpfulness")
-
-def answer_helpfulness_evaluator(run, example) -> dict:
- """
- A simple evaluator for RAG answer helpfulness
- """
-
- # Get question, ground truth answer, RAG chain answer
- input_question = example.inputs["input_question"]
- prediction = run.outputs["answer"]
-
- # LLM grader
- llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
-
- # Structured prompt
- answer_grader = grade_prompt_answer_helpfulness | llm
-
- # Run evaluator
- score = answer_grader.invoke({"question": input_question,
- "student_answer": prediction})
- score = score["Score"]
-
- return {"key": "answer_helpfulness_score", "score": score}
-```
-
-```python
-experiment_results = evaluate(
- predict_rag_answer,
- data=dataset_name,
- evaluators=[answer_helpfulness_evaluator],
- experiment_prefix="rag-answer-helpfulness",
- metadata={"version": "LCEL context, gpt-4-0125-preview"},
-)
-```
-
-### **Response vs retrieved docs**
+### Correctness: Response vs reference answer
+
+<CodeTabs
+  tabs={[
+    python`
+      from typing_extensions import Annotated, TypedDict
+      from langchain_openai import ChatOpenAI
+
+      # Grade output schema
+      class CorrectnessGrade(TypedDict):
+          # Note that the order in which the fields are defined is the order in which the model will generate them.
+          # It is useful to put explanations before responses because it forces the model to think through
+          # its final response before generating it:
+          explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+          correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]
+
+      # Grade prompt
+      correctness_instructions = """You are a teacher grading a quiz.
+
+      You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.
+
+      Here is the grade criteria to follow:
+      (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
+      (2) Ensure that the student answer does not contain any conflicting statements.
+      (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer.
+
+      Correctness:
+      A correctness value of True means that the student's answer meets all of the criteria.
+      A correctness value of False means that the student's answer does not meet all of the criteria.
+
+      Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+      Avoid simply stating the correct answer at the outset."""
+
+      # Grader LLM
+      grader_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+          CorrectnessGrade, method="json_schema", strict=True
+      )
+
+      def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
+ """An evaluator for RAG answer accuracy"""
+ answers = f"""\
+ QUESTION: {inputs['question']}
+ GROUND TRUTH ANSWER: {reference_outputs['answer']}
+ STUDENT ANSWER: {outputs['answer']}"""
+
+ # Run evaluator
+ grade = grader_llm.invoke([{"role": "system", "content": correctness_instructions}, {"role": "user", "content": answers}])
+ return grade["correct"]
+ `,
+ typescript`
+ import type { EvaluationResult } from "langsmith/evaluation";
+ import { z } from "zod";
+
+ // Grade prompt
+ const correctnessInstructions = \`You are a teacher grading a quiz.
+
+ You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
+ (2) Ensure that the student answer does not contain any conflicting statements.
+ (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer.
+
+ Correctness:
+ A correctness value of True means that the student's answer meets all of the criteria.
+ A correctness value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const graderLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ correct: z
+ .boolean()
+ .describe("True if the answer is correct, False otherwise.")
+ })
+ .describe("Correctness score for reference answer v.s. generated answer.")
+ );
+
+ async function correctness({
+ inputs,
+ outputs,
+ referenceOutputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+      referenceOutputs?: Record<string, any>;
+    }): Promise<boolean> {
+ const answer = \`QUESTION: \${inputs.question}
+      GROUND TRUTH ANSWER: \${referenceOutputs?.answer}
+ STUDENT ANSWER: \${outputs.answer}\`
+
+ // Run evaluator
+      const grade = await graderLLM.invoke([{role: "system", content: correctnessInstructions}, {role: "user", content: answer}]);
+      return grade.correct;
+ };
+ `,
+ ]}
+/>
+
+### Relevance: Response vs input
+
+The flow is similar to the one above, but now we simply look at the `inputs` and `outputs` without needing the `reference_outputs`.
+Without a reference answer we can't grade accuracy, but we can still grade relevance: did the model's response actually address the user's question?
+
+<CodeTabs
+  tabs={[
+    python`
+      # Grade output schema
+      class RelevanceGrade(TypedDict):
+          explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+          relevant: Annotated[bool, ..., "Provide the score on whether the answer addresses the question"]
+
+      # Grade prompt
+      relevance_instructions = """You are a teacher grading a quiz.
+
+      You will be given a QUESTION and a STUDENT ANSWER.
+
+      Here is the grade criteria to follow:
+      (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
+      (2) Ensure the STUDENT ANSWER helps to answer the QUESTION
+
+      Relevance:
+      A relevance value of True means that the student's answer meets all of the criteria.
+      A relevance value of False means that the student's answer does not meet all of the criteria.
+
+      Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+      Avoid simply stating the correct answer at the outset."""
+
+      # Grader LLM
+      relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+          RelevanceGrade, method="json_schema", strict=True
+      )
+
+      # Evaluator
+      def relevance(inputs: dict, outputs: dict) -> bool:
+ """A simple evaluator for RAG answer helpfulness."""
+ answer = f"""\
+ QUESTION: {inputs['question']}
+ STUDENT ANSWER: {outputs['answer']}"""
+ grade = relevance_llm.invoke([{"role": "system", "content": relevance_instructions}, {"role": "user", "content": answer}])
+ return grade["relevant"]
+ `,
+ typescript`
+ import type { EvaluationResult } from "langsmith/evaluation";
+ import { z } from "zod";
+
+ // Grade prompt
+ const relevanceInstructions = \`You are a teacher grading a quiz.
+
+ You will be given a QUESTION and a STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
+ (2) Ensure the STUDENT ANSWER helps to answer the QUESTION
+
+ Relevance:
+ A relevance value of True means that the student's answer meets all of the criteria.
+ A relevance value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const relevanceLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ relevant: z
+ .boolean()
+ .describe("Provide the score on whether the answer addresses the question")
+ })
+        .describe("Relevance score for the generated answer v.s. the question.")
+ );
+
+ async function relevance({
+ inputs,
+ outputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+    }): Promise<boolean> {
+ const answer = \`QUESTION: \${inputs.question}
+ STUDENT ANSWER: \${outputs.answer}\`
+
+ // Run evaluator
+      const grade = await relevanceLLM.invoke([{role: "system", content: relevanceInstructions}, {role: "user", content: answer}]);
+      return grade.relevant;
+ };
+ `,
+ ]}
+/>
+
+### Groundedness: Response vs retrieved docs
+
+Another useful way to evaluate responses without needing reference answers is to check if the response is justified by (or "grounded in") the retrieved documents.
+
+<CodeTabs
+  tabs={[
+    python`
+      # Grade output schema
+      class GroundedGrade(TypedDict):
+          explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+          grounded: Annotated[bool, ..., "Provide the score on if the answer hallucinates from the documents"]
+
+      # Grade prompt
+      grounded_instructions = """You are a teacher grading a quiz.
+
+      You will be given FACTS and a STUDENT ANSWER.
+
+      Here is the grade criteria to follow:
+      (1) Ensure the STUDENT ANSWER is grounded in the FACTS.
+      (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.
+
+      Grounded:
+      A grounded value of True means that the student's answer meets all of the criteria.
+      A grounded value of False means that the student's answer does not meet all of the criteria.
+
+      Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+      Avoid simply stating the correct answer at the outset."""
+
+      # Grader LLM
+      grounded_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+          GroundedGrade, method="json_schema", strict=True
+      )
+
+      # Evaluator
+      def groundedness(inputs: dict, outputs: dict) -> bool:
+ """A simple evaluator for RAG answer groundedness."""
+ doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"])
+ answer = f"""\
+ FACTS: {doc_string}
+ STUDENT ANSWER: {outputs['answer']}"""
+ grade = grounded_llm.invoke([{"role": "system", "content": grounded_instructions}, {"role": "user", "content": answer}])
+ return grade["grounded"]
+ `,
+ typescript`
+ import type { EvaluationResult } from "langsmith/evaluation";
+ import { z } from "zod";
+
+ // Grade prompt
+ const groundedInstructions = \`You are a teacher grading a quiz.
+
+ You will be given FACTS and a STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Ensure the STUDENT ANSWER is grounded in the FACTS.
+ (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.
+
+ Grounded:
+ A grounded value of True means that the student's answer meets all of the criteria.
+ A grounded value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const groundedLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ grounded: z
+ .boolean()
+ .describe("Provide the score on if the answer hallucinates from the documents")
+ })
+ .describe("Grounded score for the answer from the retrieved documents.")
+ );
+
+ async function grounded({
+ inputs,
+ outputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+    }): Promise<boolean> {
+ const docString = outputs.documents.map((doc) => doc.pageContent).join("\n");
+ const answer = \`FACTS: \${docString}
+ STUDENT ANSWER: \${outputs.answer}\`
+
+ // Run evaluator
+      const grade = await groundedLLM.invoke([{role: "system", content: groundedInstructions}, {role: "user", content: answer}]);
+      return grade.grounded;
+ };
+ `,
+ ]}
+/>
+
+### Retrieval relevance: Retrieved docs vs input
+
+<CodeTabs
+  tabs={[
+    python`
+      # Grade output schema
+      class RetrievalRelevanceGrade(TypedDict):
+          explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+          relevant: Annotated[bool, ..., "True if the retrieved documents are relevant to the question, False otherwise"]
+
+      # Grade prompt
+      retrieval_relevance_instructions = """You are a teacher grading a quiz.
+
+      You will be given a QUESTION and a set of FACTS provided by the student.
+
+      Here is the grade criteria to follow:
+      (1) Your goal is to identify FACTS that are completely unrelated to the QUESTION
+      (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
+      (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met
+
+      Relevance:
+      A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
+      A relevance value of False means that the FACTS are completely unrelated to the QUESTION.
+
+      Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+      Avoid simply stating the correct answer at the outset."""
+
+      # Grader LLM
+      retrieval_relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+          RetrievalRelevanceGrade, method="json_schema", strict=True
+      )
+
+      def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
+ """An evaluator for document relevance"""
+ doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"])
+ answer = f"""\
+ FACTS: {doc_string}
+ QUESTION: {inputs['question']}"""
+
+ # Run evaluator
+ grade = retrieval_relevance_llm.invoke([{"role": "system", "content": retrieval_relevance_instructions}, {"role": "user", "content": answer}])
+ return grade["relevant"]
+ `,
+ typescript`
+ import type { EvaluationResult } from "langsmith/evaluation";
+ import { z } from "zod";
+
+ // Grade prompt
+ const retrievalRelevanceInstructions = \`You are a teacher grading a quiz.
+
+ You will be given a QUESTION and a set of FACTS provided by the student.
+
+ Here is the grade criteria to follow:
+      (1) Your goal is to identify FACTS that are completely unrelated to the QUESTION
+ (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
+ (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met
+
+ Relevance:
+ A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
+ A relevance value of False means that the FACTS are completely unrelated to the QUESTION.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const retrievalRelevanceLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ relevant: z
+ .boolean()
+ .describe("True if the retrieved documents are relevant to the question, False otherwise")
+ })
+ .describe("Retrieval relevance score for the retrieved documents v.s. the question.")
+ );
+
+ async function retrievalRelevance({
+ inputs,
+ outputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+    }): Promise<boolean> {
+ const docString = outputs.documents.map((doc) => doc.pageContent).join("\n");
+ const answer = \`FACTS: \${docString}
+ QUESTION: \${inputs.question}\`
+
+ // Run evaluator
+      const grade = await retrievalRelevanceLLM.invoke([{role: "system", content: retrievalRelevanceInstructions}, {role: "user", content: answer}]);
+      return grade.relevant;
+ };
+ `,
+ ]}
+/>
+
+## Run evaluation
+
+We can now kick off our evaluation job with all of our different evaluators.
+
+<CodeTabs
+  tabs={[
+    python`
+      def target(inputs: dict) -> dict:
+ return rag_bot(inputs["question"])
+
+ experiment_results = client.evaluate(
+ target,
+ data=dataset_name,
+ evaluators=[correctness, groundedness, relevance, retrieval_relevance],
+ experiment_prefix="rag-doc-relevance",
+        metadata={"version": "Lilian Weng blogs, gpt-4o"},
+ )
+ # Explore results locally as a dataframe if you have pandas installed
+ # experiment_results.to_pandas()
+ `,
+ typescript`
+ import { evaluate } from "langsmith/evaluation";
+
+      const targetFunc = (inputs: Record<string, any>) => {
+        return ragBot(inputs.question);
+ };
+
+ const experimentResults = await evaluate(targetFunc, {
+ data: datasetName,
+        evaluators: [correctness, grounded, relevance, retrievalRelevance],
+        experimentPrefix: "rag-doc-relevance",
+        metadata: { version: "Lilian Weng blogs, gpt-4o" },
+ });
+ `,
+ ]}
+/>
+
+You can see an example of what these results look like here: [LangSmith link](https://smith.langchain.com/public/302573e2-20bf-4f8c-bdad-e97c20f33f1b/d)
+
+## Reference code
+
+
+
+Here's a consolidated script with all the above code:
+
+<CodeTabs
+  tabs={[
+    python`
+      from langchain_community.document_loaders import WebBaseLoader
+      from langchain_core.vectorstores import InMemoryVectorStore
+      from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+      from langchain_text_splitters import RecursiveCharacterTextSplitter
+      from langsmith import Client, traceable
+      from typing_extensions import Annotated, TypedDict
+
+      # List of URLs to load documents from
+      urls = [
+          "https://lilianweng.github.io/posts/2023-06-23-agent/",
+          "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+          "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+      ]
+
+      # Load the blog posts (the loader, splitter, and embedding choices below are
+      # reasonable defaults; swap in your preferred components)
+      docs = [WebBaseLoader(url).load() for url in urls]
+      docs_list = [item for sublist in docs for item in sublist]
+
+      # Split the posts into smaller chunks
+      text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+      doc_splits = text_splitter.split_documents(docs_list)
+
+      # Embed and index the chunks in an in-memory vector store
+      vectorstore = InMemoryVectorStore.from_documents(
+          documents=doc_splits,
+          embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
+      )
+      retriever = vectorstore.as_retriever()
+
+      llm = ChatOpenAI(model="gpt-4o", temperature=1)
+
+      # Add decorator so this function is traced in LangSmith
+      @traceable()
+      def rag_bot(question: str) -> dict:
+ # langchain Retriever will be automatically traced
+ docs = retriever.invoke(question)
+
+ docs_string = "\n\n".join(doc.page_content for doc in docs)
+ instructions = f"""You are a helpful assistant who is good at analyzing source information and answering questions. \
+ Use the following source documents to answer the user's questions. \
+ If you don't know the answer, just say that you don't know. \
+ Use three sentences maximum and keep the answer concise.
+
+ Documents:
+ {docs_string}"""
+ # langchain ChatModel will be automatically traced
+ ai_msg = llm.invoke(
+ [
+ {"role": "system", "content": instructions},
+ {"role": "user", "content": question},
+ ],
+ )
+
+ return {"answer": ai_msg.content, "documents": docs}
+
+
+ client = Client()
+
+ # Define the examples for the dataset
+ examples = [
+ (
+ "How does the ReAct agent use self-reflection? ",
+ "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.",
+ ),
+ (
+ "What are the types of biases that can arise with few-shot prompting?",
+ "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.",
+ ),
+ (
+ "What are five types of adversarial attacks?",
+ "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.",
+ ),
+ ]
+
+ # Create the dataset and examples in LangSmith
+ dataset_name = "Lilian Weng Blogs Q&A"
+ if not client.has_dataset(dataset_name=dataset_name):
+ dataset = client.create_dataset(dataset_name=dataset_name)
+ client.create_examples(
+ inputs=[{"question": q} for q, _ in examples],
+ outputs=[{"answer": a} for _, a in examples],
+ dataset_id=dataset.id,
+ )
+
+
+ # Grade output schema
+ class CorrectnessGrade(TypedDict):
+      # Note that the order in which the fields are defined is the order in which the model will generate them.
+ # It is useful to put explanations before responses because it forces the model to think through
+ # its final response before generating it:
+ explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+ correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]
+
+
+ # Grade prompt
+ correctness_instructions = """You are a teacher grading a quiz.
+
+ You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
+ (2) Ensure that the student answer does not contain any conflicting statements.
+ (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer.
+
+ Correctness:
+ A correctness value of True means that the student's answer meets all of the criteria.
+ A correctness value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset."""
+
+ # Grader LLM
+ grader_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+ CorrectnessGrade, method="json_schema", strict=True
+ )
-https://youtu.be/IlNglM9bKLw?feature=shared
-```python
-# Prompt
-grade_prompt_hallucinations = prompt = hub.pull("langchain-ai/rag-answer-hallucination")
+ def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
+ """An evaluator for RAG answer accuracy"""
+ answers = f"""\
+ QUESTION: {inputs['question']}
+ GROUND TRUTH ANSWER: {reference_outputs['answer']}
+ STUDENT ANSWER: {outputs['answer']}"""
-def answer_hallucination_evaluator(run, example) -> dict:
- """
- A simple evaluator for generation hallucination
- """
+ # Run evaluator
+ grade = grader_llm.invoke(
+ [
+ {"role": "system", "content": correctness_instructions},
+ {"role": "user", "content": answers},
+ ]
+ )
+ return grade["correct"]
- # RAG inputs
- input_question = example.inputs["input_question"]
- contexts = run.outputs["contexts"]
- # RAG answer
- prediction = run.outputs["answer"]
+ # Grade output schema
+ class RelevanceGrade(TypedDict):
+ explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+ relevant: Annotated[
+ bool, ..., "Provide the score on whether the answer addresses the question"
+ ]
- # LLM grader
- llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
- # Structured prompt
- answer_grader = grade_prompt_hallucinations | llm
+ # Grade prompt
+ relevance_instructions = """You are a teacher grading a quiz.
- # Get score
- score = answer_grader.invoke({"documents": contexts,
- "student_answer": prediction})
- score = score["Score"]
+ You will be given a QUESTION and a STUDENT ANSWER.
- return {"key": "answer_hallucination", "score": score}
-```
+ Here is the grade criteria to follow:
+ (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
+ (2) Ensure the STUDENT ANSWER helps to answer the QUESTION
-```python
-experiment_results = evaluate(
- predict_rag_answer_with_context,
- data=dataset_name,
- evaluators=[answer_hallucination_evaluator],
- experiment_prefix="rag-answer-hallucination",
- metadata={"version": "LCEL context, gpt-4-0125-preview"},
-)
-```
+ Relevance:
+ A relevance value of True means that the student's answer meets all of the criteria.
+ A relevance value of False means that the student's answer does not meet all of the criteria.
-### **Retrieved docs vs input**
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
-Here is an example prompt that we can use:
+ Avoid simply stating the correct answer at the outset."""
-https://smith.langchain.com/hub/langchain-ai/rag-document-relevance
+ # Grader LLM
+ relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+ RelevanceGrade, method="json_schema", strict=True
+ )
-Here is the a video from our LangSmith evaluation series for reference:
-https://youtu.be/Fr_7HtHjcf0?feature=shared
+ # Evaluator
+ def relevance(inputs: dict, outputs: dict) -> bool:
+ """A simple evaluator for RAG answer helpfulness."""
+ answer = f"""\
+ QUESTION: {inputs['question']}
+ STUDENT ANSWER: {outputs['answer']}"""
+ grade = relevance_llm.invoke(
+ [
+ {"role": "system", "content": relevance_instructions},
+ {"role": "user", "content": answer},
+ ]
+ )
+ return grade["relevant"]
-```python
-# Grade prompt
-grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance")
-def docs_relevance_evaluator(run, example) -> dict:
- """
- A simple evaluator for document relevance
- """
+ # Grade output schema
+ class GroundedGrade(TypedDict):
+ explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+ grounded: Annotated[
+ bool, ..., "Provide the score on if the answer hallucinates from the documents"
+ ]
- # RAG inputs
- input_question = example.inputs["input_question"]
- contexts = run.outputs["contexts"]
- # LLM grader
- llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
+ # Grade prompt
+ grounded_instructions = """You are a teacher grading a quiz.
- # Structured prompt
- answer_grader = grade_prompt_doc_relevance | llm
+ You will be given FACTS and a STUDENT ANSWER.
- # Get score
- score = answer_grader.invoke({"question":input_question,
- "documents":contexts})
- score = score["Score"]
+ Here is the grade criteria to follow:
+ (1) Ensure the STUDENT ANSWER is grounded in the FACTS.
+ (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.
- return {"key": "document_relevance", "score": score}
-```
+ Grounded:
+ A grounded value of True means that the student's answer meets all of the criteria.
+ A grounded value of False means that the student's answer does not meet all of the criteria.
-```python
-experiment_results = evaluate(
- predict_rag_answer_with_context,
- data=dataset_name,
- evaluators=[docs_relevance_evaluator],
- experiment_prefix="rag-doc-relevance",
- metadata={"version": "LCEL context, gpt-4-0125-preview"},
-)
-```
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
-## Evaluating intermediate steps
+ Avoid simply stating the correct answer at the outset."""
-Above, we returned the retrieved documents as part of the final answer.
+ # Grader LLM
+ grounded_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
+ GroundedGrade, method="json_schema", strict=True
+ )
-However, we will show that this is not required.
-We can isolate them as intermediate chain steps.
+ # Evaluator
+ def groundedness(inputs: dict, outputs: dict) -> bool:
+ """A simple evaluator for RAG answer groundedness."""
+ doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"])
+ answer = f"""\
+ FACTS: {doc_string}
+ STUDENT ANSWER: {outputs['answer']}"""
+ grade = grounded_llm.invoke(
+ [
+ {"role": "system", "content": grounded_instructions},
+ {"role": "user", "content": answer},
+ ]
+ )
+ return grade["grounded"]
-See detail on isolating intermediate chain steps [here](../how_to_guides/evaluate_on_intermediate_steps).
-Here is the a video from our LangSmith evaluation series for reference:
+ # Grade output schema
+ class RetrievalRelevanceGrade(TypedDict):
+ explanation: Annotated[str, ..., "Explain your reasoning for the score"]
+ relevant: Annotated[
+ bool,
+ ...,
+ "True if the retrieved documents are relevant to the question, False otherwise",
+ ]
-https://youtu.be/yx3JMAaNggQ?feature=shared
-```python
-from langsmith.schemas import Example, Run
-from langsmith import evaluate
+ # Grade prompt
+ retrieval_relevance_instructions = """You are a teacher grading a quiz.
-def document_relevance_grader(root_run: Run, example: Example) -> dict:
- """
- A simple evaluator that checks to see if retrieved documents are relevant to the question
- """
+ You will be given a QUESTION and a set of FACTS provided by the student.
- # Get specific steps in our RAG pipeline, which are noted with @traceable decorator
- rag_pipeline_run = next(
- run for run in root_run.child_runs if run.name == "get_answer"
- )
- retrieve_run = next(
- run for run in rag_pipeline_run.child_runs if run.name == "retrieve_docs"
- )
- contexts = "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"])
- input_question = example.inputs["input_question"]
+ Here is the grade criteria to follow:
+      (1) Your goal is to identify FACTS that are completely unrelated to the QUESTION
+ (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
+ (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met
- # LLM grader
- llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
+ Relevance:
+ A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
+ A relevance value of False means that the FACTS are completely unrelated to the QUESTION.
- # Structured prompt
- answer_grader = grade_prompt_doc_relevance | llm
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
- # Get score
- score = answer_grader.invoke({"question":input_question,
- "documents":contexts})
- score = score["Score"]
+ Avoid simply stating the correct answer at the outset."""
- return {"key": "document_relevance", "score": score}
+ # Grader LLM
+ retrieval_relevance_llm = ChatOpenAI(
+ model="gpt-4o", temperature=0
+ ).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True)
-def answer_hallucination_grader(root_run: Run, example: Example) -> dict:
- """
- A simple evaluator that checks to see the answer is grounded in the documents
- """
- # RAG input
- rag_pipeline_run = next(
- run for run in root_run.child_runs if run.name == "get_answer"
- )
- retrieve_run = next(
- run for run in rag_pipeline_run.child_runs if run.name == "retrieve_docs"
- )
- contexts = "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"])
+ def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
+ """An evaluator for document relevance"""
+ doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"])
+ answer = f"""\
+ FACTS: {doc_string}
+ QUESTION: {inputs['question']}"""
- # RAG output
- prediction = rag_pipeline_run.outputs["answer"]
+ # Run evaluator
+ grade = retrieval_relevance_llm.invoke(
+ [
+ {"role": "system", "content": retrieval_relevance_instructions},
+ {"role": "user", "content": answer},
+ ]
+ )
+ return grade["relevant"]
- # LLM grader
- llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
- # Structured prompt
- answer_grader = grade_prompt_hallucinations | llm
+ def target(inputs: dict) -> dict:
+ return rag_bot(inputs["question"])
- # Get score
- score = answer_grader.invoke({"documents": contexts,
- "student_answer": prediction})
- score = score["Score"]
- return {"key": "answer_hallucination", "score": score}
+ experiment_results = client.evaluate(
+ target,
+ data=dataset_name,
+ evaluators=[correctness, groundedness, relevance, retrieval_relevance],
+ experiment_prefix="rag-doc-relevance",
+        metadata={"version": "Lilian Weng blogs, gpt-4o"},
+ )
-experiment_results = evaluate(
- predict_rag_answer,
- data=dataset_name,
- evaluators=[document_relevance_grader, answer_hallucination_grader],
- metadata={"version": "LCEL context, gpt-4-0125-preview"},
-)
-```
+ # Explore results locally as a dataframe if you have pandas installed
+ # experiment_results.to_pandas()
+ `,
+ typescript`
+ import { OpenAIEmbeddings, ChatOpenAI } from "@langchain/openai";
+      import { MemoryVectorStore } from "langchain/vectorstores/memory";
+      import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
+ import { BrowserbaseLoader } from "@langchain/community/document_loaders/web/browserbase";
+ import { traceable } from "langsmith/traceable";
+ import { Client } from "langsmith";
+ import { evaluate, type EvaluationResult } from "langsmith/evaluation";
+ import { z } from "zod";
+
+ // List of URLs to load documents from
+ const urls = [
+ "https://lilianweng.github.io/posts/2023-06-23-agent/",
+ "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+ "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+ ]
+ const loader = new BrowserbaseLoader(urls, {
+ textContent: true,
+ });
+ const docs = await loader.load();
+
+ const splitter = new RecursiveCharacterTextSplitter({
+ chunkSize: 1000, chunkOverlap: 200
+ });
+ const allSplits = await splitter.splitDocuments(docs);
+
+ const embeddings = new OpenAIEmbeddings({
+ model: "text-embedding-3-large"
+ });
+
+ const vectorStore = new MemoryVectorStore(embeddings);
+
+ // Index chunks
+ await vectorStore.addDocuments(allSplits)
+
+ const llm = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 1,
+ })
+
+ // Add decorator so this function is traced in LangSmith
+ const ragBot = traceable(
+ async (question: string) => {
+ // LangChain retriever will be automatically traced
+ const retrievedDocs = await vectorStore.similaritySearch(question);
+ const docsContent = retrievedDocs.map((doc) => doc.pageContent).join("\n");
+
+ const instructions = \`You are a helpful assistant who is good at analyzing source information and answering questions.
+ Use the following source documents to answer the user's questions.
+ If you don't know the answer, just say that you don't know.
+ Use three sentences maximum and keep the answer concise.
+
+ Documents:
+ \${docsContent}\`
+
+ const aiMsg = await llm.invoke([
+ {
+ role: "system",
+ content: instructions
+ },
+ {
+ role: "user",
+ content: question
+ }
+ ])
+
+ return {"answer": aiMsg.content, "documents": retrievedDocs}
+ }
+ )
+
+ const client = new Client();
+
+ // Define the examples for the dataset
+ const examples = [
+ [
+ "How does the ReAct agent use self-reflection? ",
+ "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.",
+ ],
+ [
+ "What are the types of biases that can arise with few-shot prompting?",
+ "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.",
+ ],
+ [
+ "What are five types of adversarial attacks?",
+ "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.",
+ ]
+ ]
+
+ const [inputs, outputs] = examples.reduce<
+        [Array<{ question: string }>, Array<{ answer: string }>]
+      >(
+        ([inputs, outputs], item) => [
+          [...inputs, { question: item[0] }],
+          [...outputs, { answer: item[1] }],
+ ],
+ [[], []]
+ );
+
+ const datasetName = "Lilian Weng Blogs Q&A";
+ const dataset = await client.createDataset(datasetName);
+ await client.createExamples({ inputs, outputs, datasetId: dataset.id })
+
+ // Grade prompt
+ const correctnessInstructions = \`You are a teacher grading a quiz.
+
+ You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer.
+ (2) Ensure that the student answer does not contain any conflicting statements.
+ (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer.
+
+ Correctness:
+ A correctness value of True means that the student's answer meets all of the criteria.
+ A correctness value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const graderLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ correct: z
+ .boolean()
+ .describe("True if the answer is correct, False otherwise.")
+ })
+ .describe("Correctness score for reference answer v.s. generated answer.")
+ );
+
+ async function correctness({
+ inputs,
+ outputs,
+ referenceOutputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+      referenceOutputs?: Record<string, any>;
+    }): Promise<boolean> {
+ const answer = \`QUESTION: \${inputs.question}
+      GROUND TRUTH ANSWER: \${referenceOutputs?.answer}
+ STUDENT ANSWER: \${outputs.answer}\`
+
+ // Run evaluator
+      const grade = await graderLLM.invoke([{role: "system", content: correctnessInstructions}, {role: "user", content: answer}]);
+      return grade.correct;
+ };
+
+ // Grade prompt
+ const relevanceInstructions = \`You are a teacher grading a quiz.
+
+ You will be given a QUESTION and a STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
+ (2) Ensure the STUDENT ANSWER helps to answer the QUESTION
+
+ Relevance:
+ A relevance value of True means that the student's answer meets all of the criteria.
+ A relevance value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const relevanceLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ relevant: z
+ .boolean()
+ .describe("Provide the score on whether the answer addresses the question")
+ })
+        .describe("Relevance score for the generated answer v.s. the question.")
+ );
+
+ async function relevance({
+ inputs,
+ outputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+    }): Promise<boolean> {
+ const answer = \`QUESTION: \${inputs.question}
+ STUDENT ANSWER: \${outputs.answer}\`
+
+ // Run evaluator
+      const grade = await relevanceLLM.invoke([{role: "system", content: relevanceInstructions}, {role: "user", content: answer}]);
+      return grade.relevant;
+ };
+
+ // Grade prompt
+ const groundedInstructions = \`You are a teacher grading a quiz.
+
+ You will be given FACTS and a STUDENT ANSWER.
+
+ Here is the grade criteria to follow:
+ (1) Ensure the STUDENT ANSWER is grounded in the FACTS.
+ (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.
+
+ Grounded:
+ A grounded value of True means that the student's answer meets all of the criteria.
+ A grounded value of False means that the student's answer does not meet all of the criteria.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const groundedLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ grounded: z
+ .boolean()
+ .describe("Provide the score on if the answer hallucinates from the documents")
+ })
+ .describe("Grounded score for the answer from the retrieved documents.")
+ );
+
+ async function grounded({
+ inputs,
+ outputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+    }): Promise<boolean> {
+ const docString = outputs.documents.map((doc) => doc.pageContent).join("\n");
+ const answer = \`FACTS: \${docString}
+ STUDENT ANSWER: \${outputs.answer}\`
+
+ // Run evaluator
+      const grade = await groundedLLM.invoke([{role: "system", content: groundedInstructions}, {role: "user", content: answer}]);
+      return grade.grounded;
+ };
+
+ // Grade prompt
+ const retrievalRelevanceInstructions = \`You are a teacher grading a quiz.
+
+ You will be given a QUESTION and a set of FACTS provided by the student.
+
+ Here is the grade criteria to follow:
+      (1) Your goal is to identify FACTS that are completely unrelated to the QUESTION
+ (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
+ (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met
+
+ Relevance:
+ A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
+ A relevance value of False means that the FACTS are completely unrelated to the QUESTION.
+
+ Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct.
+
+ Avoid simply stating the correct answer at the outset.\`
+
+ const retrievalRelevanceLLM = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).withStructuredOutput(
+ z
+ .object({
+ explanation: z
+ .string()
+ .describe("Explain your reasoning for the score"),
+ relevant: z
+ .boolean()
+ .describe("True if the retrieved documents are relevant to the question, False otherwise")
+ })
+ .describe("Retrieval relevance score for the retrieved documents v.s. the question.")
+ );
+
+ async function retrievalRelevance({
+ inputs,
+ outputs,
+ }: {
+      inputs: Record<string, any>;
+      outputs: Record<string, any>;
+    }): Promise<boolean> {
+ const docString = outputs.documents.map((doc) => doc.pageContent).join("\n");
+ const answer = \`FACTS: \${docString}
+ QUESTION: \${inputs.question}\`
+
+ // Run evaluator
+      const grade = await retrievalRelevanceLLM.invoke([{role: "system", content: retrievalRelevanceInstructions}, {role: "user", content: answer}]);
+      return grade.relevant;
+ };
+
+      const targetFunc = (inputs: Record<string, any>) => {
+        return ragBot(inputs.question);
+ };
+
+ const experimentResults = await evaluate(targetFunc, {
+ data: datasetName,
+        evaluators: [correctness, grounded, relevance, retrievalRelevance],
+        experimentPrefix: "rag-doc-relevance",
+        metadata: { version: "Lilian Weng blogs, gpt-4o" },
+ });
+ `,
+ ]}
+/>
+
\ No newline at end of file
diff --git a/docs/evaluation/tutorials/static/rag_eval_overview.png b/docs/evaluation/tutorials/static/rag_eval_overview.png
index 1a2043a7..1452a2a4 100644
Binary files a/docs/evaluation/tutorials/static/rag_eval_overview.png and b/docs/evaluation/tutorials/static/rag_eval_overview.png differ
diff --git a/docs/evaluation/tutorials/static/rag_overview.png b/docs/evaluation/tutorials/static/rag_overview.png
index 52eb40d2..3b8fb7c2 100644
Binary files a/docs/evaluation/tutorials/static/rag_overview.png and b/docs/evaluation/tutorials/static/rag_overview.png differ