diff --git a/docs/evaluation/tutorials/rag.mdx b/docs/evaluation/tutorials/rag.mdx index 7dec0ed5..123d6b23 100644 --- a/docs/evaluation/tutorials/rag.mdx +++ b/docs/evaluation/tutorials/rag.mdx @@ -2,482 +2,1422 @@ sidebar_position: 4 --- -# RAG Evaluations - -We will walk through the evaluation workflow for RAG (retrieval augmented generation). - -## Overview - -We will discuss each piece of the workflow below. - -![rag_overview.png](./static/rag_overview.png) - -## Dataset - -Here is a dataset of [LCEL (LangChain Expression Language)](https://python.langchain.com/v0.1/docs/expression_language/) related questions that we will use. - -This dataset was created using csv upload in the LangSmith UI: - -https://smith.langchain.com/public/730d833b-74da-43e2-a614-4e2ca2502606/d - -Here, we ensure that API keys for OpenAI as well as LangSmith are set. - -```python -import getpass -import os - -def _set_env(var: str): - if not os.environ.get(var): - os.environ[var] = getpass.getpass(f"{var}: ") - -_set_env("OPENAI_API_KEY") -os.environ["LANGCHAIN_TRACING_V2"] = "true" -os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" # Update appropriately for self-hosted installations or the EU region -_set_env("LANGCHAIN_API_KEY") -``` - -```python -### Dataset name - -# Clone dataset -client = Client() -dataset = client.clone_public_dataset( - "https://smith.langchain.com/public/730d833b-74da-43e2-a614-4e2ca2502606/d" -) - -dataset_name = "LCEL-QA" -``` - -## Task - -Here is a chain that will perform RAG on [LCEL (LangChain Expression Language)](https://python.langchain.com/v0.1/docs/expression_language/) docs. - -We will be using LangChain strictly for creating the retriever and retrieving the relevant documents. - -The overall pipeline does not use LangChain; LangSmith works regardless of whether or not your pipeline is built with LangChain. +import { + CodeTabs, + python, + typescript, +} from "@site/src/components/InstructionsWithCode"; -Here, we return the retrieved documents as part of the final answer. - -However, below we will show that this is not required (using evaluation of intermediate steps). - -See our [RAG-From-Scratch](https://github.com/langchain-ai/rag-from-scratch) repo and tutorial video series for more on this. 
- -```python -### INDEX - -from bs4 import BeautifulSoup as Soup -from langchain_community.vectorstores import Chroma -from langchain_openai import OpenAIEmbeddings -from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader -from langchain_text_splitters import RecursiveCharacterTextSplitter - -# Load docs -url = "https://python.langchain.com/v0.1/docs/expression_language/" -loader = RecursiveUrlLoader( - url=url, max_depth=20, extractor=lambda x: Soup(x, "html.parser").text -) -docs = loader.load() - -# Split into chunks -text_splitter = RecursiveCharacterTextSplitter(chunk_size=4500, chunk_overlap=200) -splits = text_splitter.split_documents(docs) - -# Embed and store in Chroma -vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings()) - -# Index -retriever = vectorstore.as_retriever() -``` +# RAG Evaluations -```python -### RAG bot +:::info Key concepts +[RAG evaluation](/evaluation/concepts#retrieval-augmented-generation-rag) | [Evaluators](/evaluation/concepts#evaluators) | [LLM-as-judge evaluators](/evaluation/concepts#llm-as-judge) +::: -import openai -from langsmith import traceable -from langsmith.wrappers import wrap_openai +Retrieval Augmented Generation (RAG) is a technique that enhances Large Language Models (LLMs) by providing them with relevant external knowledge. It has become one of the most widely used approaches for building LLM applications. -class RagBot: +This tutorial will show you how to evaluate your RAG applications using LangSmith. You'll learn: - def __init__(self, retriever, model: str = "gpt-4-0125-preview"): - self._retriever = retriever - # Wrapping the client instruments the LLM - self._client = wrap_openai(openai.Client()) - self._model = model +1. How to create test datasets +2. How to run your RAG application on those datasets +3. How to measure your application's performance using different evaluation metrics - @traceable() - def retrieve_docs(self, question): - return self._retriever.invoke(question) +## Overview - @traceable() - def invoke_llm(self, question, docs): - response = self._client.chat.completions.create( - model=self._model, - messages=[ - { - "role": "system", - "content": "You are a helpful AI code assistant with expertise in LCEL." - " Use the following docs to produce a concise code solution to the user question.\n\n" - f"## Docs\n\n{docs}", - }, - {"role": "user", "content": question}, - ], +A typical RAG evaluation workflow consists of three main steps: + +1. Creating a dataset with questions and their expected answers +2. Running your RAG application on those questions +3. Using evaluators to measure how well your application performed, looking at factors like: + - Answer relevance + - Answer accuracy + - Retrieval quality + +For this tutorial, we'll create and evaluate a bot that answers questions about a few of [Lilian Weng's](https://lilianweng.github.io/) insightful blog posts. + +## Setup + +### Environment + +First, let's set our environment variables: + + + +And install the dependencies we'll need: + + + +### Application + +:::info Framework Flexibility +While this tutorial uses LangChain, the evaluation techniques and LangSmith functionality demonstrated here work with any framework. Feel free to use your preferred tools and libraries. +::: + +In this section, we'll build a basic Retrieval-Augmented Generation (RAG) application. 
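+
+At a high level, the app boils down to a single function that takes a question and returns both the generated answer and the retrieved documents, so that the evaluators we define later can inspect each piece. As a rough sketch of the interface we're building toward (the full implementation follows below; the bodies here are placeholders):
+
+```python
+def rag_bot(question: str) -> dict:
+    docs = ...    # retrieve the chunks most relevant to the question
+    answer = ...  # generate an answer from the question and retrieved docs
+    return {"answer": answer, "documents": docs}
+```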
+ +We'll stick to a simple implementation that: +- Indexing: chunks and indexes a few of Lilian Weng's blogs in a vector store +- Retrieval: retrieves those chunks based on the user question +- Generation: passes the question and retrieved docs to an LLM. + +#### Indexing and retrieval + +First, lets load the blog posts we want to build a chatbot for and index them. + + + + + +#### Generation + +We can now define the generative pipeline. + + dict: + # LangChain retriever will be automatically traced + docs = retriever.invoke(question) + docs_string = "\n\n".join(doc.page_content for doc in docs) + + instructions = f"""You are a helpful assistant who is good at analyzing source information and answering questions. \ + Use the following source documents to answer the user's questions. \ + If you don't know the answer, just say that you don't know. \ + Use three sentences maximum and keep the answer concise. + + Documents: + {docs_string}""" + + # langchain ChatModel will be automatically traced + ai_msg = llm.invoke([ + {"role": "system", "content": instructions}, + {"role": "user", "content": question}, + ], + ) + + return {"answer": ai_msg.content, "documents": docs} + `, + typescript` + import { ChatOpenAI } from "@langchain/openai"; + import { traceable } from "langsmith/traceable"; + + const llm = new ChatOpenAI({ + model: "gpt-4o", + temperature: 1, + }) + + // Add decorator so this function is traced in LangSmith + const ragBot = traceable( + async (question: string) => { + // LangChain retriever will be automatically traced + const retrievedDocs = await vectorStore.similaritySearch(question); + const docsContent = retrievedDocs.map((doc) => doc.pageContent).join("\n"); + + const instructions = \`You are a helpful assistant who is good at analyzing source information and answering questions + Use the following source documents to answer the user's questions. + If you don't know the answer, just say that you don't know. + Use three sentences maximum and keep the answer concise. + Documents: + \${docsContent}\`; + + const aiMsg = await llm.invoke([ + { + role: "system", + content: instructions + }, + { + role: "user", + content: question + } + ]) + + return {"answer": aiMsg.content, "documents": retrievedDocs} + } + ) + + ` + ]} +/> -```python -def predict_rag_answer(example: dict): - """Use this for answer evaluation""" - response = rag_bot.get_answer(example["input_question"]) - return {"answer": response["answer"]} +## Dataset -def predict_rag_answer_with_context(example: dict): - """Use this for evaluation of retrieved documents and hallucinations""" - response = rag_bot.get_answer(example["input_question"]) - return {"answer": response["answer"], "contexts": response["contexts"]} -``` +Now that we've got our application, let's build a dataset to evaluate it. Our dataset will be very simple in this case: we'll have example questions and reference answers. + +, Array<{ outputs: string }>] + >( + ([inputs, outputs], item) => [ + [...inputs, { input: item[0] }], + [...outputs, { outputs: item[1] }], + ], + [[], []] + ); + + const datasetName = "Lilian Weng Blogs Q&A"; + const dataset = await client.createDataset(datasetName); + await client.createExamples({ inputs, outputs, datasetId: dataset.id }) + `, + ]} +/> -## Evaluator +## Evaluators -There are at least 4 types of RAG eval that users are typically interested in. +One way to think about different types of RAG evaluators is as a tuple of what is being evaluated X what its being evaluated against: -1. 
**Response vs reference answer** +1. **Correctness**: Response vs reference answer - `Goal`: Measure "_how similar/correct is the RAG chain answer, relative to a ground-truth answer_" -- `Mode`: Uses ground truth (reference) answer supplied through a dataset -- `Judge`: Use LLM-as-judge to assess answer correctness. +- `Mode`: Requires a ground truth (reference) answer supplied through a dataset +- `Evaluator`: Use LLM-as-judge to assess answer correctness. -2. **Response vs input** +2. **Relevance**: Response vs input - `Goal`: Measure "_how well does the generated response address the initial user input_" -- `Mode`: Reference-free, because it will compare the answer to the input question -- `Judge`: Use LLM-as-judge to assess answer relevance, helpfulness, etc. +- `Mode`: Does not require reference answer, because it will compare the answer to the input question +- `Evaluator`: Use LLM-as-judge to assess answer relevance, helpfulness, etc. -3. **Response vs retrieved docs** +3. **Groundedness**: Response vs retrieved docs - `Goal`: Measure "_to what extent does the generated response agree with the retrieved context_" -- `Mode`: Reference-free, because it will compare the answer to the retrieved context -- `Judge`: Use LLM-as-judge to assess faithfulness, hallucinations, etc. +- `Mode`: Does not require reference answer, because it will compare the answer to the retrieved context +- `Evaluator`: Use LLM-as-judge to assess faithfulness, hallucinations, etc. -4. **Retrieved docs vs input** +4. **Retrieval relevance**: Retrieved docs vs input -- `Goal`: Measure "_how good are my retrieved results for this query_" -- `Mode`: Reference-free, because it will compare the question to the retrieved context -- `Judge`: Use LLM-as-judge to assess relevance +- `Goal`: Measure "_how relevant are my retrieved results for this query_" +- `Mode`: Does not require reference answer, because it will compare the question to the retrieved context +- `Evaluator`: Use LLM-as-judge to assess relevance ![](./static/rag_eval_overview.png) -### **Response vs reference answer** - -Here is an example prompt that we can use: - -https://smith.langchain.com/hub/langchain-ai/rag-answer-vs-reference - -Here is the a video from our LangSmith evaluation series for reference: - -https://youtu.be/lTfhw_9cJqc?feature=shared - -Here is our evaluator function: - -- `run` is the invocation of `predict_rag_answer`, which has key `answer` -- `example` is from our eval set, which has keys `input_question` and `output_answer` -- We extract these values and pass them into our grader - -```python -from langchain import hub -from langchain_openai import ChatOpenAI - -# Grade prompt -grade_prompt_answer_accuracy = prompt = hub.pull("langchain-ai/rag-answer-vs-reference") - -def answer_evaluator(run, example) -> dict: - """ - A simple evaluator for RAG answer accuracy - """ - - # Get question, ground truth answer, RAG chain answer - input_question = example.inputs["input_question"] - reference = example.outputs["output_answer"] - prediction = run.outputs["answer"] - - # LLM grader - llm = ChatOpenAI(model="gpt-4-turbo", temperature=0) - - # Structured prompt - answer_grader = grade_prompt_answer_accuracy | llm - - # Run evaluator - score = answer_grader.invoke({"question": input_question, - "correct_answer": reference, - "student_answer": prediction}) - score = score["Score"] - - return {"key": "answer_v_reference_score", "score": score} -``` - -Now, we kick off evaluation: - -- `predict_rag_answer`: Takes an `example` from our eval set, 
extracts the question, passes to our RAG chain -- `answer_evaluator`: Passes RAG chain answer, question, and ground truth answer to an evaluator - -```python -from langsmith import evaluate - -experiment_results = evaluate( - predict_rag_answer, - data=dataset_name, - evaluators=[answer_evaluator], - experiment_prefix="rag-answer-v-reference", - metadata={"version": "LCEL context, gpt-4-0125-preview"}, -) -``` - -### **Response vs input** - -Here is an example prompt that we can use: - -https://smith.langchain.com/hub/langchain-ai/rag-answer-helpfulness - -The information flow is similar to above, but we simply look at the `run` answer versus the `example` question. - -```python -# Grade prompt -grade_prompt_answer_helpfulness = prompt = hub.pull("langchain-ai/rag-answer-helpfulness") - -def answer_helpfulness_evaluator(run, example) -> dict: - """ - A simple evaluator for RAG answer helpfulness - """ - - # Get question, ground truth answer, RAG chain answer - input_question = example.inputs["input_question"] - prediction = run.outputs["answer"] - - # LLM grader - llm = ChatOpenAI(model="gpt-4-turbo", temperature=0) - - # Structured prompt - answer_grader = grade_prompt_answer_helpfulness | llm - - # Run evaluator - score = answer_grader.invoke({"question": input_question, - "student_answer": prediction}) - score = score["Score"] - - return {"key": "answer_helpfulness_score", "score": score} -``` - -```python -experiment_results = evaluate( - predict_rag_answer, - data=dataset_name, - evaluators=[answer_helpfulness_evaluator], - experiment_prefix="rag-answer-helpfulness", - metadata={"version": "LCEL context, gpt-4-0125-preview"}, -) -``` - -### **Response vs retrieved docs** +### Correctness: Response vs reference answer + + bool: + """An evaluator for RAG answer accuracy""" + answers = f"""\ + QUESTION: {inputs['question']} + GROUND TRUTH ANSWER: {reference_outputs['answer']} + STUDENT ANSWER: {outputs['answer']}""" + + # Run evaluator + grade = grader_llm.invoke([{"role": "system", "content": correctness_instructions}, {"role": "user", "content": answers}]) + return grade["correct"] + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import { z } from "zod"; + + // Grade prompt + const correctnessInstructions = \`You are a teacher grading a quiz. + + You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. + + Here is the grade criteria to follow: + (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. + (2) Ensure that the student answer does not contain any conflicting statements. + (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer. + + Correctness: + A correctness value of True means that the student's answer meets all of the criteria. + A correctness value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const graderLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + correct: z + .boolean() + .describe("True if the answer is correct, False otherwise.") + }) + .describe("Correctness score for reference answer v.s. 
generated answer.") + ); + + async function correctness({ + inputs, + outputs, + referenceOutputs, + }: { + inputs: Record; + outputs: Record; + referenceOutputs?: Record; + }): Promise => { + const answer = \`QUESTION: \${inputs.question} + GROUND TRUTH ANSWER: \${reference_outputs.answer} + STUDENT ANSWER: \${outputs.answer}\` + + // Run evaluator + const grade = graderLLM.invoke([{role: "system", content: correctnessInstructions}, {role: "user", content: answer}])\ + return grade.score + }; + `, + ]} +/> + +### Relevance: Response vs input + +The flow is similar to above, but we simply look at the `inputs` and `outputs` without needing the `reference_outputs`. +Without a reference answer we can't grade accuracy, but can still grade relevance—as in, did the model address the user's question or not. + + bool: + """A simple evaluator for RAG answer helpfulness.""" + answer = f"""\ + QUESTION: {inputs['question']} + STUDENT ANSWER: {outputs['answer']}""" + grade = relevance_llm.invoke([{"role": "system", "content": relevance_instructions}, {"role": "user", "content": answer}]) + return grade["relevant"] + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import { z } from "zod"; + + // Grade prompt + const relevanceInstructions = \`You are a teacher grading a quiz. + + You will be given a QUESTION and a STUDENT ANSWER. + + Here is the grade criteria to follow: + (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION + (2) Ensure the STUDENT ANSWER helps to answer the QUESTION + + Relevance: + A relevance value of True means that the student's answer meets all of the criteria. + A relevance value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const relevanceLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + relevant: z + .boolean() + .describe("Provide the score on whether the answer addresses the question") + }) + .describe("Relevance score for gene") + ); + + async function relevance({ + inputs, + outputs, + }: { + inputs: Record; + outputs: Record; + }): Promise => { + const answer = \`QUESTION: \${inputs.question} + STUDENT ANSWER: \${outputs.answer}\` + + // Run evaluator + const grade = relevanceLLM.invoke([{role: "system", content: relevanceInstructions}, {role: "user", content: answer}])\ + return grade.relevant + }; + `, + ]} +/> + +### Groundedness: Response vs retrieved docs + +Another useful way to evaluate responses without needing reference answers is to check if the response is justified by (or "grounded in") the retrieved documents. + + bool: + """A simple evaluator for RAG answer groundedness.""" + doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"]) + answer = f"""\ + FACTS: {doc_string} + STUDENT ANSWER: {outputs['answer']}""" + grade = grounded_llm.invoke([{"role": "system", "content": grounded_instructions}, {"role": "user", "content": answer}]) + return grade["grounded"] + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import { z } from "zod"; + + // Grade prompt + const groundedInstructions = \`You are a teacher grading a quiz. + + You will be given FACTS and a STUDENT ANSWER. 
+ + Here is the grade criteria to follow: + (1) Ensure the STUDENT ANSWER is grounded in the FACTS. + (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS. + + Grounded: + A grounded value of True means that the student's answer meets all of the criteria. + A grounded value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const groundedLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + grounded: z + .boolean() + .describe("Provide the score on if the answer hallucinates from the documents") + }) + .describe("Grounded score for the answer from the retrieved documents.") + ); + + async function grounded({ + inputs, + outputs, + }: { + inputs: Record; + outputs: Record; + }): Promise => { + const docString = outputs.documents.map((doc) => doc.pageContent).join("\n"); + const answer = \`FACTS: \${docString} + STUDENT ANSWER: \${outputs.answer}\` + + // Run evaluator + const grade = groundedLLM.invoke([{role: "system", content: groundedInstructions}, {role: "user", content: answer}])\ + return grade.grounded + }; + `, + ]} +/> + +### Retrieval relevance: Retrieved docs vs input + + bool: + """An evaluator for document relevance""" + doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"]) + answer = f"""\ + FACTS: {doc_string} + QUESTION: {inputs['question']}""" + + # Run evaluator + grade = retrieval_relevance_llm.invoke([{"role": "system", "content": retrieval_relevance_instructions}, {"role": "user", "content": answer}]) + return grade["relevant"] + `, + typescript` + import type { EvaluationResult } from "langsmith/evaluation"; + import { z } from "zod"; + + // Grade prompt + const retrievalRelevanceInstructions = \`You are a teacher grading a quiz. + + You will be given a QUESTION and a set of FACTS provided by the student. + + Here is the grade criteria to follow: + (1) You goal is to identify FACTS that are completely unrelated to the QUESTION + (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant + (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met + + Relevance: + A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant. + A relevance value of False means that the FACTS are completely unrelated to the QUESTION. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const retrievalRelevanceLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + relevant: z + .boolean() + .describe("True if the retrieved documents are relevant to the question, False otherwise") + }) + .describe("Retrieval relevance score for the retrieved documents v.s. 
the question.") + ); + + async function retrievalRelevance({ + inputs, + outputs, + }: { + inputs: Record; + outputs: Record; + }): Promise => { + const docString = outputs.documents.map((doc) => doc.pageContent).join("\n"); + const answer = \`FACTS: \${docString} + QUESTION: \${inputs.question}\` + + // Run evaluator + const grade = retrievalRelevanceLLM.invoke([{role: "system", content: retrievalRelevanceInstructions}, {role: "user", content: answer}])\ + return grade.relevant + }; + `, + ]} +/> + +## Run evaluation + +We can now kick off our evaluation job with all of our different evaluators. + + dict: + return rag_bot(inputs["question"]) + + experiment_results = client.evaluate( + target, + data=dataset_name, + evaluators=[correctness, groundedness, relevance, retrieval_relevance], + experiment_prefix="rag-doc-relevance", + metadata={"version": "LCEL context, gpt-4-0125-preview"}, + ) + # Explore results locally as a dataframe if you have pandas installed + # experiment_results.to_pandas() + `, + typescript` + import { evaluate } from "langsmith/evaluation"; + + const targetFunc = (input: Record) => { + return ragBot(inputs.question) + }; + + const experimentResults = await evaluate(targetFunc, { + data: datasetName, + evaluators: [correctness, groundedness, relevance, retrievalRelevance], + experimentPrefix="rag-doc-relevance", + metadata={version: "LCEL context, gpt-4-0125-preview"}, + }); + `, + ]} +/> + +You can see an example of what these results look like here: [LangSmith link](https://smith.langchain.com/public/302573e2-20bf-4f8c-bdad-e97c20f33f1b/d) + +## Reference code + +
+ +Here's a consolidated script with all the above code: + + dict: + # langchain Retriever will be automatically traced + docs = retriever.invoke(question) + + docs_string = "\n\n".join(doc.page_content for doc in docs) + instructions = f"""You are a helpful assistant who is good at analyzing source information and answering questions. \ + Use the following source documents to answer the user's questions. \ + If you don't know the answer, just say that you don't know. \ + Use three sentences maximum and keep the answer concise. + + Documents: + {docs_string}""" + # langchain ChatModel will be automatically traced + ai_msg = llm.invoke( + [ + {"role": "system", "content": instructions}, + {"role": "user", "content": question}, + ], + ) + + return {"answer": ai_msg.content, "documents": docs} + + + client = Client() + + # Define the examples for the dataset + examples = [ + ( + "How does the ReAct agent use self-reflection? ", + "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.", + ), + ( + "What are the types of biases that can arise with few-shot prompting?", + "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.", + ), + ( + "What are five types of adversarial attacks?", + "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.", + ), + ] + + # Create the dataset and examples in LangSmith + dataset_name = "Lilian Weng Blogs Q&A" + if not client.has_dataset(dataset_name=dataset_name): + dataset = client.create_dataset(dataset_name=dataset_name) + client.create_examples( + inputs=[{"question": q} for q, _ in examples], + outputs=[{"answer": a} for _, a in examples], + dataset_id=dataset.id, + ) + + + # Grade output schema + class CorrectnessGrade(TypedDict): + # Note that the order in the fields are defined is the order in which the model will generate them. + # It is useful to put explanations before responses because it forces the model to think through + # its final response before generating it: + explanation: Annotated[str, ..., "Explain your reasoning for the score"] + correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."] + + + # Grade prompt + correctness_instructions = """You are a teacher grading a quiz. + + You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. + + Here is the grade criteria to follow: + (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. + (2) Ensure that the student answer does not contain any conflicting statements. + (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer. + + Correctness: + A correctness value of True means that the student's answer meets all of the criteria. + A correctness value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 
+ + Avoid simply stating the correct answer at the outset.""" + + # Grader LLM + grader_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output( + CorrectnessGrade, method="json_schema", strict=True + ) -https://youtu.be/IlNglM9bKLw?feature=shared -```python -# Prompt -grade_prompt_hallucinations = prompt = hub.pull("langchain-ai/rag-answer-hallucination") + def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool: + """An evaluator for RAG answer accuracy""" + answers = f"""\ + QUESTION: {inputs['question']} + GROUND TRUTH ANSWER: {reference_outputs['answer']} + STUDENT ANSWER: {outputs['answer']}""" -def answer_hallucination_evaluator(run, example) -> dict: - """ - A simple evaluator for generation hallucination - """ + # Run evaluator + grade = grader_llm.invoke( + [ + {"role": "system", "content": correctness_instructions}, + {"role": "user", "content": answers}, + ] + ) + return grade["correct"] - # RAG inputs - input_question = example.inputs["input_question"] - contexts = run.outputs["contexts"] - # RAG answer - prediction = run.outputs["answer"] + # Grade output schema + class RelevanceGrade(TypedDict): + explanation: Annotated[str, ..., "Explain your reasoning for the score"] + relevant: Annotated[ + bool, ..., "Provide the score on whether the answer addresses the question" + ] - # LLM grader - llm = ChatOpenAI(model="gpt-4-turbo", temperature=0) - # Structured prompt - answer_grader = grade_prompt_hallucinations | llm + # Grade prompt + relevance_instructions = """You are a teacher grading a quiz. - # Get score - score = answer_grader.invoke({"documents": contexts, - "student_answer": prediction}) - score = score["Score"] + You will be given a QUESTION and a STUDENT ANSWER. - return {"key": "answer_hallucination", "score": score} -``` + Here is the grade criteria to follow: + (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION + (2) Ensure the STUDENT ANSWER helps to answer the QUESTION -```python -experiment_results = evaluate( - predict_rag_answer_with_context, - data=dataset_name, - evaluators=[answer_hallucination_evaluator], - experiment_prefix="rag-answer-hallucination", - metadata={"version": "LCEL context, gpt-4-0125-preview"}, -) -``` + Relevance: + A relevance value of True means that the student's answer meets all of the criteria. + A relevance value of False means that the student's answer does not meet all of the criteria. -### **Retrieved docs vs input** + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 
-Here is an example prompt that we can use: + Avoid simply stating the correct answer at the outset.""" -https://smith.langchain.com/hub/langchain-ai/rag-document-relevance + # Grader LLM + relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output( + RelevanceGrade, method="json_schema", strict=True + ) -Here is the a video from our LangSmith evaluation series for reference: -https://youtu.be/Fr_7HtHjcf0?feature=shared + # Evaluator + def relevance(inputs: dict, outputs: dict) -> bool: + """A simple evaluator for RAG answer helpfulness.""" + answer = f"""\ + QUESTION: {inputs['question']} + STUDENT ANSWER: {outputs['answer']}""" + grade = relevance_llm.invoke( + [ + {"role": "system", "content": relevance_instructions}, + {"role": "user", "content": answer}, + ] + ) + return grade["relevant"] -```python -# Grade prompt -grade_prompt_doc_relevance = hub.pull("langchain-ai/rag-document-relevance") -def docs_relevance_evaluator(run, example) -> dict: - """ - A simple evaluator for document relevance - """ + # Grade output schema + class GroundedGrade(TypedDict): + explanation: Annotated[str, ..., "Explain your reasoning for the score"] + grounded: Annotated[ + bool, ..., "Provide the score on if the answer hallucinates from the documents" + ] - # RAG inputs - input_question = example.inputs["input_question"] - contexts = run.outputs["contexts"] - # LLM grader - llm = ChatOpenAI(model="gpt-4-turbo", temperature=0) + # Grade prompt + grounded_instructions = """You are a teacher grading a quiz. - # Structured prompt - answer_grader = grade_prompt_doc_relevance | llm + You will be given FACTS and a STUDENT ANSWER. - # Get score - score = answer_grader.invoke({"question":input_question, - "documents":contexts}) - score = score["Score"] + Here is the grade criteria to follow: + (1) Ensure the STUDENT ANSWER is grounded in the FACTS. + (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS. - return {"key": "document_relevance", "score": score} -``` + Grounded: + A grounded value of True means that the student's answer meets all of the criteria. + A grounded value of False means that the student's answer does not meet all of the criteria. -```python -experiment_results = evaluate( - predict_rag_answer_with_context, - data=dataset_name, - evaluators=[docs_relevance_evaluator], - experiment_prefix="rag-doc-relevance", - metadata={"version": "LCEL context, gpt-4-0125-preview"}, -) -``` + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. -## Evaluating intermediate steps + Avoid simply stating the correct answer at the outset.""" -Above, we returned the retrieved documents as part of the final answer. + # Grader LLM + grounded_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output( + GroundedGrade, method="json_schema", strict=True + ) -However, we will show that this is not required. -We can isolate them as intermediate chain steps. 
+ # Evaluator + def groundedness(inputs: dict, outputs: dict) -> bool: + """A simple evaluator for RAG answer groundedness.""" + doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"]) + answer = f"""\ + FACTS: {doc_string} + STUDENT ANSWER: {outputs['answer']}""" + grade = grounded_llm.invoke( + [ + {"role": "system", "content": grounded_instructions}, + {"role": "user", "content": answer}, + ] + ) + return grade["grounded"] -See detail on isolating intermediate chain steps [here](../how_to_guides/evaluate_on_intermediate_steps). -Here is the a video from our LangSmith evaluation series for reference: + # Grade output schema + class RetrievalRelevanceGrade(TypedDict): + explanation: Annotated[str, ..., "Explain your reasoning for the score"] + relevant: Annotated[ + bool, + ..., + "True if the retrieved documents are relevant to the question, False otherwise", + ] -https://youtu.be/yx3JMAaNggQ?feature=shared -```python -from langsmith.schemas import Example, Run -from langsmith import evaluate + # Grade prompt + retrieval_relevance_instructions = """You are a teacher grading a quiz. -def document_relevance_grader(root_run: Run, example: Example) -> dict: - """ - A simple evaluator that checks to see if retrieved documents are relevant to the question - """ + You will be given a QUESTION and a set of FACTS provided by the student. - # Get specific steps in our RAG pipeline, which are noted with @traceable decorator - rag_pipeline_run = next( - run for run in root_run.child_runs if run.name == "get_answer" - ) - retrieve_run = next( - run for run in rag_pipeline_run.child_runs if run.name == "retrieve_docs" - ) - contexts = "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"]) - input_question = example.inputs["input_question"] + Here is the grade criteria to follow: + (1) You goal is to identify FACTS that are completely unrelated to the QUESTION + (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant + (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met - # LLM grader - llm = ChatOpenAI(model="gpt-4-turbo", temperature=0) + Relevance: + A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant. + A relevance value of False means that the FACTS are completely unrelated to the QUESTION. - # Structured prompt - answer_grader = grade_prompt_doc_relevance | llm + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 
- # Get score - score = answer_grader.invoke({"question":input_question, - "documents":contexts}) - score = score["Score"] + Avoid simply stating the correct answer at the outset.""" - return {"key": "document_relevance", "score": score} + # Grader LLM + retrieval_relevance_llm = ChatOpenAI( + model="gpt-4o", temperature=0 + ).with_structured_output(RetrievalRelevanceGrade, method="json_schema", strict=True) -def answer_hallucination_grader(root_run: Run, example: Example) -> dict: - """ - A simple evaluator that checks to see the answer is grounded in the documents - """ - # RAG input - rag_pipeline_run = next( - run for run in root_run.child_runs if run.name == "get_answer" - ) - retrieve_run = next( - run for run in rag_pipeline_run.child_runs if run.name == "retrieve_docs" - ) - contexts = "\n\n".join(doc.page_content for doc in retrieve_run.outputs["output"]) + def retrieval_relevance(inputs: dict, outputs: dict) -> bool: + """An evaluator for document relevance""" + doc_string = "\n\n".join(doc.page_content for doc in outputs["documents"]) + answer = f"""\ + FACTS: {doc_string} + QUESTION: {inputs['question']}""" - # RAG output - prediction = rag_pipeline_run.outputs["answer"] + # Run evaluator + grade = retrieval_relevance_llm.invoke( + [ + {"role": "system", "content": retrieval_relevance_instructions}, + {"role": "user", "content": answer}, + ] + ) + return grade["relevant"] - # LLM grader - llm = ChatOpenAI(model="gpt-4-turbo", temperature=0) - # Structured prompt - answer_grader = grade_prompt_hallucinations | llm + def target(inputs: dict) -> dict: + return rag_bot(inputs["question"]) - # Get score - score = answer_grader.invoke({"documents": contexts, - "student_answer": prediction}) - score = score["Score"] - return {"key": "answer_hallucination", "score": score} + experiment_results = client.evaluate( + target, + data=dataset_name, + evaluators=[correctness, groundedness, relevance, retrieval_relevance], + experiment_prefix="rag-doc-relevance", + metadata={"version": "LCEL context, gpt-4-0125-preview"}, + ) -experiment_results = evaluate( - predict_rag_answer, - data=dataset_name, - evaluators=[document_relevance_grader, answer_hallucination_grader], - metadata={"version": "LCEL context, gpt-4-0125-preview"}, -) -``` + # Explore results locally as a dataframe if you have pandas installed + # experiment_results.to_pandas() + #endregion + `, + typescript` + import { OpenAIEmbeddings, ChatOpenAI } from "@langchain/openai"; + import { MemoryVectorStore } from "langchain/vectorstores/memory"; + import { BrowserbaseLoader } from "@langchain/community/document_loaders/web/browserbase"; + import { traceable } from "langsmith/traceable"; + import { Client } from "langsmith"; + import { evaluate, type EvaluationResult } from "langsmith/evaluation"; + import { z } from "zod"; + + // List of URLs to load documents from + const urls = [ + "https://lilianweng.github.io/posts/2023-06-23-agent/", + "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/", + "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/", + ] + const loader = new BrowserbaseLoader(urls, { + textContent: true, + }); + const docs = await loader.load(); + + const splitter = new RecursiveCharacterTextSplitter({ + chunkSize: 1000, chunkOverlap: 200 + }); + const allSplits = await splitter.splitDocuments(docs); + + const embeddings = new OpenAIEmbeddings({ + model: "text-embedding-3-large" + }); + + const vectorStore = new MemoryVectorStore(embeddings); + + // Index chunks + await 
vectorStore.addDocuments(allSplits) + + const llm = new ChatOpenAI({ + model: "gpt-4o", + temperature: 1, + }) + + // Add decorator so this function is traced in LangSmith + const ragBot = traceable( + async (question: string) => { + // LangChain retriever will be automatically traced + const retrievedDocs = await vectorStore.similaritySearch(question); + const docsContent = retrievedDocs.map((doc) => doc.pageContent).join("\n"); + + const instructions = \`You are a helpful assistant who is good at analyzing source information and answering questions. + Use the following source documents to answer the user's questions. + If you don't know the answer, just say that you don't know. + Use three sentences maximum and keep the answer concise. + + Documents: + \${docsContent}\` + + const aiMsg = await llm.invoke([ + { + role: "system", + content: instructions + }, + { + role: "user", + content: question + } + ]) + + return {"answer": aiMsg.content, "documents": retrievedDocs} + } + ) + + const client = new Client(); + + // Define the examples for the dataset + const examples = [ + [ + "How does the ReAct agent use self-reflection? ", + "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs.", + ], + [ + "What are the types of biases that can arise with few-shot prompting?", + "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias.", + ], + [ + "What are five types of adversarial attacks?", + "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming.", + ] + ] + + const [inputs, outputs] = examples.reduce< + [Array<{ input: string }>, Array<{ outputs: string }>] + >( + ([inputs, outputs], item) => [ + [...inputs, { input: item[0] }], + [...outputs, { outputs: item[1] }], + ], + [[], []] + ); + + const datasetName = "Lilian Weng Blogs Q&A"; + const dataset = await client.createDataset(datasetName); + await client.createExamples({ inputs, outputs, datasetId: dataset.id }) + + // Grade prompt + const correctnessInstructions = \`You are a teacher grading a quiz. + + You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. + + Here is the grade criteria to follow: + (1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. + (2) Ensure that the student answer does not contain any conflicting statements. + (3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer. + + Correctness: + A correctness value of True means that the student's answer meets all of the criteria. + A correctness value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const graderLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + correct: z + .boolean() + .describe("True if the answer is correct, False otherwise.") + }) + .describe("Correctness score for reference answer v.s. 
generated answer.") + ); + + async function correctness({ + inputs, + outputs, + referenceOutputs, + }: { + inputs: Record; + outputs: Record; + referenceOutputs?: Record; + }): Promise => { + const answer = \`QUESTION: \${inputs.question} + GROUND TRUTH ANSWER: \${reference_outputs.answer} + STUDENT ANSWER: \${outputs.answer}\` + + // Run evaluator + const grade = graderLLM.invoke([{role: "system", content: correctnessInstructions}, {role: "user", content: answer}])\ + return grade.score + }; + + // Grade prompt + const relevanceInstructions = \`You are a teacher grading a quiz. + + You will be given a QUESTION and a STUDENT ANSWER. + + Here is the grade criteria to follow: + (1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION + (2) Ensure the STUDENT ANSWER helps to answer the QUESTION + + Relevance: + A relevance value of True means that the student's answer meets all of the criteria. + A relevance value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const relevanceLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + relevant: z + .boolean() + .describe("Provide the score on whether the answer addresses the question") + }) + .describe("Relevance score for gene") + ); + + async function relevance({ + inputs, + outputs, + }: { + inputs: Record; + outputs: Record; + }): Promise => { + const answer = \`QUESTION: \${inputs.question} + STUDENT ANSWER: \${outputs.answer}\` + + // Run evaluator + const grade = relevanceLLM.invoke([{role: "system", content: relevanceInstructions}, {role: "user", content: answer}])\ + return grade.relevant + }; + + // Grade prompt + const groundedInstructions = \`You are a teacher grading a quiz. + + You will be given FACTS and a STUDENT ANSWER. + + Here is the grade criteria to follow: + (1) Ensure the STUDENT ANSWER is grounded in the FACTS. + (2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS. + + Grounded: + A grounded value of True means that the student's answer meets all of the criteria. + A grounded value of False means that the student's answer does not meet all of the criteria. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 
+ + Avoid simply stating the correct answer at the outset.\` + + const groundedLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + grounded: z + .boolean() + .describe("Provide the score on if the answer hallucinates from the documents") + }) + .describe("Grounded score for the answer from the retrieved documents.") + ); + + async function grounded({ + inputs, + outputs, + }: { + inputs: Record; + outputs: Record; + }): Promise => { + const docString = outputs.documents.map((doc) => doc.pageContent).join("\n"); + const answer = \`FACTS: \${docString} + STUDENT ANSWER: \${outputs.answer}\` + + // Run evaluator + const grade = groundedLLM.invoke([{role: "system", content: groundedInstructions}, {role: "user", content: answer}])\ + return grade.grounded + }; + + // Grade prompt + const retrievalRelevanceInstructions = \`You are a teacher grading a quiz. + + You will be given a QUESTION and a set of FACTS provided by the student. + + Here is the grade criteria to follow: + (1) You goal is to identify FACTS that are completely unrelated to the QUESTION + (2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant + (3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met + + Relevance: + A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant. + A relevance value of False means that the FACTS are completely unrelated to the QUESTION. + + Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. + + Avoid simply stating the correct answer at the outset.\` + + const retrievalRelevanceLLM = new ChatOpenAI({ + model: "gpt-4o", + temperature: 0, + }).withStructuredOutput( + z + .object({ + explanation: z + .string() + .describe("Explain your reasoning for the score"), + relevant: z + .boolean() + .describe("True if the retrieved documents are relevant to the question, False otherwise") + }) + .describe("Retrieval relevance score for the retrieved documents v.s. the question.") + ); + + async function retrievalRelevance({ + inputs, + outputs, + }: { + inputs: Record; + outputs: Record; + }): Promise => { + const docString = outputs.documents.map((doc) => doc.pageContent).join("\n"); + const answer = \`FACTS: \${docString} + QUESTION: \${inputs.question}\` + + // Run evaluator + const grade = retrievalRelevanceLLM.invoke([{role: "system", content: retrievalRelevanceInstructions}, {role: "user", content: answer}])\ + return grade.relevant + }; + + const targetFunc = (input: Record) => { + return ragBot(inputs.question) + }; + + const experimentResults = await evaluate(targetFunc, { + data: datasetName, + evaluators: [correctness, groundedness, relevance, retrievalRelevance], + experimentPrefix="rag-doc-relevance", + metadata={version: "LCEL context, gpt-4-0125-preview"}, + }); + `, + ]} +/> +
\ No newline at end of file diff --git a/docs/evaluation/tutorials/static/rag_eval_overview.png b/docs/evaluation/tutorials/static/rag_eval_overview.png index 1a2043a7..1452a2a4 100644 Binary files a/docs/evaluation/tutorials/static/rag_eval_overview.png and b/docs/evaluation/tutorials/static/rag_eval_overview.png differ diff --git a/docs/evaluation/tutorials/static/rag_overview.png b/docs/evaluation/tutorials/static/rag_overview.png index 52eb40d2..3b8fb7c2 100644 Binary files a/docs/evaluation/tutorials/static/rag_overview.png and b/docs/evaluation/tutorials/static/rag_overview.png differ