diff --git a/docs/.gitbook/assets/online_evals_periodic_eval_chron (1).py b/docs/.gitbook/assets/online_evals_periodic_eval_chron (1).py
index fb289a86a3..2ac093867e 100644
--- a/docs/.gitbook/assets/online_evals_periodic_eval_chron (1).py
+++ b/docs/.gitbook/assets/online_evals_periodic_eval_chron (1).py
@@ -7,7 +7,7 @@
 from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
 from phoenix.trace import DocumentEvaluations, SpanEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HallucinationEvaluator,
     OpenAIModel,
     QAEvaluator,
@@ -18,14 +18,16 @@
 from datetime import datetime, timedelta
 
-#Optional but speeds up Evals by 10x
+# Optional but speeds up Evals by 10x
 import nest_asyncio
+
 nest_asyncio.apply()
 
+
 def lookup_traces(session):
     # Get traces into a dataframe
-    #spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
-    spans_df = session.get_spans_dataframe() #all spans
+    # spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
+    spans_df = session.get_spans_dataframe()  # all spans
     trace_df = session.get_trace_dataset()
     if not trace_df:
         return None, None
@@ -39,14 +41,16 @@ def lookup_traces(session):
     if spans_df is None:
         return None
-    spans_df['date'] = pd.to_datetime(spans_df['end_time']).dt.date
+    spans_df["date"] = pd.to_datetime(spans_df["end_time"]).dt.date
     # Get today's date
     today_date = datetime.now().date() + timedelta(days=1)
     # Calculate yesterday's date
     yesterday_date = today_date - timedelta(days=1)
     # Filter for entries from the last day (i.e., yesterday and today)
-    selected_date_spans_df = spans_df[(spans_df['date'] == today_date) | (spans_df['date'] == yesterday_date)]
+    selected_date_spans_df = spans_df[
+        (spans_df["date"] == today_date) | (spans_df["date"] == yesterday_date)
+    ]
     return selected_date_spans_df, evaluation_dfs
@@ -54,15 +58,15 @@ def lookup_traces(session):
 if os.environ.get("OPENAI_API_KEY") is None:
     openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
     os.environ["OPENAI_API_KEY"] = openai_api_key
-#We need to choose an arbitrary UUID to persist the dataset and reload it
+# We need to choose an arbitrary UUID to persist the dataset and reload it
 TRACE_DATA_UUID = "b4165a34-2020-4e9b-98ec-26c5d7e954d4"
 
 has_active_session = px.active_session() is not None
 if has_active_session:
-    #Used only in a python runtime
+    # Used only in a Python runtime
     session = px.active_session()
 else:
-    #The most common path from clean Script run, No session will be Live
+    # The most common path from a clean script run; no session will be live
     try:
         tds = TraceDataset.load(HARD_CODE_UUID)
         print("Dataset Reloaded")
@@ -74,7 +78,7 @@ def lookup_traces(session):
         px.launch_app()
         session = px.active_session()
 
-px_client = px.Client(endpoint=str(session.url)) #Client based on URL & port of the session
+px_client = px.Client(endpoint=str(session.url))  # Client based on URL & port of the session
 spans, evaluation_dfs = lookup_traces(session=session, selected_date=datetime.now().date())
 if spans is not None:
     with_eval = set()
@@ -84,20 +88,24 @@ def lookup_traces(session):
                 with_eval.add(index[0])
             else:
                 with_eval.add(index)
-    #If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
-    trace_with_evals_id_set = set(spans[spans['context.span_id'].isin(with_eval)]['context.trace_id'].unique())
-    all_traces_id_set = set(spans['context.trace_id'].unique())
-    #Get trace IDs without evaluations
+    # If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
+    trace_with_evals_id_set = set(
+        spans[spans["context.span_id"].isin(with_eval)]["context.trace_id"].unique()
+    )
+    all_traces_id_set = set(spans["context.trace_id"].unique())
+    # Get trace IDs without evaluations
     traces_without_evals_id_set = all_traces_id_set - trace_with_evals_id_set
-    spans_without_evals_df = spans[~spans['context.span_id'].isin(with_eval)]
-    #Get span IDs without evaluations
-    spans_without_evals_id_set = set(spans_without_evals_df['context.span_id'].unique())
+    spans_without_evals_df = spans[~spans["context.span_id"].isin(with_eval)]
+    # Get span IDs without evaluations
+    spans_without_evals_id_set = set(spans_without_evals_df["context.span_id"].unique())
     queries_df = get_qa_with_reference(px_client)
-    #Grab Q&A spans without evaluations
+    # Grab Q&A spans without evaluations
     queries_no_evals = queries_df[queries_df.index.isin(spans_without_evals_id_set)]
     retrieved_documents_df = get_retrieved_documents(px_client)
-    #Grab retireved documents without evaluations, based on trace ID
-    retrieved_documents_no_evals = retrieved_documents_df[retrieved_documents_df['context.trace_id'].isin(traces_without_evals_id_set)]
+    # Grab retrieved documents without evaluations, based on trace ID
+    retrieved_documents_no_evals = retrieved_documents_df[
+        retrieved_documents_df["context.trace_id"].isin(traces_without_evals_id_set)
+    ]
     eval_model = OpenAIModel(
         model_name="gpt-4-turbo-preview",
     )
@@ -109,13 +117,13 @@ def lookup_traces(session):
         dataframe=queries_no_evals,
         evaluators=[hallucination_evaluator, qa_correctness_evaluator],
         provide_explanation=True,
-        concurrency=10
+        concurrency=10,
     )
     relevance_eval_df = run_evals(
         dataframe=retrieved_documents_no_evals,
         evaluators=[relevance_evaluator],
         provide_explanation=True,
-        concurrency=10
+        concurrency=10,
     )[0]
 
     px_client.log_evaluations(
@@ -125,7 +133,5 @@ def lookup_traces(session):
     )
 
     tds = px_client.get_trace_dataset()
-    tds._id =TRACE_DATA_UUID
+    tds._id = TRACE_DATA_UUID
     tds.save()
-
-
diff --git a/docs/.gitbook/assets/online_evals_periodic_eval_chron.py b/docs/.gitbook/assets/online_evals_periodic_eval_chron.py
index f96ae9784a..f417754c48 100644
--- a/docs/.gitbook/assets/online_evals_periodic_eval_chron.py
+++ b/docs/.gitbook/assets/online_evals_periodic_eval_chron.py
@@ -7,7 +7,7 @@
 from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
 from phoenix.trace import DocumentEvaluations, SpanEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HallucinationEvaluator,
     OpenAIModel,
     QAEvaluator,
@@ -18,14 +18,16 @@
 from datetime import datetime, timedelta
 
-#Optional but speeds up Evals by 10x
+# Optional but speeds up Evals by 10x
 import nest_asyncio
+
 nest_asyncio.apply()
 
+
 def lookup_traces(session):
     # Get traces into a dataframe
-    #spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
-    spans_df = session.get_spans_dataframe() #all spans
+    # spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
+    spans_df = session.get_spans_dataframe()  # all spans
     trace_df = session.get_trace_dataset()
     if not trace_df:
         return None, None
@@ -39,14 +41,16 @@ def lookup_traces(session):
     if spans_df is None:
         return None
-    spans_df['date'] = pd.to_datetime(spans_df['end_time']).dt.date
+    spans_df["date"] = pd.to_datetime(spans_df["end_time"]).dt.date
     # Get today's date
     today_date = datetime.now().date() + timedelta(days=1)
     # Calculate yesterday's date
     yesterday_date = today_date - timedelta(days=1)
     # Filter for entries from the last day (i.e., yesterday and today)
-    selected_date_spans_df = spans_df[(spans_df['date'] == today_date) | (spans_df['date'] == yesterday_date)]
+    selected_date_spans_df = spans_df[
+        (spans_df["date"] == today_date) | (spans_df["date"] == yesterday_date)
+    ]
     return selected_date_spans_df, evaluation_dfs
@@ -54,15 +58,15 @@ def lookup_traces(session):
 if os.environ.get("OPENAI_API_KEY") is None:
     openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
     os.environ["OPENAI_API_KEY"] = openai_api_key
-#We need to choose an arbitrary UUID to persist the dataset and reload it
+# We need to choose an arbitrary UUID to persist the dataset and reload it
 TRACE_DATA_UUID = "b4165a34-2020-4e9b-98ec-26c5d7e954d4"
 
 has_active_session = px.active_session() is not None
 if has_active_session:
-    #Used only in a python runtime
+    # Used only in a Python runtime
     session = px.active_session()
 else:
-    #The most common path from clean Script run, No session will be Live
+    # The most common path from a clean script run; no session will be live
     try:
         tds = TraceDataset.load(TRACE_DATA_UUID)
         print("Dataset Reloaded")
@@ -74,7 +78,7 @@ def lookup_traces(session):
         px.launch_app()
         session = px.active_session()
 
-px_client = px.Client(endpoint=str(session.url)) #Client based on URL & port of the session
+px_client = px.Client(endpoint=str(session.url))  # Client based on URL & port of the session
 spans, evaluation_dfs = lookup_traces(session=session, selected_date=datetime.now().date())
 if spans is not None:
     with_eval = set()
@@ -84,20 +88,24 @@ def lookup_traces(session):
                 with_eval.add(index[0])
             else:
                 with_eval.add(index)
-    #If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
-    trace_with_evals_id_set = set(spans[spans['context.span_id'].isin(with_eval)]['context.trace_id'].unique())
-    all_traces_id_set = set(spans['context.trace_id'].unique())
-    #Get trace IDs without evaluations
+    # If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
+    trace_with_evals_id_set = set(
+        spans[spans["context.span_id"].isin(with_eval)]["context.trace_id"].unique()
+    )
+    all_traces_id_set = set(spans["context.trace_id"].unique())
+    # Get trace IDs without evaluations
     traces_without_evals_id_set = all_traces_id_set - trace_with_evals_id_set
-    spans_without_evals_df = spans[~spans['context.span_id'].isin(with_eval)]
-    #Get span IDs without evaluations
-    spans_without_evals_id_set = set(spans_without_evals_df['context.span_id'].unique())
+    spans_without_evals_df = spans[~spans["context.span_id"].isin(with_eval)]
+    # Get span IDs without evaluations
+    spans_without_evals_id_set = set(spans_without_evals_df["context.span_id"].unique())
     queries_df = get_qa_with_reference(px_client)
-    #Grab Q&A spans without evaluations
+    # Grab Q&A spans without evaluations
     queries_no_evals = queries_df[queries_df.index.isin(spans_without_evals_id_set)]
     retrieved_documents_df = get_retrieved_documents(px_client)
-    #Grab retireved documents without evaluations, based on trace ID
-    retrieved_documents_no_evals = retrieved_documents_df[retrieved_documents_df['context.trace_id'].isin(traces_without_evals_id_set)]
+    # Grab retrieved documents without evaluations, based on trace ID
+    retrieved_documents_no_evals = retrieved_documents_df[
+        retrieved_documents_df["context.trace_id"].isin(traces_without_evals_id_set)
+    ]
     eval_model = OpenAIModel(
         model_name="gpt-4-turbo-preview",
     )
@@ -109,13 +117,13 @@ def lookup_traces(session):
         dataframe=queries_no_evals,
         evaluators=[hallucination_evaluator, qa_correctness_evaluator],
         provide_explanation=True,
-        concurrency=10
+        concurrency=10,
     )
     relevance_eval_df = run_evals(
         dataframe=retrieved_documents_no_evals,
         evaluators=[relevance_evaluator],
         provide_explanation=True,
-        concurrency=10
+        concurrency=10,
     )[0]
 
     px_client.log_evaluations(
@@ -125,7 +133,5 @@ def lookup_traces(session):
     )
 
     tds = px_client.get_trace_dataset()
-    tds._id =TRACE_DATA_UUID
+    tds._id = TRACE_DATA_UUID
     tds.save()
-
-
diff --git a/docs/evaluation/concepts-evals/building-your-own-evals.md b/docs/evaluation/concepts-evals/building-your-own-evals.md
index deaafd18a6..5c7d624d26 100644
--- a/docs/evaluation/concepts-evals/building-your-own-evals.md
+++ b/docs/evaluation/concepts-evals/building-your-own-evals.md
@@ -25,7 +25,7 @@ Building such a dataset is laborious, but you can often find a standardized one
 The Evals dataset is designed for easy benchmarking and pre-set downloadable test datasets. The datasets are pre-tested; many are hand-crafted and designed for testing specific Eval tasks.
 
 ```python
-from phoenix.experimental.evals import download_benchmark_dataset
+from phoenix.evals import download_benchmark_dataset
 
 df = download_benchmark_dataset(
     task="binary-hallucination-classification", dataset_name="halueval_qa_data"
diff --git a/docs/evaluation/concepts-evals/evals-with-explanations.md b/docs/evaluation/concepts-evals/evals-with-explanations.md
index 17b4a27d29..2e296c5f86 100644
--- a/docs/evaluation/concepts-evals/evals-with-explanations.md
+++ b/docs/evaluation/concepts-evals/evals-with-explanations.md
@@ -6,7 +6,7 @@ See "Classifications with Explanations Section"
 
 It can be hard to understand in many cases why an LLM responds in a specific way. The explanation feature of Phoenix allows you to get an Eval output and an explanation from the LLM at the same time. We have found this incredibly useful for debugging LLM Evals.
 
-from phoenix.experimental.evals import (
+from phoenix.evals import (
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE,
OpenAIModel,
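
For context on what the renamed import feeds into, below is a minimal sketch of passing the relevance template and rails to `llm_classify` with explanations turned on. The toy dataframe, its column names, and the model choice are illustrative assumptions, not taken from this page.

```python
import pandas as pd

from phoenix.evals import (
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

# Toy data; the relevance template reads an "input" (query) and "reference" (document) column.
df = pd.DataFrame(
    {
        "input": ["What is Phoenix?"],
        "reference": ["Phoenix is an open-source observability library for LLM applications."],
    }
)

relevance_classifications = llm_classify(
    dataframe=df,
    template=RAG_RELEVANCY_PROMPT_TEMPLATE,
    model=OpenAIModel(model_name="gpt-4-turbo-preview"),  # illustrative model choice
    rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
    provide_explanation=True,  # adds an "explanation" column alongside the "label" column
)
print(relevance_classifications[["label", "explanation"]])
```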
diff --git a/docs/evaluation/evaluation-models.md b/docs/evaluation/evaluation-models.md
index af0077ca61..ce4df44b47 100644
--- a/docs/evaluation/evaluation-models.md
+++ b/docs/evaluation/evaluation-models.md
@@ -20,9 +20,9 @@ There are direct model integrations in Phoenix and indirect model integrations t
These integrations are native to the Phoenix Evals package and have better throughput, rate-limit handling, and error management.
-[Vertex AI](../api/evaluation-models.md#phoenix.experimental.evals.vertexai)
+[Vertex AI](../api/evaluation-models.md#phoenix.evals.vertexai)
-[OpenAI](../api/evaluation-models.md#phoenix.experimental.evals.openaimodel)
+[OpenAI](../api/evaluation-models.md#phoenix.evals.openaimodel)
[Azure OpenAI ](../api/evaluation-models.md#azure-openai)
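
As a rough sketch of what using one of these native integrations looks like (the model name and temperature below are placeholders, not recommendations from this page):

```python
from phoenix.evals import OpenAIModel

# The model wrapper manages throughput, rate limits, and errors, per the note above.
eval_model = OpenAIModel(
    model_name="gpt-4-turbo-preview",  # placeholder model name
    temperature=0.0,  # deterministic output is usually preferred for evals
)

# Eval models are directly callable with a prompt string.
print(eval_model("Reply with the single word OK if you can read this."))
```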
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/ai-vs-human-groundtruth.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/ai-vs-human-groundtruth.md
index 72db727c03..22e3d68fa9 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/ai-vs-human-groundtruth.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/ai-vs-human-groundtruth.md
@@ -59,7 +59,7 @@ idea of the human answer, please answer "incorrect".
#### How to run Eval:
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
HUMAN_VS_AI_PROMPT_RAILS_MAP,
HUMAN_VS_AI_PROMPT_TEMPLATE,
OpenAIModel,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/code-generation-eval.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/code-generation-eval.md
index 74c47f4f2f..1a4db8adfc 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/code-generation-eval.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/code-generation-eval.md
@@ -54,7 +54,7 @@ We are continually iterating our templates, view the most up-to-date template on
## How To Run the Eval
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
CODE_READABILITY_PROMPT_RAILS_MAP,
CODE_READABILITY_PROMPT_TEMPLATE,
OpenAIModel,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/hallucinations.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/hallucinations.md
index 7b765d0cba..f74614a922 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/hallucinations.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/hallucinations.md
@@ -67,7 +67,7 @@ We are continually iterating our templates, view the most up-to-date template on
## How To Run the Eval
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
HALLUCINATION_PROMPT_RAILS_MAP,
HALLUCINATION_PROMPT_TEMPLATE,
OpenAIModel,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/q-and-a-on-retrieved-data.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/q-and-a-on-retrieved-data.md
index 103932cf62..ff155307ba 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/q-and-a-on-retrieved-data.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/q-and-a-on-retrieved-data.md
@@ -53,8 +53,8 @@ We are continually iterating our templates, view the most up-to-date template on
## How To Run the Eval
```python
-import phoenix.experimental.evals.templates.default_templates as templates
-from phoenix.experimental.evals import (
+import phoenix.evals.templates.default_templates as templates
+from phoenix.evals import (
OpenAIModel,
download_benchmark_dataset,
llm_classify,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/reference-link-evals.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/reference-link-evals.md
index a3ed27c64d..3a22d5797c 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/reference-link-evals.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/reference-link-evals.md
@@ -36,7 +36,7 @@ question in the conversation, or doesn't contain information that would allow yo
to answer the specific question please answer "incorrect".
-from phoenix.experimental.evals import (
+from phoenix.evals import (
REF_LINK_EVAL_PROMPT_RAILS_MAP,
REF_LINK_EVAL_PROMPT_TEMPLATE_STR,
OpenAIModel,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/retrieval-rag-relevance.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/retrieval-rag-relevance.md
index 074b7d7b2e..ba981428cb 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/retrieval-rag-relevance.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/retrieval-rag-relevance.md
@@ -53,7 +53,7 @@ We are continually iterating our templates, view the most up-to-date template on
## How To Run the Eval
-from phoenix.experimental.evals import (
+from phoenix.evals import (
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE,
OpenAIModel,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/summarization-eval.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/summarization-eval.md
index a5fd6e6412..bd049358cc 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/summarization-eval.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/summarization-eval.md
@@ -59,8 +59,8 @@ We are continually iterating our templates, view the most up-to-date template on
## How To Run the Eval
```python
-import phoenix.experimental.evals.templates.default_templates as templates
-from phoenix.experimental.evals import (
+import phoenix.evals.templates.default_templates as templates
+from phoenix.evals import (
OpenAIModel,
download_benchmark_dataset,
llm_classify,
diff --git a/docs/evaluation/how-to-evals/running-pre-tested-evals/toxicity.md b/docs/evaluation/how-to-evals/running-pre-tested-evals/toxicity.md
index ca4b4030c2..a7a84ae43d 100644
--- a/docs/evaluation/how-to-evals/running-pre-tested-evals/toxicity.md
+++ b/docs/evaluation/how-to-evals/running-pre-tested-evals/toxicity.md
@@ -57,7 +57,7 @@ We are continually iterating our templates, view the most up-to-date template on
## How To Run the Eval
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
TOXICITY_PROMPT_RAILS_MAP,
TOXICITY_PROMPT_TEMPLATE,
OpenAIModel,
diff --git a/docs/quickstart/evals.md b/docs/quickstart/evals.md
index 22766fd93f..f5f81d73cc 100644
--- a/docs/quickstart/evals.md
+++ b/docs/quickstart/evals.md
@@ -65,8 +65,8 @@ Set up evaluators (in this case for hallucinations and Q\&A correctness), run the
```python
!pip install openai
-from phoenix.experimental.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
-from phoenix.experimental.evals import run_evals
+from phoenix.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
+from phoenix.evals import run_evals
import nest_asyncio
nest_asyncio.apply() # This is needed for concurrency in notebook environments
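
To make the renamed imports concrete, here is a minimal sketch of how they are typically wired together. It assumes a running Phoenix instance that has already collected traces, and borrows `get_qa_with_reference` from the helpers used elsewhere in these docs.

```python
import nest_asyncio
import phoenix as px
from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals
from phoenix.session.evaluation import get_qa_with_reference

nest_asyncio.apply()  # needed for concurrency in notebook environments

# Pull question/answer/reference triples for the spans Phoenix has collected.
queries_df = get_qa_with_reference(px.Client())

eval_model = OpenAIModel(model_name="gpt-4-turbo-preview")  # illustrative model choice
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_correctness_evaluator = QAEvaluator(eval_model)

# run_evals returns one dataframe per evaluator, in the order the evaluators were passed.
hallucination_eval_df, qa_correctness_eval_df = run_evals(
    dataframe=queries_df,
    evaluators=[hallucination_evaluator, qa_correctness_evaluator],
    provide_explanation=True,
)
```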
diff --git a/docs/retrieval/quickstart-retrieval.md b/docs/retrieval/quickstart-retrieval.md
index f441d5762c..92ac0de4db 100644
--- a/docs/retrieval/quickstart-retrieval.md
+++ b/docs/retrieval/quickstart-retrieval.md
@@ -82,7 +82,7 @@ This example shows how to run Q\&A and Hallucination Evals with OpenAI (many othe
```python
from phoenix.trace import SpanEvaluations, DocumentEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
HALLUCINATION_PROMPT_RAILS_MAP,
HALLUCINATION_PROMPT_TEMPLATE,
QA_PROMPT_RAILS_MAP,
@@ -138,7 +138,7 @@ The snippet of code above links the Evals back to the spans they were generated a
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
RAG_RELEVANCY_PROMPT_RAILS_MAP,
RAG_RELEVANCY_PROMPT_TEMPLATE,
OpenAIModel,
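
For orientation, a hedged sketch of the logging step these imports feed into: span-level results go in `SpanEvaluations`, document-level results in `DocumentEvaluations`, and both are attached back to the traced spans via the client. The eval dataframes are assumed to come from eval runs like the ones described above.

```python
import phoenix as px
from phoenix.trace import DocumentEvaluations, SpanEvaluations

# hallucination_eval_df, qa_correctness_eval_df, and relevance_eval_df are assumed to be
# dataframes produced by earlier eval runs, indexed by span ID (and document position).
px.Client().log_evaluations(
    SpanEvaluations(eval_name="Hallucination", dataframe=hallucination_eval_df),
    SpanEvaluations(eval_name="QA Correctness", dataframe=qa_correctness_eval_df),
    DocumentEvaluations(eval_name="Relevance", dataframe=relevance_eval_df),
)
```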
diff --git a/docs/use-cases/rag-evaluation.md b/docs/use-cases/rag-evaluation.md
index 001c54cd79..5b44bf8037 100644
--- a/docs/use-cases/rag-evaluation.md
+++ b/docs/use-cases/rag-evaluation.md
@@ -244,7 +244,7 @@ Output the questions in JSON format with the keys question_1, question_2, questi
```python
import json
-from phoenix.experimental.evals import OpenAIModel, llm_generate
+from phoenix.evals import OpenAIModel, llm_generate
def output_parser(response: str, index: int):
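
A short sketch of how `llm_generate` is typically paired with an output parser so the model's JSON lands as dataframe columns; the template text, example document, and model choice here are illustrative assumptions.

```python
import json

import pandas as pd
from phoenix.evals import OpenAIModel, llm_generate

generate_questions_template = (
    "Given the following document, write three questions it can answer. "
    "Output the questions in JSON format with the keys question_1, question_2, question_3.\n\n"
    "Document: {text}"
)


def output_parser(response: str, index: int):
    # Each key of the parsed JSON becomes a column in the returned dataframe.
    try:
        return json.loads(response)
    except json.JSONDecodeError as e:
        return {"__error__": str(e)}


documents_df = pd.DataFrame({"text": ["Phoenix is an open-source LLM observability library."]})

questions_df = llm_generate(
    dataframe=documents_df,
    template=generate_questions_template,
    model=OpenAIModel(model_name="gpt-4-turbo-preview"),  # illustrative model choice
    output_parser=output_parser,
)
```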
@@ -367,7 +367,7 @@ retrieved_documents_df
Let's now use Phoenix's LLM Evals to evaluate the relevance of the retrieved documents with regard to the query. Note that we've turned on `explanations`, which prompts the LLM to explain its reasoning. This can be useful for debugging and for figuring out potential corrective actions.
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
RelevanceEvaluator,
run_evals,
)
@@ -523,7 +523,7 @@ qa_with_reference_df
Now that we have a dataset of the question, context, and response (input, reference, and output), we can measure how well the LLM is responding to the queries. For details on the QA correctness evaluation, see the [LLM Evals documentation](https://docs.arize.com/phoenix/llm-evals/running-pre-tested-evals/q-and-a-on-retrieved-data).
```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
HallucinationEvaluator,
OpenAIModel,
QAEvaluator,