docs: remove experimental from docs
mikeldking committed Apr 29, 2024
1 parent f216b2a commit b79a451
Showing 16 changed files with 83 additions and 71 deletions.
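The change repeated across every file below is a single import-path substitution: evaluation helpers now come from `phoenix.evals` instead of `phoenix.experimental.evals`. A minimal sketch of the pattern, using only names that appear in the diffs that follow:

```python
# Old path, removed from the docs in this commit:
# from phoenix.experimental.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator, run_evals

# New path used throughout the updated docs:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    run_evals,
)
```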
56 changes: 31 additions & 25 deletions docs/.gitbook/assets/online_evals_periodic_eval_chron (1).py
@@ -7,7 +7,7 @@
 
 from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
 from phoenix.trace import DocumentEvaluations, SpanEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HallucinationEvaluator,
     OpenAIModel,
     QAEvaluator,
@@ -18,14 +18,16 @@
 from datetime import datetime, timedelta
 
 
-#Optional but speeds up Evals by 10x
+# Optional but speeds up Evals by 10x
 import nest_asyncio
+
 nest_asyncio.apply()
 
+
 def lookup_traces(session):
     # Get traces into a dataframe
-    #spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
-    spans_df = session.get_spans_dataframe() #all spans
+    # spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
+    spans_df = session.get_spans_dataframe() # all spans
     trace_df = session.get_trace_dataset()
     if not trace_df:
         return None, None
@@ -39,30 +41,32 @@ def lookup_traces(session):
 
     if spans_df is None:
         return None
-    spans_df['date'] = pd.to_datetime(spans_df['end_time']).dt.date
+    spans_df["date"] = pd.to_datetime(spans_df["end_time"]).dt.date
 
     # Get today's date
     today_date = datetime.now().date() + timedelta(days=1)
     # Calculate yesterday's date
     yesterday_date = today_date - timedelta(days=1)
     # Filter for entries from the last day (i.e., yesterday and today)
-    selected_date_spans_df = spans_df[(spans_df['date'] == today_date) | (spans_df['date'] == yesterday_date)]
+    selected_date_spans_df = spans_df[
+        (spans_df["date"] == today_date) | (spans_df["date"] == yesterday_date)
+    ]
     return selected_date_spans_df, evaluation_dfs
 
 
 if __name__ == "__main__":
     if os.environ.get("OPENAI_API_KEY") is None:
         openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
         os.environ["OPENAI_API_KEY"] = openai_api_key
-    #We need to choose an arbitrary UUID to persist the dataset and reload it
+    # We need to choose an arbitrary UUID to persist the dataset and reload it
     TRACE_DATA_UUID = "b4165a34-2020-4e9b-98ec-26c5d7e954d4"
 
     has_active_session = px.active_session() is not None
     if has_active_session:
-        #Used only in a python runtime
+        # Used only in a python runtime
         session = px.active_session()
     else:
-        #The most common path from clean Script run, No session will be Live
+        # The most common path from clean Script run, No session will be Live
         try:
             tds = TraceDataset.load(HARD_CODE_UUID)
             print("Dataset Reloaded")
@@ -74,7 +78,7 @@ def lookup_traces(session):
             px.launch_app()
             session = px.active_session()
 
-    px_client = px.Client(endpoint=str(session.url)) #Client based on URL & port of the session
+    px_client = px.Client(endpoint=str(session.url)) # Client based on URL & port of the session
     spans, evaluation_dfs = lookup_traces(session=session, selected_date=datetime.now().date())
     if spans is not None:
         with_eval = set()
@@ -84,20 +88,24 @@ def lookup_traces(session):
                     with_eval.add(index[0])
                 else:
                     with_eval.add(index)
-        #If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
-        trace_with_evals_id_set = set(spans[spans['context.span_id'].isin(with_eval)]['context.trace_id'].unique())
-        all_traces_id_set = set(spans['context.trace_id'].unique())
-        #Get trace IDs without evaluations
+        # If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
+        trace_with_evals_id_set = set(
+            spans[spans["context.span_id"].isin(with_eval)]["context.trace_id"].unique()
+        )
+        all_traces_id_set = set(spans["context.trace_id"].unique())
+        # Get trace IDs without evaluations
         traces_without_evals_id_set = all_traces_id_set - trace_with_evals_id_set
-        spans_without_evals_df = spans[~spans['context.span_id'].isin(with_eval)]
-        #Get span IDs without evaluations
-        spans_without_evals_id_set = set(spans_without_evals_df['context.span_id'].unique())
+        spans_without_evals_df = spans[~spans["context.span_id"].isin(with_eval)]
+        # Get span IDs without evaluations
+        spans_without_evals_id_set = set(spans_without_evals_df["context.span_id"].unique())
         queries_df = get_qa_with_reference(px_client)
-        #Grab Q&A spans without evaluations
+        # Grab Q&A spans without evaluations
         queries_no_evals = queries_df[queries_df.index.isin(spans_without_evals_id_set)]
         retrieved_documents_df = get_retrieved_documents(px_client)
-        #Grab retireved documents without evaluations, based on trace ID
-        retrieved_documents_no_evals = retrieved_documents_df[retrieved_documents_df['context.trace_id'].isin(traces_without_evals_id_set)]
+        # Grab retireved documents without evaluations, based on trace ID
+        retrieved_documents_no_evals = retrieved_documents_df[
+            retrieved_documents_df["context.trace_id"].isin(traces_without_evals_id_set)
+        ]
         eval_model = OpenAIModel(
             model_name="gpt-4-turbo-preview",
         )
@@ -109,13 +117,13 @@ def lookup_traces(session):
             dataframe=queries_no_evals,
             evaluators=[hallucination_evaluator, qa_correctness_evaluator],
             provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )
         relevance_eval_df = run_evals(
             dataframe=retrieved_documents_no_evals,
             evaluators=[relevance_evaluator],
            provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )[0]
 
         px_client.log_evaluations(
@@ -125,7 +133,5 @@
         )
 
         tds = px_client.get_trace_dataset()
-        tds._id =TRACE_DATA_UUID
+        tds._id = TRACE_DATA_UUID
         tds.save()
-
-

56 changes: 31 additions & 25 deletions docs/.gitbook/assets/online_evals_periodic_eval_chron.py
@@ -7,7 +7,7 @@
 
 from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
 from phoenix.trace import DocumentEvaluations, SpanEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HallucinationEvaluator,
     OpenAIModel,
     QAEvaluator,
@@ -18,14 +18,16 @@
 from datetime import datetime, timedelta
 
 
-#Optional but speeds up Evals by 10x
+# Optional but speeds up Evals by 10x
 import nest_asyncio
+
 nest_asyncio.apply()
 
+
 def lookup_traces(session):
     # Get traces into a dataframe
-    #spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
-    spans_df = session.get_spans_dataframe() #all spans
+    # spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
+    spans_df = session.get_spans_dataframe() # all spans
     trace_df = session.get_trace_dataset()
     if not trace_df:
         return None, None
@@ -39,30 +41,32 @@ def lookup_traces(session):
 
     if spans_df is None:
         return None
-    spans_df['date'] = pd.to_datetime(spans_df['end_time']).dt.date
+    spans_df["date"] = pd.to_datetime(spans_df["end_time"]).dt.date
 
     # Get today's date
     today_date = datetime.now().date() + timedelta(days=1)
     # Calculate yesterday's date
     yesterday_date = today_date - timedelta(days=1)
     # Filter for entries from the last day (i.e., yesterday and today)
-    selected_date_spans_df = spans_df[(spans_df['date'] == today_date) | (spans_df['date'] == yesterday_date)]
+    selected_date_spans_df = spans_df[
+        (spans_df["date"] == today_date) | (spans_df["date"] == yesterday_date)
+    ]
     return selected_date_spans_df, evaluation_dfs
 
 
 if __name__ == "__main__":
     if os.environ.get("OPENAI_API_KEY") is None:
         openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
         os.environ["OPENAI_API_KEY"] = openai_api_key
-    #We need to choose an arbitrary UUID to persist the dataset and reload it
+    # We need to choose an arbitrary UUID to persist the dataset and reload it
     TRACE_DATA_UUID = "b4165a34-2020-4e9b-98ec-26c5d7e954d4"
 
     has_active_session = px.active_session() is not None
     if has_active_session:
-        #Used only in a python runtime
+        # Used only in a python runtime
         session = px.active_session()
     else:
-        #The most common path from clean Script run, No session will be Live
+        # The most common path from clean Script run, No session will be Live
        try:
             tds = TraceDataset.load(TRACE_DATA_UUID)
             print("Dataset Reloaded")
@@ -74,7 +78,7 @@ def lookup_traces(session):
             px.launch_app()
             session = px.active_session()
 
-    px_client = px.Client(endpoint=str(session.url)) #Client based on URL & port of the session
+    px_client = px.Client(endpoint=str(session.url)) # Client based on URL & port of the session
     spans, evaluation_dfs = lookup_traces(session=session, selected_date=datetime.now().date())
     if spans is not None:
         with_eval = set()
@@ -84,20 +88,24 @@ def lookup_traces(session):
                     with_eval.add(index[0])
                 else:
                     with_eval.add(index)
-        #If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
-        trace_with_evals_id_set = set(spans[spans['context.span_id'].isin(with_eval)]['context.trace_id'].unique())
-        all_traces_id_set = set(spans['context.trace_id'].unique())
-        #Get trace IDs without evaluations
+        # If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
+        trace_with_evals_id_set = set(
+            spans[spans["context.span_id"].isin(with_eval)]["context.trace_id"].unique()
+        )
+        all_traces_id_set = set(spans["context.trace_id"].unique())
+        # Get trace IDs without evaluations
         traces_without_evals_id_set = all_traces_id_set - trace_with_evals_id_set
-        spans_without_evals_df = spans[~spans['context.span_id'].isin(with_eval)]
-        #Get span IDs without evaluations
-        spans_without_evals_id_set = set(spans_without_evals_df['context.span_id'].unique())
+        spans_without_evals_df = spans[~spans["context.span_id"].isin(with_eval)]
+        # Get span IDs without evaluations
+        spans_without_evals_id_set = set(spans_without_evals_df["context.span_id"].unique())
         queries_df = get_qa_with_reference(px_client)
-        #Grab Q&A spans without evaluations
+        # Grab Q&A spans without evaluations
         queries_no_evals = queries_df[queries_df.index.isin(spans_without_evals_id_set)]
         retrieved_documents_df = get_retrieved_documents(px_client)
-        #Grab retireved documents without evaluations, based on trace ID
-        retrieved_documents_no_evals = retrieved_documents_df[retrieved_documents_df['context.trace_id'].isin(traces_without_evals_id_set)]
+        # Grab retireved documents without evaluations, based on trace ID
+        retrieved_documents_no_evals = retrieved_documents_df[
+            retrieved_documents_df["context.trace_id"].isin(traces_without_evals_id_set)
+        ]
         eval_model = OpenAIModel(
             model_name="gpt-4-turbo-preview",
         )
@@ -109,13 +117,13 @@ def lookup_traces(session):
             dataframe=queries_no_evals,
             evaluators=[hallucination_evaluator, qa_correctness_evaluator],
             provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )
         relevance_eval_df = run_evals(
             dataframe=retrieved_documents_no_evals,
             evaluators=[relevance_evaluator],
             provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )[0]
 
         px_client.log_evaluations(
@@ -125,7 +133,5 @@
         )
 
         tds = px_client.get_trace_dataset()
-        tds._id =TRACE_DATA_UUID
+        tds._id = TRACE_DATA_UUID
         tds.save()
-
-

2 changes: 1 addition & 1 deletion docs/evaluation/concepts-evals/building-your-own-evals.md
@@ -25,7 +25,7 @@ Building such a dataset is laborious, but you can often find a standardized one
 The Evals dataset is designed or easy benchmarking and pre-set downloadable test datasets. The datasets are pre-tested, many are hand crafted and designed for testing specific Eval tasks.
 
 ```python
-from phoenix.experimental.evals import download_benchmark_dataset
+from phoenix.evals import download_benchmark_dataset
 
 df = download_benchmark_dataset(
     task="binary-hallucination-classification", dataset_name="halueval_qa_data"

2 changes: 1 addition & 1 deletion docs/evaluation/concepts-evals/evals-with-explanations.md
@@ -6,7 +6,7 @@ See "Classifications with Explanations Section"
 
 It can be hard to understand in many cases why an LLM responds in a specific way. The explanation feature of Phoneix allows you to get a Eval output and an explanation from the LLM at the same time. We have found this incredibly useful for debugging LLM Evals.
 
-<pre class="language-python"><code class="lang-python">from phoenix.experimental.evals import (
+<pre class="language-python"><code class="lang-python">from phoenix.evals import (
 RAG_RELEVANCY_PROMPT_RAILS_MAP,
 RAG_RELEVANCY_PROMPT_TEMPLATE,
 OpenAIModel,

4 changes: 2 additions & 2 deletions docs/evaluation/evaluation-models.md
@@ -20,9 +20,9 @@ There are direct model integrations in Phoenix and indirect model integrations t
 
 These integrations are native to the Phoenix Evals package and have better throughput, rate limit and error management.&#x20;
 
-[Vertex AI](../api/evaluation-models.md#phoenix.experimental.evals.vertexai)
+[Vertex AI](../api/evaluation-models.md#phoenix.evals.vertexai)
 
-[OpenAI](../api/evaluation-models.md#phoenix.experimental.evals.openaimodel)
+[OpenAI](../api/evaluation-models.md#phoenix.evals.openaimodel)
 
 [Azure OpenAI ](../api/evaluation-models.md#azure-openai)
 

@@ -59,7 +59,7 @@ idea of the human answer, please answer "incorrect".
 #### How to run Eval:
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HUMAN_VS_AI_PROMPT_RAILS_MAP,
     HUMAN_VS_AI_PROMPT_TEMPLATE,
     OpenAIModel,

@@ -54,7 +54,7 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     CODE_READABILITY_PROMPT_RAILS_MAP,
     CODE_READABILITY_PROMPT_TEMPLATE,
     OpenAIModel,

@@ -67,7 +67,7 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HALLUCINATION_PROMPT_RAILS_MAP,
     HALLUCINATION_PROMPT_TEMPLATE,
     OpenAIModel,

@@ -53,8 +53,8 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-import phoenix.experimental.evals.templates.default_templates as templates
-from phoenix.experimental.evals import (
+import phoenix.evals.templates.default_templates as templates
+from phoenix.evals import (
     OpenAIModel,
     download_benchmark_dataset,
     llm_classify,

@@ -36,7 +36,7 @@ question in the conversation, or doesn't contain information that would allow yo
 to answer the specific question please answer "incorrect".
 </code></pre>
 
-<pre class="language-python"><code class="lang-python"><strong>from phoenix.experimental.evals import (
+<pre class="language-python"><code class="lang-python"><strong>from phoenix.evals import (
 </strong><strong> REF_LINK_EVAL_PROMPT_RAILS_MAP,
 </strong> REF_LINK_EVAL_PROMPT_TEMPLATE_STR,
 OpenAIModel,

@@ -53,7 +53,7 @@ We are continually iterating our templates, view the most up-to-date template on
 
 ## How To Run the Eval
 
-<pre class="language-python"><code class="lang-python"><strong>from phoenix.experimental.evals import (
+<pre class="language-python"><code class="lang-python"><strong>from phoenix.evals import (
 </strong> RAG_RELEVANCY_PROMPT_RAILS_MAP,
 RAG_RELEVANCY_PROMPT_TEMPLATE,
 OpenAIModel,

@@ -59,8 +59,8 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-import phoenix.experimental.evals.templates.default_templates as templates
-from phoenix.experimental.evals import (
+import phoenix.evals.templates.default_templates as templates
+from phoenix.evals import (
     OpenAIModel,
     download_benchmark_dataset,
     llm_classify,

@@ -57,7 +57,7 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     TOXICITY_PROMPT_RAILS_MAP,
     TOXICITY_PROMPT_TEMPLATE,
     OpenAIModel,

4 changes: 2 additions & 2 deletions docs/quickstart/evals.md
@@ -65,8 +65,8 @@ Set up evaluators (in this casefor hallucinations and Q\&A correctness), run the
 ```python
 !pip install openai
 
-from phoenix.experimental.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
-from phoenix.experimental.evals import run_evals
+from phoenix.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
+from phoenix.evals import run_evals
 import nest_asyncio
 nest_asyncio.apply() # This is needed for concurrency in notebook environments
 