docs: remove experimental from docs
mikeldking committed Apr 29, 2024
1 parent f216b2a commit b79a451
Showing 16 changed files with 83 additions and 71 deletions.
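The change repeated across every file below is a single import-path substitution: evaluation helpers now come from `phoenix.evals` instead of `phoenix.experimental.evals`. A minimal sketch of the pattern, using only names that appear in the diffs that follow:

```python
# Old path, removed from the docs in this commit:
# from phoenix.experimental.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator, run_evals

# New path used throughout the updated docs:
from phoenix.evals import (
    HallucinationEvaluator,
    OpenAIModel,
    QAEvaluator,
    run_evals,
)
```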
56 changes: 31 additions & 25 deletions docs/.gitbook/assets/online_evals_periodic_eval_chron (1).py
@@ -7,7 +7,7 @@
 
 from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
 from phoenix.trace import DocumentEvaluations, SpanEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HallucinationEvaluator,
     OpenAIModel,
     QAEvaluator,
@@ -18,14 +18,16 @@
 from datetime import datetime, timedelta
 
 
-#Optional but speeds up Evals by 10x
+# Optional but speeds up Evals by 10x
 import nest_asyncio
+
 nest_asyncio.apply()
 
+
 def lookup_traces(session):
     # Get traces into a dataframe
-    #spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
-    spans_df = session.get_spans_dataframe() #all spans
+    # spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
+    spans_df = session.get_spans_dataframe() # all spans
     trace_df = session.get_trace_dataset()
     if not trace_df:
         return None, None
@@ -39,30 +41,32 @@ def lookup_traces(session):
 
     if spans_df is None:
         return None
-    spans_df['date'] = pd.to_datetime(spans_df['end_time']).dt.date
+    spans_df["date"] = pd.to_datetime(spans_df["end_time"]).dt.date
 
     # Get today's date
     today_date = datetime.now().date() + timedelta(days=1)
     # Calculate yesterday's date
     yesterday_date = today_date - timedelta(days=1)
     # Filter for entries from the last day (i.e., yesterday and today)
-    selected_date_spans_df = spans_df[(spans_df['date'] == today_date) | (spans_df['date'] == yesterday_date)]
+    selected_date_spans_df = spans_df[
+        (spans_df["date"] == today_date) | (spans_df["date"] == yesterday_date)
+    ]
     return selected_date_spans_df, evaluation_dfs
 
 
 if __name__ == "__main__":
     if os.environ.get("OPENAI_API_KEY") is None:
         openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
         os.environ["OPENAI_API_KEY"] = openai_api_key
-    #We need to choose an arbitrary UUID to persist the dataset and reload it
+    # We need to choose an arbitrary UUID to persist the dataset and reload it
     TRACE_DATA_UUID = "b4165a34-2020-4e9b-98ec-26c5d7e954d4"
 
     has_active_session = px.active_session() is not None
     if has_active_session:
-        #Used only in a python runtime
+        # Used only in a python runtime
         session = px.active_session()
     else:
-        #The most common path from clean Script run, No session will be Live
+        # The most common path from clean Script run, No session will be Live
         try:
             tds = TraceDataset.load(HARD_CODE_UUID)
             print("Dataset Reloaded")
@@ -74,7 +78,7 @@ def lookup_traces(session):
             px.launch_app()
             session = px.active_session()
 
-    px_client = px.Client(endpoint=str(session.url)) #Client based on URL & port of the session
+    px_client = px.Client(endpoint=str(session.url)) # Client based on URL & port of the session
     spans, evaluation_dfs = lookup_traces(session=session, selected_date=datetime.now().date())
     if spans is not None:
         with_eval = set()
@@ -84,20 +88,24 @@ def lookup_traces(session):
                     with_eval.add(index[0])
                 else:
                     with_eval.add(index)
-        #If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
-        trace_with_evals_id_set = set(spans[spans['context.span_id'].isin(with_eval)]['context.trace_id'].unique())
-        all_traces_id_set = set(spans['context.trace_id'].unique())
-        #Get trace IDs without evaluations
+        # If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
+        trace_with_evals_id_set = set(
+            spans[spans["context.span_id"].isin(with_eval)]["context.trace_id"].unique()
+        )
+        all_traces_id_set = set(spans["context.trace_id"].unique())
+        # Get trace IDs without evaluations
         traces_without_evals_id_set = all_traces_id_set - trace_with_evals_id_set
-        spans_without_evals_df = spans[~spans['context.span_id'].isin(with_eval)]
-        #Get span IDs without evaluations
-        spans_without_evals_id_set = set(spans_without_evals_df['context.span_id'].unique())
+        spans_without_evals_df = spans[~spans["context.span_id"].isin(with_eval)]
+        # Get span IDs without evaluations
+        spans_without_evals_id_set = set(spans_without_evals_df["context.span_id"].unique())
         queries_df = get_qa_with_reference(px_client)
-        #Grab Q&A spans without evaluations
+        # Grab Q&A spans without evaluations
         queries_no_evals = queries_df[queries_df.index.isin(spans_without_evals_id_set)]
         retrieved_documents_df = get_retrieved_documents(px_client)
-        #Grab retireved documents without evaluations, based on trace ID
-        retrieved_documents_no_evals = retrieved_documents_df[retrieved_documents_df['context.trace_id'].isin(traces_without_evals_id_set)]
+        # Grab retireved documents without evaluations, based on trace ID
+        retrieved_documents_no_evals = retrieved_documents_df[
+            retrieved_documents_df["context.trace_id"].isin(traces_without_evals_id_set)
+        ]
         eval_model = OpenAIModel(
             model_name="gpt-4-turbo-preview",
         )
@@ -109,13 +117,13 @@ def lookup_traces(session):
             dataframe=queries_no_evals,
             evaluators=[hallucination_evaluator, qa_correctness_evaluator],
             provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )
         relevance_eval_df = run_evals(
             dataframe=retrieved_documents_no_evals,
             evaluators=[relevance_evaluator],
            provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )[0]
 
         px_client.log_evaluations(
@@ -125,7 +133,5 @@
         )
 
         tds = px_client.get_trace_dataset()
-        tds._id =TRACE_DATA_UUID
+        tds._id = TRACE_DATA_UUID
         tds.save()
-
-

56 changes: 31 additions & 25 deletions docs/.gitbook/assets/online_evals_periodic_eval_chron.py
@@ -7,7 +7,7 @@
 
 from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
 from phoenix.trace import DocumentEvaluations, SpanEvaluations
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HallucinationEvaluator,
     OpenAIModel,
     QAEvaluator,
@@ -18,14 +18,16 @@
 from datetime import datetime, timedelta
 
 
-#Optional but speeds up Evals by 10x
+# Optional but speeds up Evals by 10x
 import nest_asyncio
+
 nest_asyncio.apply()
 
+
 def lookup_traces(session):
     # Get traces into a dataframe
-    #spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
-    spans_df = session.get_spans_dataframe() #all spans
+    # spans_df = session.get_spans_dataframe("span_kind == 'RETRIEVER'")
+    spans_df = session.get_spans_dataframe() # all spans
     trace_df = session.get_trace_dataset()
     if not trace_df:
         return None, None
@@ -39,30 +41,32 @@ def lookup_traces(session):
 
     if spans_df is None:
         return None
-    spans_df['date'] = pd.to_datetime(spans_df['end_time']).dt.date
+    spans_df["date"] = pd.to_datetime(spans_df["end_time"]).dt.date
 
     # Get today's date
     today_date = datetime.now().date() + timedelta(days=1)
     # Calculate yesterday's date
     yesterday_date = today_date - timedelta(days=1)
     # Filter for entries from the last day (i.e., yesterday and today)
-    selected_date_spans_df = spans_df[(spans_df['date'] == today_date) | (spans_df['date'] == yesterday_date)]
+    selected_date_spans_df = spans_df[
+        (spans_df["date"] == today_date) | (spans_df["date"] == yesterday_date)
+    ]
     return selected_date_spans_df, evaluation_dfs
 
 
 if __name__ == "__main__":
     if os.environ.get("OPENAI_API_KEY") is None:
         openai_api_key = getpass("🔑 Enter your OpenAI API key: ")
         os.environ["OPENAI_API_KEY"] = openai_api_key
-    #We need to choose an arbitrary UUID to persist the dataset and reload it
+    # We need to choose an arbitrary UUID to persist the dataset and reload it
     TRACE_DATA_UUID = "b4165a34-2020-4e9b-98ec-26c5d7e954d4"
 
     has_active_session = px.active_session() is not None
     if has_active_session:
-        #Used only in a python runtime
+        # Used only in a python runtime
         session = px.active_session()
     else:
-        #The most common path from clean Script run, No session will be Live
+        # The most common path from clean Script run, No session will be Live
        try:
             tds = TraceDataset.load(TRACE_DATA_UUID)
             print("Dataset Reloaded")
@@ -74,7 +78,7 @@ def lookup_traces(session):
             px.launch_app()
             session = px.active_session()
 
-    px_client = px.Client(endpoint=str(session.url)) #Client based on URL & port of the session
+    px_client = px.Client(endpoint=str(session.url)) # Client based on URL & port of the session
     spans, evaluation_dfs = lookup_traces(session=session, selected_date=datetime.now().date())
     if spans is not None:
         with_eval = set()
@@ -84,20 +88,24 @@ def lookup_traces(session):
                     with_eval.add(index[0])
                 else:
                     with_eval.add(index)
-        #If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
-        trace_with_evals_id_set = set(spans[spans['context.span_id'].isin(with_eval)]['context.trace_id'].unique())
-        all_traces_id_set = set(spans['context.trace_id'].unique())
-        #Get trace IDs without evaluations
+        # If a single span in a trace has an evaluation, the entire trace is considered to have an evaluation "eval processed"
+        trace_with_evals_id_set = set(
+            spans[spans["context.span_id"].isin(with_eval)]["context.trace_id"].unique()
+        )
+        all_traces_id_set = set(spans["context.trace_id"].unique())
+        # Get trace IDs without evaluations
         traces_without_evals_id_set = all_traces_id_set - trace_with_evals_id_set
-        spans_without_evals_df = spans[~spans['context.span_id'].isin(with_eval)]
-        #Get span IDs without evaluations
-        spans_without_evals_id_set = set(spans_without_evals_df['context.span_id'].unique())
+        spans_without_evals_df = spans[~spans["context.span_id"].isin(with_eval)]
+        # Get span IDs without evaluations
+        spans_without_evals_id_set = set(spans_without_evals_df["context.span_id"].unique())
         queries_df = get_qa_with_reference(px_client)
-        #Grab Q&A spans without evaluations
+        # Grab Q&A spans without evaluations
         queries_no_evals = queries_df[queries_df.index.isin(spans_without_evals_id_set)]
         retrieved_documents_df = get_retrieved_documents(px_client)
-        #Grab retireved documents without evaluations, based on trace ID
-        retrieved_documents_no_evals = retrieved_documents_df[retrieved_documents_df['context.trace_id'].isin(traces_without_evals_id_set)]
+        # Grab retireved documents without evaluations, based on trace ID
+        retrieved_documents_no_evals = retrieved_documents_df[
+            retrieved_documents_df["context.trace_id"].isin(traces_without_evals_id_set)
+        ]
         eval_model = OpenAIModel(
             model_name="gpt-4-turbo-preview",
         )
@@ -109,13 +117,13 @@ def lookup_traces(session):
             dataframe=queries_no_evals,
             evaluators=[hallucination_evaluator, qa_correctness_evaluator],
             provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )
         relevance_eval_df = run_evals(
             dataframe=retrieved_documents_no_evals,
             evaluators=[relevance_evaluator],
             provide_explanation=True,
-            concurrency=10
+            concurrency=10,
         )[0]
 
         px_client.log_evaluations(
@@ -125,7 +133,5 @@
         )
 
         tds = px_client.get_trace_dataset()
-        tds._id =TRACE_DATA_UUID
+        tds._id = TRACE_DATA_UUID
         tds.save()
-
-

2 changes: 1 addition & 1 deletion docs/evaluation/concepts-evals/building-your-own-evals.md
@@ -25,7 +25,7 @@ Building such a dataset is laborious, but you can often find a standardized one
 The Evals dataset is designed or easy benchmarking and pre-set downloadable test datasets. The datasets are pre-tested, many are hand crafted and designed for testing specific Eval tasks.
 
 ```python
-from phoenix.experimental.evals import download_benchmark_dataset
+from phoenix.evals import download_benchmark_dataset
 
 df = download_benchmark_dataset(
     task="binary-hallucination-classification", dataset_name="halueval_qa_data"

2 changes: 1 addition & 1 deletion docs/evaluation/concepts-evals/evals-with-explanations.md
@@ -6,7 +6,7 @@ See "Classifications with Explanations Section"
 
 It can be hard to understand in many cases why an LLM responds in a specific way. The explanation feature of Phoneix allows you to get a Eval output and an explanation from the LLM at the same time. We have found this incredibly useful for debugging LLM Evals.
 
-<pre class="language-python"><code class="lang-python">from phoenix.experimental.evals import (
+<pre class="language-python"><code class="lang-python">from phoenix.evals import (
 RAG_RELEVANCY_PROMPT_RAILS_MAP,
 RAG_RELEVANCY_PROMPT_TEMPLATE,
 OpenAIModel,

4 changes: 2 additions & 2 deletions docs/evaluation/evaluation-models.md
@@ -20,9 +20,9 @@ There are direct model integrations in Phoenix and indirect model integrations t
 
 These integrations are native to the Phoenix Evals package and have better throughput, rate limit and error management.&#x20;
 
-[Vertex AI](../api/evaluation-models.md#phoenix.experimental.evals.vertexai)
+[Vertex AI](../api/evaluation-models.md#phoenix.evals.vertexai)
 
-[OpenAI](../api/evaluation-models.md#phoenix.experimental.evals.openaimodel)
+[OpenAI](../api/evaluation-models.md#phoenix.evals.openaimodel)
 
 [Azure OpenAI ](../api/evaluation-models.md#azure-openai)
 

@@ -59,7 +59,7 @@ idea of the human answer, please answer "incorrect".
 #### How to run Eval:
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HUMAN_VS_AI_PROMPT_RAILS_MAP,
     HUMAN_VS_AI_PROMPT_TEMPLATE,
     OpenAIModel,

@@ -54,7 +54,7 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     CODE_READABILITY_PROMPT_RAILS_MAP,
     CODE_READABILITY_PROMPT_TEMPLATE,
     OpenAIModel,

@@ -67,7 +67,7 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     HALLUCINATION_PROMPT_RAILS_MAP,
     HALLUCINATION_PROMPT_TEMPLATE,
     OpenAIModel,

@@ -53,8 +53,8 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-import phoenix.experimental.evals.templates.default_templates as templates
-from phoenix.experimental.evals import (
+import phoenix.evals.templates.default_templates as templates
+from phoenix.evals import (
     OpenAIModel,
     download_benchmark_dataset,
     llm_classify,

@@ -36,7 +36,7 @@ question in the conversation, or doesn't contain information that would allow yo
 to answer the specific question please answer "incorrect".
 </code></pre>
 
-<pre class="language-python"><code class="lang-python"><strong>from phoenix.experimental.evals import (
+<pre class="language-python"><code class="lang-python"><strong>from phoenix.evals import (
 </strong><strong> REF_LINK_EVAL_PROMPT_RAILS_MAP,
 </strong> REF_LINK_EVAL_PROMPT_TEMPLATE_STR,
 OpenAIModel,

@@ -53,7 +53,7 @@ We are continually iterating our templates, view the most up-to-date template on
 
 ## How To Run the Eval
 
-<pre class="language-python"><code class="lang-python"><strong>from phoenix.experimental.evals import (
+<pre class="language-python"><code class="lang-python"><strong>from phoenix.evals import (
 </strong> RAG_RELEVANCY_PROMPT_RAILS_MAP,
 RAG_RELEVANCY_PROMPT_TEMPLATE,
 OpenAIModel,

@@ -59,8 +59,8 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-import phoenix.experimental.evals.templates.default_templates as templates
-from phoenix.experimental.evals import (
+import phoenix.evals.templates.default_templates as templates
+from phoenix.evals import (
     OpenAIModel,
     download_benchmark_dataset,
     llm_classify,

@@ -57,7 +57,7 @@ We are continually iterating our templates, view the most up-to-date template on
 ## How To Run the Eval
 
 ```python
-from phoenix.experimental.evals import (
+from phoenix.evals import (
     TOXICITY_PROMPT_RAILS_MAP,
     TOXICITY_PROMPT_TEMPLATE,
     OpenAIModel,

4 changes: 2 additions & 2 deletions docs/quickstart/evals.md
@@ -65,8 +65,8 @@ Set up evaluators (in this casefor hallucinations and Q\&A correctness), run the
 ```python
 !pip install openai
 
-from phoenix.experimental.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
-from phoenix.experimental.evals import run_evals
+from phoenix.evals import OpenAIModel, HallucinationEvaluator, QAEvaluator
+from phoenix.evals import run_evals
 import nest_asyncio
 nest_asyncio.apply() # This is needed for concurrency in notebook environments
 