DO NOT MERGE, wip audio evals #5616

Draft: wants to merge 19 commits into base: main
28 changes: 28 additions & 0 deletions packages/phoenix-evals/src/phoenix/evals/classify.py
@@ -1,11 +1,13 @@
from __future__ import annotations

import inspect
import logging
from collections import defaultdict
from enum import Enum
from itertools import product
from typing import (
Any,
Callable,
DefaultDict,
Dict,
Iterable,
@@ -68,6 +70,7 @@ def llm_classify(
model: BaseModel,
template: Union[ClassificationTemplate, PromptTemplate, str],
rails: List[str],
data_processor: Optional[Callable[[Any], str]] = None,
system_instruction: Optional[str] = None,
verbose: bool = False,
use_function_calling_if_available: bool = True,
@@ -102,6 +105,9 @@ def llm_classify(
rails (List[str]): A list of strings representing the possible output classes
of the model's predictions.

data_processor (Optional[Callable[[Any], str]]): An optional function that transforms the
data column (for example, encoding raw audio to a Base64 string) before it is passed to
the model. It must be a coroutine function when classifications run asynchronously and a
regular function when they run synchronously.

system_instruction (Optional[str], optional): An optional system message.

verbose (bool, optional): If True, prints detailed info to stdout such as
@@ -153,6 +159,8 @@ def llm_classify(
details about execution errors that may have occurred during the classification as well
as the total runtime of each classification (in seconds).
"""
data_var_name = template.get_data_template_variable()

concurrency = concurrency or model.default_concurrency
# clients need to be reloaded to ensure that async evals work properly
model.reload_client()
@@ -213,6 +221,16 @@ def _process_response(response: str) -> Tuple[str, Optional[str]]:

async def _run_llm_classification_async(input_data: pd.Series[Any]) -> ParsedLLMResponse:
with set_verbosity(model, verbose) as verbose_model:
if data_processor:
if not inspect.iscoroutinefunction(data_processor):
raise ValueError("data_processor must be an asynchronous function")

# Run data_processor on the value stored under the template's data variable (e.g., raw audio)
input_data.loc[data_var_name] = await data_processor(input_data.loc[data_var_name])
input_data.index = [
data_var_name if idx == data_var_name else idx for idx in input_data.index
]

prompt = _map_template(input_data)
response = await verbose_model._async_generate(
prompt, instruction=system_instruction, **model_kwargs
@@ -222,6 +240,16 @@ async def _run_llm_classification_async(input_data: pd.Series[Any]) -> ParsedLLM

def _run_llm_classification_sync(input_data: pd.Series[Any]) -> ParsedLLMResponse:
with set_verbosity(model, verbose) as verbose_model:
if data_processor:
if inspect.iscoroutinefunction(data_processor):
raise ValueError("data_processor must be a synchronous function")

# Run data_processor on the value stored under the template's data variable (e.g., raw audio)
input_data.loc[data_var_name] = data_processor(input_data.loc[data_var_name])
input_data.index = [
data_var_name if idx == data_var_name else idx for idx in input_data.index
]

prompt = _map_template(input_data)
response = verbose_model._generate(
prompt, instruction=system_instruction, **model_kwargs
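
Reviewer note: a minimal usage sketch of the new data_processor hook with the audio sentiment template added below. This is not part of the diff; the file path, the audio-capable model name, and the keyword name of the dataframe argument are assumptions.

import base64

import pandas as pd

from phoenix.evals import OpenAIModel, llm_classify
from phoenix.evals.default_templates import AUDIO_SENTIMENT_TEMPLATE, TONE_EMOTION_RAILS


async def encode_audio(audio_path: str) -> str:
    # Hypothetical processor: read the file referenced in the dataframe and
    # return it Base64-encoded so the OpenAI audio message can carry it.
    with open(audio_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


df = pd.DataFrame([{"the_audio_string": "recordings/sample.wav"}])  # placeholder path

results = llm_classify(
    dataframe=df,  # assumption: keyword name of the data argument
    model=OpenAIModel(model="gpt-4o-audio-preview"),  # assumption: an audio-capable model
    template=AUDIO_SENTIMENT_TEMPLATE,
    rails=TONE_EMOTION_RAILS,
    data_processor=encode_audio,  # coroutine, matching the async execution path
)
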
69 changes: 68 additions & 1 deletion packages/phoenix-evals/src/phoenix/evals/default_templates.py
@@ -6,7 +6,11 @@
QA_SPAN_PROMPT_TEMPLATE,
TOOL_CALLING_SPAN_PROMPT_TEMPLATE,
)
from phoenix.evals.templates import ClassificationTemplate
from phoenix.evals.templates import (
ClassificationTemplate,
PromptPartContentType,
PromptPartTemplate,
)

RAG_RELEVANCY_PROMPT_RAILS_MAP = OrderedDict({True: "relevant", False: "unrelated"})
RAG_RELEVANCY_PROMPT_BASE_TEMPLATE = """
@@ -702,6 +706,69 @@

USER_FRUSTRATION_PROMPT_RAILS_MAP = OrderedDict({True: "frustrated", False: "ok"})

TONE_EMOTION_TEMPLATE_PT1 = """
You are a helpful AI bot that checks for the emotional sentiment of the audio.
Analyze the audio file and determine the sentiment (e.g., positive, neutral, negative).
Your evaluation should provide a multiclass label from the following options:
['positive', 'neutral', 'negative'].

Here is the audio:
"""

TONE_EMOTION_TEMPLATE_PT2 = """{the_audio_string}"""

TONE_EMOTION_TEMPLATE_PT4 = """
Your response must be a string, either positive, neutral, or negative, and should not contain any
text or characters aside from that.
The string positive means that the emotion/tone of the audio suggests that the user is happy,
enthusiastic, etc., such as through positive intonation, energetic delivery, or cheerful
expressions, while neutral reflects a lack of strong emotional cues, and negative indicates
frustration, heightened intensity, and other negative emotions.
"""

EXPLANATION_TEMPLATE_PT = """
Write out in a step-by-step manner an EXPLANATION to show how you determined if the answer was
positive, neutral, or negative.

EXPLANATION:
"""

TONE_EMOTION_RAILS = ["positive", "neutral", "negative"]

AUDIO_SENTIMENT_TEMPLATE = ClassificationTemplate(
rails=TONE_EMOTION_RAILS,
template=[
PromptPartTemplate(
content_type=PromptPartContentType.TEXT,
template=TONE_EMOTION_TEMPLATE_PT1,
),
PromptPartTemplate(
content_type=PromptPartContentType.AUDIO, template=TONE_EMOTION_TEMPLATE_PT2
),
PromptPartTemplate(
content_type=PromptPartContentType.TEXT,
template=TONE_EMOTION_TEMPLATE_PT4,
),
],
explanation_template=[
PromptPartTemplate(
content_type=PromptPartContentType.TEXT,
template=EXPLANATION_TEMPLATE_PT,
),
PromptPartTemplate(
content_type=PromptPartContentType.TEXT,
template=TONE_EMOTION_TEMPLATE_PT1,
),
PromptPartTemplate(
content_type=PromptPartContentType.AUDIO, template=TONE_EMOTION_TEMPLATE_PT2
),
PromptPartTemplate(
content_type=PromptPartContentType.TEXT,
template=TONE_EMOTION_TEMPLATE_PT4,
),
],
)

RAG_RELEVANCY_PROMPT_TEMPLATE = ClassificationTemplate(
rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
template=RAG_RELEVANCY_PROMPT_BASE_TEMPLATE,
27 changes: 22 additions & 5 deletions packages/phoenix-evals/src/phoenix/evals/models/openai.py
@@ -19,6 +19,8 @@
from phoenix.evals.models.rate_limiters import RateLimiter
from phoenix.evals.templates import MultimodalPrompt, PromptPartContentType

from phoenix.evals.utils import get_audio_format_from_base64

MINIMUM_OPENAI_VERSION = "1.0.0"
MODEL_TOKEN_LIMIT_MAPPING = {
"gpt-3.5-turbo-instruct": 4096,
@@ -282,11 +284,26 @@ def _build_messages(
self, prompt: MultimodalPrompt, system_instruction: Optional[str] = None
) -> List[Dict[str, str]]:
messages = []
for parts in prompt.parts:
if parts.content_type == PromptPartContentType.TEXT:
messages.append({"role": "system", "content": parts.content})
for part in prompt.parts:
if part.content_type == PromptPartContentType.TEXT:
messages.append({"role": "system", "content": part.content})
elif part.content_type == PromptPartContentType.AUDIO:
messages.append(
{ # type: ignore
"role": "user",
"content": [
{
"type": "input_audio",
"input_audio": {
"data": part.content,
"format": get_audio_format_from_base64(part.content),
},
}
],
}
)
else:
raise ValueError(f"Unsupported content type: {parts.content_type}")
raise ValueError(f"Unsupported content type: {part.content_type}")
if system_instruction:
messages.insert(0, {"role": "system", "content": str(system_instruction)})
return messages
@@ -321,7 +338,7 @@ def _generate(self, prompt: Union[str, MultimodalPrompt], **kwargs: Any) -> str:
prompt = MultimodalPrompt.from_string(prompt)

invoke_params = self.invocation_params
messages = self._build_messages(prompt, kwargs.get("instruction"))
messages = self._build_messages(prompt=prompt, system_instruction=kwargs.get("instruction"))
if functions := kwargs.get("functions"):
invoke_params["functions"] = functions
if function_call := kwargs.get("function_call"):
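
Reviewer note: for reference, the message list that _build_messages now produces for a text part followed by an audio part has roughly this shape. Values are illustrative; the Base64 payload and prompt text are abridged.

messages = [
    {"role": "system", "content": "You are a helpful AI bot that checks for the emotional sentiment of the audio. ..."},
    {
        "role": "user",
        "content": [
            {
                "type": "input_audio",
                "input_audio": {
                    "data": "UklGRi...",  # Base64-encoded audio
                    "format": "wav",  # detected via get_audio_format_from_base64
                },
            }
        ],
    },
]
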
15 changes: 14 additions & 1 deletion packages/phoenix-evals/src/phoenix/evals/templates.py
@@ -31,7 +31,8 @@ def get_field(self, field_name: str, args: Sequence[Any], kwargs: Mapping[str, A

class PromptPartContentType(str, Enum):
TEXT = "text"
AUDIO_URL = "audio_url"
TEXT_DATA = "text_data"
AUDIO = "audio"


@dataclass
@@ -40,6 +41,7 @@ class PromptPart:
content: str


# TODO: ask about rename to PromptTemplatePart
@dataclass
class PromptPartTemplate:
content_type: PromptPartContentType
@@ -119,6 +121,17 @@ def _normalize_template(
return [PromptPartTemplate(content_type=PromptPartContentType.TEXT, template=template)]
return template

def get_data_template_variable(self) -> Union[str, None]:
if isinstance(self.template, str):
return None

for template_message in self.template:
if (
template_message.content_type == PromptPartContentType.AUDIO
or template_message.content_type == PromptPartContentType.TEXT_DATA
):
return template_message.template.strip("{}")


class ClassificationTemplate(PromptTemplate):
def __init__(
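
Reviewer note: a small sketch of what get_data_template_variable is expected to return for the audio template defined in default_templates.py. This is an inference from the diff above, not an existing test.

from phoenix.evals.default_templates import AUDIO_SENTIMENT_TEMPLATE

# The AUDIO part's template is "{the_audio_string}", so stripping the braces
# yields the name of the dataframe column that data_processor should transform.
assert AUDIO_SENTIMENT_TEMPLATE.get_data_template_variable() == "the_audio_string"
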
19 changes: 19 additions & 0 deletions packages/phoenix-evals/src/phoenix/evals/utils.py
@@ -1,10 +1,12 @@
import base64
import json
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple
from urllib.error import HTTPError
from urllib.request import urlopen
from zipfile import ZipFile

import filetype
import pandas as pd
from tqdm.auto import tqdm

@@ -174,3 +176,20 @@ def _default_openai_function(
def printif(condition: bool, *args: Any, **kwargs: Any) -> None:
if condition:
tqdm.write(*args, **kwargs)


def get_audio_format_from_base64(enc_str: str) -> str:
"""
Determines the audio format from a Base64 encoded string.

Args:
enc_str (str): The Base64 encoded audio data.

Returns:
str: The detected audio format (e.g., 'wav', 'mp3', 'flac', 'ogg', 'aac'); falls back to
'pcm' when no container header can be identified.
"""
# Decode the Base64 string back to bytes and guess the file type
audio_bytes = base64.b64decode(enc_str)
kind = filetype.guess(audio_bytes)

# Raw, headerless data (e.g., PCM samples) cannot be identified, so fall back to "pcm"
return kind.extension if kind is not None else "pcm"
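
Reviewer note: a quick usage sketch for the helper; the file path is a placeholder.

import base64

from phoenix.evals.utils import get_audio_format_from_base64

with open("clip.wav", "rb") as f:  # placeholder path to a local WAV file
    encoded = base64.b64encode(f.read()).decode("utf-8")

print(get_audio_format_from_base64(encoded))  # "wav" for a RIFF/WAVE file
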
@@ -84,6 +84,17 @@ def classification_dataframe():
)


@pytest.fixture
def audio_classification_dataframe():
return pd.DataFrame(
[
{
"input": "/AAABAAgABwADAAwACAAOABAACgAQABIAFQASABkCAAGAPACMAGgAcACAAFwAmACcAHQAuA=",
},
]
)


@pytest.fixture
def classification_responses():
return [
@@ -334,6 +345,15 @@ def test_classify_fn_call_explain(
)


@pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
def test_classify_data_processor(
openai_api_key: str, classification_dataframe: DataFrame, respx_mock: respx.mock
):
# TODO: implement this test for the data_processor argument
pass


@pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
def test_llm_classify_prints_to_stdout_with_verbose_flag(
classification_dataframe: DataFrame,
47 changes: 46 additions & 1 deletion packages/phoenix-evals/tests/phoenix/evals/test_utils.py
@@ -1,4 +1,6 @@
from phoenix.evals.utils import NOT_PARSABLE, snap_to_rail
from dataclasses import dataclass

from phoenix.evals.utils import NOT_PARSABLE, get_audio_format_from_base64, snap_to_rail


def test_snap_to_rail():
@@ -16,3 +18,46 @@ def test_snap_to_rail():
assert snap_to_rail("a", ["a", "b", "c"]) == "a"
assert snap_to_rail(" abc", ["a", "ab", "abc"]) == "abc"
assert snap_to_rail("abc", ["abc", "a", "ab"]) == "abc"


def test_get_audio_format_from_base64():
@dataclass
class Sample:
enc_data: str
format: str

samples = [
Sample(
enc_data="UklGRiSaCABXQVZFZm10IBAAAAABAAIARKwAABCxAgAEABAAZGF0YQCaCABy+HL4BPsE+w//D"
"/9WAlYCGgEaAfIA8gA6AToB0ADQ",
format="wav",
),
Sample(
enc_data="SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU3LjgzLjEwMAAAAAAAAAAAAAAA"
"//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
format="mp3",
),
Sample(
enc_data="T2dnUwACAAAAAAAAAABdwLNHAAAAANYQEycBHgF2b3JiaXMAAAAAAkSsAAAAAAAAA3ECAAAAAA"
"C4AU9nZ1MAAAAAAAAAAAAAXcCz",
format="ogg",
),
Sample(
enc_data="ZkxhQwAAACISABIAAAkBADcsCsRC8ABHLQsq7uacAVPLZSxxjf3w6f8tBAAALg0AAABMYXZmNTc"
"uODMuMTAwAQAAABUAAABlbmNv",
format="flac",
),
Sample(
enc_data="//FQgAP//N4EAExhdmM1Ny4xMDcuMTAwAEIgCMEYOP/xUIBxH/whKwwBNFoeYDeQgCQQEQQCgQC"
"w7jBLFAWFQ2EiTzU874zjjPP3",
format="aac",
),
Sample(
enc_data="cvhy+AT7BPsP/w//VgJWAhoBGgHyAPIAOgE6AdAA0ACA/4D/Nvs2+735vfkC+AL4EfYR9izyLPL"
"Z9dn1AvoC+vn3+fdy+XL5K/cr",
format="pcm"
)
]

for sample in samples:
assert get_audio_format_from_base64(sample.enc_data) == sample.format