diff --git a/packages/phoenix-evals/src/phoenix/evals/classify.py b/packages/phoenix-evals/src/phoenix/evals/classify.py
index 8d47145da4..c7bc346cd6 100644
--- a/packages/phoenix-evals/src/phoenix/evals/classify.py
+++ b/packages/phoenix-evals/src/phoenix/evals/classify.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+import inspect
 import logging
 from collections import defaultdict
 from enum import Enum
 from itertools import product
 from typing import (
     Any,
+    Callable,
     DefaultDict,
     Dict,
     Iterable,
@@ -68,6 +70,7 @@ def llm_classify(
     model: BaseModel,
     template: Union[ClassificationTemplate, PromptTemplate, str],
     rails: List[str],
+    data_processor: Optional[Callable[[Any], str]] = None,
     system_instruction: Optional[str] = None,
     verbose: bool = False,
     use_function_calling_if_available: bool = True,
@@ -102,6 +105,9 @@
         rails (List[str]): A list of strings representing the possible output classes of the
             model's predictions.
 
+        data_processor (Optional[Callable[[Any], str]]): An optional function that preprocesses
+            the data before it is passed to the model; async execution requires a coroutine function.
+
         system_instruction (Optional[str], optional): An optional system message.
 
         verbose (bool, optional): If True, prints detailed info to stdout such as
@@ -153,6 +159,8 @@
             details about execution errors that may have occurred during the classification as well
             as the total runtime of each classification (in seconds).
     """
+    data_var_name = template.get_data_template_variable()
+
     concurrency = concurrency or model.default_concurrency
     # clients need to be reloaded to ensure that async evals work properly
     model.reload_client()
@@ -213,6 +221,16 @@ def _process_response(response: str) -> Tuple[str, Optional[str]]:
 
     async def _run_llm_classification_async(input_data: pd.Series[Any]) -> ParsedLLMResponse:
         with set_verbosity(model, verbose) as verbose_model:
+            if data_processor:
+                if not inspect.iscoroutinefunction(data_processor):
+                    raise ValueError("data_processor must be an asynchronous function")
+
+                # Preprocess the data element and write the result back to its template variable
+                input_data.loc[data_var_name] = await data_processor(input_data.loc[data_var_name])
+                input_data.index = [
+                    data_var_name if idx == data_var_name else idx for idx in input_data.index
+                ]
+
             prompt = _map_template(input_data)
             response = await verbose_model._async_generate(
                 prompt, instruction=system_instruction, **model_kwargs
@@ -222,6 +240,16 @@ async def _run_llm_classification_async(input_data: pd.Series[Any]) -> ParsedLLM
 
     def _run_llm_classification_sync(input_data: pd.Series[Any]) -> ParsedLLMResponse:
         with set_verbosity(model, verbose) as verbose_model:
+            if data_processor:
+                if inspect.iscoroutinefunction(data_processor):
+                    raise ValueError("data_processor must be a synchronous function")
+
+                # Preprocess the data element and write the result back to its template variable
+                input_data.loc[data_var_name] = data_processor(input_data.loc[data_var_name])
+                input_data.index = [
+                    data_var_name if idx == data_var_name else idx for idx in input_data.index
+                ]
+
             prompt = _map_template(input_data)
             response = verbose_model._generate(
                 prompt, instruction=system_instruction, **model_kwargs
diff --git a/packages/phoenix-evals/src/phoenix/evals/default_templates.py b/packages/phoenix-evals/src/phoenix/evals/default_templates.py
index 98f37e2d40..f07e953111 100644
--- a/packages/phoenix-evals/src/phoenix/evals/default_templates.py
+++ b/packages/phoenix-evals/src/phoenix/evals/default_templates.py
@@ -6,7 +6,11 @@
     QA_SPAN_PROMPT_TEMPLATE,
     TOOL_CALLING_SPAN_PROMPT_TEMPLATE,
 )
-from phoenix.evals.templates import ClassificationTemplate
+from phoenix.evals.templates import (
+    ClassificationTemplate,
+    PromptPartContentType,
+    PromptPartTemplate,
+)
 
 RAG_RELEVANCY_PROMPT_RAILS_MAP = OrderedDict({True: "relevant", False: "unrelated"})
 RAG_RELEVANCY_PROMPT_BASE_TEMPLATE = """
@@ -702,6 +706,69 @@
 
 USER_FRUSTRATION_PROMPT_RAILS_MAP = OrderedDict({True: "frustrated", False: "ok"})
+
+TONE_EMOTION_TEMPLATE_PT1 = """
+You are a helpful AI bot that checks for the emotional sentiment of the audio.
+Analyze the audio file and determine the sentiment (e.g., positive, neutral, negative).
+Your evaluation should provide a multiclass label from the following options:
+['positive', 'neutral', 'negative'].
+
+Here is the audio:
+"""
+
+TONE_EMOTION_TEMPLATE_PT2 = """{the_audio_string}"""
+
+TONE_EMOTION_TEMPLATE_PT4 = """
+Your response must be a string, either positive, neutral, or negative, and should not contain any
+text or characters aside from that.
+The string positive means that the emotion/tone of the audio suggests that the user is happy,
+enthusiastic, etc., such as through positive intonation, energetic delivery, or cheerful
+expressions, while neutral reflects a lack of strong emotional cues, and negative indicates
+frustration, heightened intensity, and other negative emotions.
+"""
+
+EXPLANATION_TEMPLATE_PT = """
+Write out in a step-by-step manner an EXPLANATION to show how you determined if the answer was
+positive, neutral, or negative.
+
+EXPLANATION:
+"""
+
+TONE_EMOTION_RAILS = ["positive", "neutral", "negative"]
+
+AUDIO_SENTIMENT_TEMPLATE = ClassificationTemplate(
+    rails=TONE_EMOTION_RAILS,
+    template=[
+        PromptPartTemplate(
+            content_type=PromptPartContentType.TEXT,
+            template=TONE_EMOTION_TEMPLATE_PT1,
+        ),
+        PromptPartTemplate(
+            content_type=PromptPartContentType.AUDIO, template=TONE_EMOTION_TEMPLATE_PT2
+        ),
+        PromptPartTemplate(
+            content_type=PromptPartContentType.TEXT,
+            template=TONE_EMOTION_TEMPLATE_PT4,
+        ),
+    ],
+    explanation_template=[
+        PromptPartTemplate(
+            content_type=PromptPartContentType.TEXT,
+            template=EXPLANATION_TEMPLATE_PT,
+        ),
+        PromptPartTemplate(
+            content_type=PromptPartContentType.TEXT,
+            template=TONE_EMOTION_TEMPLATE_PT1,
+        ),
+        PromptPartTemplate(
+            content_type=PromptPartContentType.AUDIO, template=TONE_EMOTION_TEMPLATE_PT2
+        ),
+        PromptPartTemplate(
+            content_type=PromptPartContentType.TEXT,
+            template=TONE_EMOTION_TEMPLATE_PT4,
+        ),
+    ],
+)
+
 RAG_RELEVANCY_PROMPT_TEMPLATE = ClassificationTemplate(
     rails=list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values()),
     template=RAG_RELEVANCY_PROMPT_BASE_TEMPLATE,
diff --git a/packages/phoenix-evals/src/phoenix/evals/models/openai.py b/packages/phoenix-evals/src/phoenix/evals/models/openai.py
index c7efe16229..f8c6288cad 100644
--- a/packages/phoenix-evals/src/phoenix/evals/models/openai.py
+++ b/packages/phoenix-evals/src/phoenix/evals/models/openai.py
@@ -19,6 +19,8 @@
 from phoenix.evals.models.rate_limiters import RateLimiter
 from phoenix.evals.templates import MultimodalPrompt, PromptPartContentType
 
+from ...evals.utils import get_audio_format_from_base64  # todo change...
+
 MINIMUM_OPENAI_VERSION = "1.0.0"
 MODEL_TOKEN_LIMIT_MAPPING = {
     "gpt-3.5-turbo-instruct": 4096,
@@ -282,11 +284,26 @@ def _build_messages(
         self, prompt: MultimodalPrompt, system_instruction: Optional[str] = None
     ) -> List[Dict[str, str]]:
         messages = []
-        for parts in prompt.parts:
-            if parts.content_type == PromptPartContentType.TEXT:
-                messages.append({"role": "system", "content": parts.content})
+        for part in prompt.parts:
+            if part.content_type == PromptPartContentType.TEXT:
+                messages.append({"role": "system", "content": part.content})
+            elif part.content_type == PromptPartContentType.AUDIO:
+                messages.append(
+                    {  # type: ignore
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "input_audio",
+                                "input_audio": {
+                                    "data": part.content,
+                                    "format": get_audio_format_from_base64(part.content),
+                                },
+                            }
+                        ],
+                    }
+                )
             else:
-                raise ValueError(f"Unsupported content type: {parts.content_type}")
+                raise ValueError(f"Unsupported content type: {part.content_type}")
         if system_instruction:
             messages.insert(0, {"role": "system", "content": str(system_instruction)})
         return messages
@@ -321,7 +338,7 @@ def _generate(self, prompt: Union[str, MultimodalPrompt], **kwargs: Any) -> str:
             prompt = MultimodalPrompt.from_string(prompt)
 
         invoke_params = self.invocation_params
-        messages = self._build_messages(prompt, kwargs.get("instruction"))
+        messages = self._build_messages(prompt=prompt, system_instruction=kwargs.get("instruction"))
         if functions := kwargs.get("functions"):
             invoke_params["functions"] = functions
         if function_call := kwargs.get("function_call"):
diff --git a/packages/phoenix-evals/src/phoenix/evals/templates.py b/packages/phoenix-evals/src/phoenix/evals/templates.py
index 4f2a578ed6..f7202b38cd 100644
--- a/packages/phoenix-evals/src/phoenix/evals/templates.py
+++ b/packages/phoenix-evals/src/phoenix/evals/templates.py
@@ -31,7 +31,8 @@ def get_field(self, field_name: str, args: Sequence[Any], kwargs: Mapping[str, A
 
 class PromptPartContentType(str, Enum):
     TEXT = "text"
-    AUDIO_URL = "audio_url"
+    TEXT_DATA = "text_data"
+    AUDIO = "audio"
 
 
 @dataclass
@@ -40,6 +41,7 @@ class PromptPart:
     content: str
 
 
+# TODO: ask about rename to PromptTemplatePart
 @dataclass
 class PromptPartTemplate:
     content_type: PromptPartContentType
@@ -119,6 +121,17 @@ def _normalize_template(
             return [PromptPartTemplate(content_type=PromptPartContentType.TEXT, template=template)]
         return template
 
+    def get_data_template_variable(self) -> Union[str, None]:
+        if isinstance(self.template, str):
+            return None
+
+        for template_message in self.template:
+            if (
+                template_message.content_type == PromptPartContentType.AUDIO
+                or template_message.content_type == PromptPartContentType.TEXT_DATA
+            ):
+                return template_message.template.strip("{}")
+
 
 class ClassificationTemplate(PromptTemplate):
     def __init__(
diff --git a/packages/phoenix-evals/src/phoenix/evals/utils.py b/packages/phoenix-evals/src/phoenix/evals/utils.py
index 6721f51f7d..9cb9726b52 100644
--- a/packages/phoenix-evals/src/phoenix/evals/utils.py
+++ b/packages/phoenix-evals/src/phoenix/evals/utils.py
@@ -1,3 +1,4 @@
+import base64
 import json
 from io import BytesIO
 from typing import Any, Dict, List, Optional, Tuple
@@ -5,6 +6,7 @@
 from urllib.request import urlopen
 from zipfile import ZipFile
 
+import filetype
 import pandas as pd
 from tqdm.auto import tqdm
 
@@ -174,3 +176,20 @@ def _default_openai_function(
 def printif(condition: bool, *args: Any, **kwargs: Any) -> None:
     if condition:
         tqdm.write(*args, **kwargs)
+
+
+def get_audio_format_from_base64(enc_str: str) -> str:
+    """
+    Determines the audio format from a Base64-encoded string.
+
+    Args:
+        enc_str (str): The Base64-encoded audio data.
+
+    Returns:
+        str: The detected audio format extension (e.g., 'wav', 'mp3', 'flac', 'ogg', or 'aac').
+    """
+    # Decode the Base64 string back to bytes and guess the file type
+    audio_bytes = base64.b64decode(enc_str)
+    kind = filetype.guess(audio_bytes)
+
+    return kind.extension
diff --git a/packages/phoenix-evals/tests/phoenix/evals/functions/test_classify.py b/packages/phoenix-evals/tests/phoenix/evals/functions/test_classify.py
index 84cb6803d7..a7e04a1a3e 100644
--- a/packages/phoenix-evals/tests/phoenix/evals/functions/test_classify.py
+++ b/packages/phoenix-evals/tests/phoenix/evals/functions/test_classify.py
@@ -84,6 +84,17 @@ def classification_dataframe():
     )
 
 
+@pytest.fixture
+def audio_classification_dataframe():
+    return pd.DataFrame(
+        [
+            {
+                "input": "/AAABAAgABwADAAwACAAOABAACgAQABIAFQASABkCAAGAPACMAGgAcACAAFwAmACcAHQAuA=",
+            },
+        ]
+    )
+
+
 @pytest.fixture
 def classification_responses():
     return [
@@ -334,6 +345,15 @@ def test_classify_fn_call_explain(
     )
 
 
+@pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
+def test_classify_data_processor(
+    openai_api_key: str, classification_dataframe: DataFrame, respx_mock: respx.mock
+):
+    # Test for the case where the encoded string is not of a valid file type
+    # todo implement this test
+    pass
+
+
 @pytest.mark.respx(base_url="https://api.openai.com/v1/chat/completions")
 def test_llm_classify_prints_to_stdout_with_verbose_flag(
     classification_dataframe: DataFrame,
diff --git a/packages/phoenix-evals/tests/phoenix/evals/test_utils.py b/packages/phoenix-evals/tests/phoenix/evals/test_utils.py
index c0bbdd093a..9923ab3abb 100644
--- a/packages/phoenix-evals/tests/phoenix/evals/test_utils.py
+++ b/packages/phoenix-evals/tests/phoenix/evals/test_utils.py
@@ -1,4 +1,6 @@
-from phoenix.evals.utils import NOT_PARSABLE, snap_to_rail
+from dataclasses import dataclass
+
+from phoenix.evals.utils import NOT_PARSABLE, get_audio_format_from_base64, snap_to_rail
 
 
 def test_snap_to_rail():
@@ -16,3 +18,46 @@
     assert snap_to_rail("a", ["a", "b", "c"]) == "a"
     assert snap_to_rail(" abc", ["a", "ab", "abc"]) == "abc"
     assert snap_to_rail("abc", ["abc", "a", "ab"]) == "abc"
+
+
+def test_get_audio_format_from_base64():
+    @dataclass
+    class Sample:
+        enc_data: str
+        format: str
+
+    samples = [
+        Sample(
+            enc_data="UklGRiSaCABXQVZFZm10IBAAAAABAAIARKwAABCxAgAEABAAZGF0YQCaCABy+HL4BPsE+w//D"
+            "/9WAlYCGgEaAfIA8gA6AToB0ADQ",
+            format="wav",
+        ),
+        Sample(
+            enc_data="SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU3LjgzLjEwMAAAAAAAAAAAAAAA"
+            "//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
+            format="mp3",
+        ),
+        Sample(
+            enc_data="T2dnUwACAAAAAAAAAABdwLNHAAAAANYQEycBHgF2b3JiaXMAAAAAAkSsAAAAAAAAA3ECAAAAAA"
+            "C4AU9nZ1MAAAAAAAAAAAAAXcCz",
+            format="ogg",
+        ),
+        Sample(
+            enc_data="ZkxhQwAAACISABIAAAkBADcsCsRC8ABHLQsq7uacAVPLZSxxjf3w6f8tBAAALg0AAABMYXZmNTc"
+            "uODMuMTAwAQAAABUAAABlbmNv",
+            format="flac",
+        ),
+        Sample(
+            enc_data="//FQgAP//N4EAExhdmM1Ny4xMDcuMTAwAEIgCMEYOP/xUIBxH/whKwwBNFoeYDeQgCQQEQQCgQC"
+            "w7jBLFAWFQ2EiTzU874zjjPP3",
+            format="aac",
+        ),
+        Sample(
+            enc_data="cvhy+AT7BPsP/w//VgJWAhoBGgHyAPIAOgE6AdAA0ACA/4D/Nvs2+735vfkC+AL4EfYR9izyLPL"
+            "Z9dn1AvoC+vn3+fdy+XL5K/cr",
+            format="pcm",
+        ),
+    ]
+
+    for sample in samples:
+        assert get_audio_format_from_base64(sample.enc_data) == sample.format
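Usage sketch (not part of the patch above): the snippet below shows how the new data_processor argument and the AUDIO_SENTIMENT_TEMPLATE introduced in this diff might be wired together. The helper name encode_audio, the file path, and the model name "gpt-4o-audio-preview" are illustrative assumptions; the only requirements implied by the diff are that the dataframe has a column matching the template's data variable ({the_audio_string}) and that the processor is a coroutine function when classification runs on the async path (a plain function for the sync path).

# Hypothetical example, not part of the patch: feeding Base64-encoded audio to
# llm_classify via the new data_processor hook.
import base64

import pandas as pd

from phoenix.evals import OpenAIModel, llm_classify
from phoenix.evals.default_templates import AUDIO_SENTIMENT_TEMPLATE, TONE_EMOTION_RAILS


async def encode_audio(path: str) -> str:
    # Assumed helper: read a local audio file and return it Base64-encoded,
    # which the OpenAI wrapper then forwards as an "input_audio" message part.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


# The column name must match the template's data variable, {the_audio_string}.
df = pd.DataFrame({"the_audio_string": ["recordings/example_call.wav"]})

results = llm_classify(
    df,
    model=OpenAIModel(model="gpt-4o-audio-preview"),  # assumed audio-capable model name
    template=AUDIO_SENTIMENT_TEMPLATE,
    rails=TONE_EMOTION_RAILS,
    data_processor=encode_audio,  # coroutine required on the async execution path
)
print(results)

For the synchronous execution path, the same processor written as a regular def would satisfy the inverse check in _run_llm_classification_sync.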