From 2a0ddd9d9183f761f7db9f47c9219ad615976cbe Mon Sep 17 00:00:00 2001 From: Mikyo King Date: Thu, 26 Sep 2024 18:53:35 -0700 Subject: [PATCH 01/42] docs: initial playground PRD (#4777) --- cspell.json | 1 + internal_docs/specs/playground.md | 128 ++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 internal_docs/specs/playground.md diff --git a/cspell.json b/cspell.json index f128d64789..0ac8a3cc89 100644 --- a/cspell.json +++ b/cspell.json @@ -19,6 +19,7 @@ "dunder", "Evals", "fastapi", + "genai", "gitbook", "graphiql", "HDBSCAN", diff --git a/internal_docs/specs/playground.md b/internal_docs/specs/playground.md new file mode 100644 index 0000000000..a0f9c54c61 --- /dev/null +++ b/internal_docs/specs/playground.md @@ -0,0 +1,128 @@ +# Playground + +Authors: @mikeldking + +As a user of Phoenix I don't want to have to go back to my IDE to iterate on a prompt. I don’t have to worry about programming languages or dependencies. I want to be able to use the data stored in Phoenix (spans, datasets) and run them through prompt(s) and prompt template(s). + +## Terminology + +- **operation** Refers to how the LLM is invoked (chat, text_completion). We will consider chat to be higher priority (https://opentelemetry.io/docs/specs/semconv/attributes-registry/gen-ai/) +- playground input source - refers to whether the input to the template is "manual" e.g. modifiable by the user or "dataset". + +## Use-cases + +A user may want to use the playground to: + +- Test a prompt template +- LLM Replay: replay a template or prompt change on an LLM Span - to live debug an improvement +- Run a template change on a dataset (sweep over a set of inputs) - e.g. 
a prompt change experiment +- A/B Testing of models and templates +- Evaluation template creation: run an evaluation template on a single chosen production span or dataset - the workflow is testing your Evals and being able to save the results as an experiment +- Synthetic data generation - use the playground to generate synthetic data or add columns to current rows of data in a dataset, to help create test data + +### Prompt Template + +As an AI engineer, I may want to use a prompt playground to explore synthesis, cost, latency, etc. under different scenarios. This means that the playground needs to be more flexible than a vendor’s playground as it needs to “unify” the API across vendors. + +As a user, I want to be able to "run" a template and see the results as the tokens arrive. But I also want this data to be recorded (as a span) so that I can use it for datasets and annotations (e.g. stash the ones that I like). + +### LLM Replay + +As an AI engineer that is already using Phoenix tracing, I want the ability to take an LLM span and replay the synthesis to see if a tweaked response will make any difference in the output. This means that all the necessary fields for synthesis must be able to be translated from semantic attribute values to the playground. + +- llm vendor +- llm name +- operation type (chat, text_completion) +- invocation parameters +- messages and roles +- tools +- output schema + +The above values will have to be translated from the span to corresponding values in the playground for invocation. + +Below is a typical attribute payload for a chat LLM span: + +```typescript +{ + + llm: { + output_messages: [ + { + message: { + content: "This is an AI Answer", + role: "assistant", + }, + }, + ], + model_name: "gpt-3.5-turbo", + token_count: { completion: 9.0, prompt: 1881.0, total: 1890.0 }, + input_messages: [ + { + message: { + content: "You are a chatbot", + role: "system", + }, + }, + { + message: { + content: "Answer me the following question. 
Are you sentient?", + role: "user", + }, + }, + ], + invocation_parameters: + '{"context_window": 16384, "num_output": -1, "is_chat_model": true, "is_function_calling_model": true, "model_name": "gpt-3.5-turbo"}', + }, + openinference: { span: { kind: "LLM" } }, +}; +``` + +For chat the following mapping will be used: + +- llm.input_messages -> invocation.messages +- llm.invocation_parameters -> invocation.parameters +- llm.model_name -> invocation.model (e.g. vendor + model_name) +- llm.tools -> invocation.tools +- llm.tool_selection (missing) -> playground.tool_selection (default to auto) +- (TBD) -> invocation.output_schema (output schema for JSON mode) + +### A/B Testing + +As an AI engineer I want the ability to create “multiple” playgrounds to answer certain types of questions: + +- Does prompt A produce better responses? +- Does model X produce better responses? + +In some cases I want to have things in A / B sync’d and sometimes I don’t. For that reason the UI should allow the user to: + +- Sync models - The user is adjusting the template or invocation parameters +- Sync templates - The user is adjusting the model +- Sync inputs - The user is testing different inputs + +### Evaluation Template + +As an AI engineer I want to ask questions about a previously recorded synthesis (e.g. LLM span). + +## Technical Features + +The following technical features are required for the playground: + +**Frontend** + +- Credential storage in local storage +- Span attribute to playground state translation + +**Backend** + +- LLM invocation interface (GraphQL or other) +- Span recording during synthesis +- Playground "session" tracking +- Playground dataset invocation tracking +- Streaming of synthesis results + +## Tracing Needs + +In order for spans -> playground invocation to be seamless, we must capture all necessary invocation parameters. This includes: + +- LLM/genai system - e.g. openai / anthropic etc. 
+- Output Schema - LLMs can adhere to OpenAPI schemas From a0a83a29ce93b68c6272efca9195b00ce6397702 Mon Sep 17 00:00:00 2001 From: Mikyo King Date: Thu, 26 Sep 2024 18:54:23 -0700 Subject: [PATCH 02/42] feat: Playground UI prototype (#4778) * docs: initial playground PRD * feat: playground prototype * add rudamentary messages UI * WIP * routing * add a playground button * cleanup * cleanup routes * Update app/src/pages/playground/PlaygroundTemplate.tsx * Update app/src/pages/playground/spanPlaygroundPageLoader.ts * cleanup store --- app/src/Routes.tsx | 21 +- app/src/pages/playground/Playground.tsx | 47 ++-- .../playground/PlaygroundChatTemplate.tsx | 53 ++++ app/src/pages/playground/PlaygroundInput.tsx | 9 +- .../PlaygroundInputModeRadioGroup.tsx | 32 +++ .../pages/playground/PlaygroundInstance.tsx | 43 ++++ .../pages/playground/PlaygroundTemplate.tsx | 66 ++++- .../pages/playground/SpanPlaygroundPage.tsx | 10 + .../spanPlaygroundPageLoaderQuery.graphql.ts | 156 ++++++++++++ app/src/pages/playground/constants.tsx | 1 + app/src/pages/playground/index.tsx | 2 + .../playground/spanPlaygroundPageLoader.ts | 32 +++ app/src/pages/playground/types.ts | 7 + app/src/pages/trace/SpanDetails.tsx | 15 ++ app/src/store/playgroundStore.tsx | 233 +++++++++++++++++- app/src/utils/__tests__/spanUtils.test.ts | 46 ++++ app/src/utils/spanUtils.ts | 3 + 17 files changed, 739 insertions(+), 37 deletions(-) create mode 100644 app/src/pages/playground/PlaygroundChatTemplate.tsx create mode 100644 app/src/pages/playground/PlaygroundInputModeRadioGroup.tsx create mode 100644 app/src/pages/playground/PlaygroundInstance.tsx create mode 100644 app/src/pages/playground/SpanPlaygroundPage.tsx create mode 100644 app/src/pages/playground/__generated__/spanPlaygroundPageLoaderQuery.graphql.ts create mode 100644 app/src/pages/playground/constants.tsx create mode 100644 app/src/pages/playground/spanPlaygroundPageLoader.ts create mode 100644 app/src/pages/playground/types.ts create mode 100644 
app/src/utils/__tests__/spanUtils.test.ts create mode 100644 app/src/utils/spanUtils.ts diff --git a/app/src/Routes.tsx b/app/src/Routes.tsx index d97d015dce..dc6c42057f 100644 --- a/app/src/Routes.tsx +++ b/app/src/Routes.tsx @@ -5,6 +5,7 @@ import { createBrowserRouter } from "react-router-dom"; import { datasetLoaderQuery$data } from "./pages/dataset/__generated__/datasetLoaderQuery.graphql"; import { embeddingLoaderQuery$data } from "./pages/embedding/__generated__/embeddingLoaderQuery.graphql"; import { Layout } from "./pages/Layout"; +import { spanPlaygroundPageLoaderQuery$data } from "./pages/playground/__generated__/spanPlaygroundPageLoaderQuery.graphql"; import { projectLoaderQuery$data } from "./pages/project/__generated__/projectLoaderQuery.graphql"; import { APIsPage, @@ -40,6 +41,8 @@ import { ResetPasswordPage, ResetPasswordWithTokenPage, SettingsPage, + SpanPlaygroundPage, + spanPlaygroundPageLoader, TracePage, TracingRoot, } from "./pages"; @@ -157,11 +160,25 @@ const router = createBrowserRouter( } handle={{ crumb: () => "Playground", }} - /> + > + } /> + } + loader={spanPlaygroundPageLoader} + handle={{ + crumb: (data: spanPlaygroundPageLoaderQuery$data) => { + if (data.span.__typename === "Span") { + return `span ${data.span.context.spanId}`; + } + return "span unknown"; + }, + }} + /> + } diff --git a/app/src/pages/playground/Playground.tsx b/app/src/pages/playground/Playground.tsx index b326596816..4fe7b37344 100644 --- a/app/src/pages/playground/Playground.tsx +++ b/app/src/pages/playground/Playground.tsx @@ -1,25 +1,16 @@ import React from "react"; import { Panel, PanelGroup, PanelResizeHandle } from "react-resizable-panels"; -import { css } from "@emotion/react"; import { Button, Flex, Heading, View } from "@arizeai/components"; import { resizeHandleCSS } from "@phoenix/components/resize"; -import { PlaygroundProvider } from "@phoenix/contexts/PlaygroundContext"; +import { + PlaygroundProvider, + usePlaygroundContext, +} from 
"@phoenix/contexts/PlaygroundContext"; -import { PlaygroundInput } from "./PlaygroundInput"; +import { PlaygroundInstance } from "./PlaygroundInstance"; import { PlaygroundOperationTypeRadioGroup } from "./PlaygroundOperationTypeRadioGroup"; -import { PlaygroundOutput } from "./PlaygroundOutput"; -import { PlaygroundTemplate } from "./PlaygroundTemplate"; -import { PlaygroundTools } from "./PlaygroundTools"; - -const panelContentCSS = css` - padding: var(--ac-global-dimension-size-200); - overflow: auto; - display: flex; - flex-direction: column; - gap: var(--ac-global-dimension-size-200); -`; export function Playground() { return ( @@ -39,17 +30,25 @@ export function Playground() { + + + ); +} + +function PlaygroundInstances() { + const instances = usePlaygroundContext((state) => state.instances); + return ( + - - - - - - - - - + {instances.map((instance, i) => ( + <> + {i !== 0 && } + + + + + ))} - + ); } diff --git a/app/src/pages/playground/PlaygroundChatTemplate.tsx b/app/src/pages/playground/PlaygroundChatTemplate.tsx new file mode 100644 index 0000000000..99623e68c8 --- /dev/null +++ b/app/src/pages/playground/PlaygroundChatTemplate.tsx @@ -0,0 +1,53 @@ +import React from "react"; + +import { Card, TextArea } from "@arizeai/components"; + +import { usePlaygroundContext } from "@phoenix/contexts/PlaygroundContext"; + +import { PlaygroundInstanceProps } from "./types"; + +interface PlaygroundChatTemplateProps extends PlaygroundInstanceProps {} +export function PlaygroundChatTemplate(props: PlaygroundChatTemplateProps) { + const id = props.playgroundInstanceId; + // TODO: remove the hard coding of the first instance + const instances = usePlaygroundContext((state) => state.instances); + const updateInstance = usePlaygroundContext((state) => state.updateInstance); + const playground = instances.find((instance) => instance.id === id); + if (!playground) { + throw new Error(`Playground instance ${id} not found`); + } + const { template } = playground; + if 
(template.__type !== "chat") { + throw new Error(`Invalid template type ${template.__type}`); + } + + return ( +
    + {template.messages.map((message, index) => { + return ( +
  • + +