diff --git a/src/helm/benchmark/static_build/assets/index-2d9bbc7a.js b/src/helm/benchmark/static_build/assets/index-2d9bbc7a.js deleted file mode 100644 index 18935aa9ea..0000000000 --- a/src/helm/benchmark/static_build/assets/index-2d9bbc7a.js +++ /dev/null @@ -1,10 +0,0 @@ -import{r as l,a as Rs,L as f,O as Is,d as Ls,u as ke,f as Ce,H as Ss,h as ks,i as D,R as Cs}from"./react-d4a0b69b.js";import{g as F,b as $,m as oe,s as Pe,a as Ps,d as ye,y as Ts,c as Me,e as he,l as xe}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))a(r);new MutationObserver(r=>{for(const i of r)if(i.type==="childList")for(const c of i.addedNodes)c.tagName==="LINK"&&c.rel==="modulepreload"&&a(c)}).observe(document,{childList:!0,subtree:!0});function n(r){const i={};return r.integrity&&(i.integrity=r.integrity),r.referrerPolicy&&(i.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?i.credentials="include":r.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function a(r){if(r.ep)return;r.ep=!0;const i=n(r);fetch(r.href,i)}})();var Te={exports:{}},le={};/** - * @license React - * react-jsx-runtime.production.min.js - * - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */var Bs=l,Ds=Symbol.for("react.element"),Hs=Symbol.for("react.fragment"),Us=Object.prototype.hasOwnProperty,Os=Bs.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,Fs={key:!0,ref:!0,__self:!0,__source:!0};function Be(s,t,n){var a,r={},i=null,c=null;n!==void 0&&(i=""+n),t.key!==void 0&&(i=""+t.key),t.ref!==void 0&&(c=t.ref);for(a in t)Us.call(t,a)&&!Fs.hasOwnProperty(a)&&(r[a]=t[a]);if(s&&s.defaultProps)for(a in t=s.defaultProps,t)r[a]===void 0&&(r[a]=t[a]);return{$$typeof:Ds,type:s,key:i,ref:c,props:r,_owner:Os.current}}le.Fragment=Hs;le.jsx=Be;le.jsxs=Be;Te.exports=le;var e=Te.exports,de={},Re=Rs;de.createRoot=Re.createRoot,de.hydrateRoot=Re.hydrateRoot;function _s({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const Vs=l.forwardRef(_s),De=Vs;function zs({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const Ws=l.forwardRef(zs),qs=Ws;function Gs({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const Qs=l.forwardRef(Gs),Ks=Qs,He=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,Ue=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function Js({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Xs=l.forwardRef(Js),Ys=Xs;function $s({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Zs=l.forwardRef($s),et=Zs;function st({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const tt=l.forwardRef(st),nt=tt;function at({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const rt=l.forwardRef(at),Oe=rt;function lt({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const it=l.forwardRef(lt),ct=it;function ot({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const dt=l.forwardRef(ot),mt=dt;function ue(s,t){return t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function Fe(){const[s,t]=l.useState([]),[n,a]=l.useState();return l.useEffect(()=>{if(n&&n.title&&n.title!=="All Leaderboards"){const r=n.title==="Lite"||n.title==="Classic"?"HELM "+n.title:n.title;document.title=r+" - Holistic Evaluation of Language Models (HELM)"}},[n]),l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(r=>r.json()).then(r=>{if(t(r),window.PROJECT_ID){const i=r.find(c=>c.id===window.PROJECT_ID);a(i)}else{const i=r.find(c=>c.id==="lite");a(i)}}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),n===void 0||n.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[n.title," ",e.jsx(Oe,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((r,i)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:ue(void 0,r.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:n.title===r.title?"underline":"",children:r.title}),": ",r.description]})},i))})]})}function E(s){return s.startsWith("http://")||s.startsWith("https://")?s:`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function Z(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function ht(s){try{return await(await fetch(E(`${Z()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function xt(){const[s,t]=l.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[n,a]=l.useState();l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(g=>g.json()).then(g=>{if(window.PROJECT_ID){const w=g.find(j=>j.id===window.PROJECT_ID);a(w)}else{const w=g.find(j=>j.id==="lite");a(w)}}).catch(g=>{console.error("Error fetching JSON:",g)})},[]),l.useEffect(()=>{const g=new AbortController;async function w(){const j=await ht(g.signal);t(j)}return w(),()=>g.abort()},[]);const r=n!==void 0&&n.releases!==void 0?n.releases:["v1.0.0"],i=s.release||s.suite||null;if(!i)return null;const c=`Release ${i} (${s.date})`;if(r.length<=1)return e.jsx("div",{children:c});const d=r.indexOf(i),x=d<0?e.jsx(F,{color:"blue",children:"preview"}):d===0?e.jsx(F,{color:"blue",children:"latest"}):e.jsx(F,{color:"yellow",children:"stale"});return e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[c," ",x," ",e.jsx(Oe,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[50] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:r.map(g=>e.jsx("li",{children:e.jsx("a",{href:ue(g,n?n.id:"lite"),className:"block",role:"menuitem",children:g})},g))})]})}function ut(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(f,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(f,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(f,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(f,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(f,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(f,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:Ue,className:"object-contain"})}),e.jsx(Fe,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(f,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(f,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(f,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(f,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(f,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(xt,{})})]})})]})}function ft(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(De,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(f,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:He,className:"object-contain"})}),e.jsx(f,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:Ue,className:"object-contain"})}),e.jsx(Fe,{})]})]})}function gt(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(ft,{}):e.jsx(ut,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Is,{})})})]})}async function C(s){try{return await(await fetch(E(`${Z()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function pt({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function J({value:s}){return e.jsx("span",{children:e.jsx(Ls,{components:{a:pt},children:s})})}function k({title:s,subtitle:t,markdown:n=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),n&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(J,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const jt={open:"green",limited:"yellow",closed:"red"},bt={open:"Open",limited:"Limited",closed:"Closed"};function wt({level:s}){return e.jsx(F,{color:jt[s],children:bt[s]})}function H(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function vt(){const[s,t]=l.useState([]);l.useEffect(()=>{const c=new AbortController;async function d(){const x=await C(c.signal);t(x.models)}return d(),()=>c.abort()},[]);const[n,a,r]=s.reduce((c,d)=>{switch(d.access){case"open":c[0]+=1;break;case"limited":c[1]+=1;break;case"closed":c[2]+=1;break}return c},[0,0,0]),i=Object.values(s.reduce((c,d)=>{const x=d.creator_organization;return c[x]===void 0?(c[x]={name:x,models:1},c):(c[x].models+=1,c)},{}));return s.length===0?e.jsx(H,{}):e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(c=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:c.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:c.display_name}),e.jsx("br",{}),e.jsx("span",{children:c.name})]}),e.jsx("td",{children:e.jsx(J,{value:c.description})}),e.jsx("td",{children:e.jsx(wt,{level:c.access})})]}))})]}),e.jsx(k,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs($,{className:"flex flex-col justify-between",children:[e.jsx(oe,{children:"Models"}),e.jsx(Pe,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(Ps,{values:[n,a,r],colors:["green","yellow","red"]}),e.jsx(ye,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs($,{className:"md:col-span-2",children:[e.jsx(oe,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(Ts,{data:i,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(ye,{categories:i.map(c=>c.name),className:"basis-7/12"})]})]})]})]})]})}function ae({to:s,children:t,inTable:n=!1,title:a=""}){return n?e.jsx(f,{className:"link link-hover",to:s,title:a,children:t}):e.jsx(f,{className:"link link-primary link-hover",to:s,children:t})}function Nt(){const[s,t]=l.useState([]);l.useEffect(()=>{const a=new AbortController;async function r(){const i=await C(a.signal);t(i.run_groups.filter(c=>!c.todo&&c.taxonomy&&!c.display_name.includes("CLEVA")))}return r(),()=>a.abort()},[]);const n=Object.values(s.reduce((a,r)=>{var c;const i=((c=r.taxonomy)==null?void 0:c.task)||"Unknown";return a[i]===void 0?(a[i]={name:i,value:1},a):(a[i].value+=1,a)},{}));return s.length===0?e.jsx(H,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(a=>{var r,i,c,d,x;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(ae,{to:`/groups/${a.name}`,children:e.jsx("span",{className:"text-lg",children:a.display_name})}),e.jsx("span",{className:"block",children:a.name})]}),e.jsx("td",{children:((r=a.taxonomy)==null?void 0:r.task)||""}),e.jsx("td",{children:((i=a.taxonomy)==null?void 0:i.what)||""}),e.jsx("td",{children:((c=a.taxonomy)==null?void 0:c.who)||""}),e.jsx("td",{children:((d=a.taxonomy)==null?void 0:d.when)||""}),e.jsx("td",{children:((x=a.taxonomy)==null?void 0:x.language)||""}),e.jsx("td",{children:e.jsx(J,{value:a.description})})]})})})]}),e.jsx(k,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs($,{className:"flex flex-col",children:[e.jsx(oe,{children:"Total scenarios"}),e.jsx(Pe,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx($,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(Me,{data:n.slice(0,Math.floor(n.length/2))}),e.jsx(Me,{data:n.slice(Math.ceil(n.length/2))})]})})]})]})]}))}function At(){return E(`${Z()}/groups.json`)}async function _e(s){try{return await(await fetch(At(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function fe({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function re({active:s=!1,onClick:t=()=>{},size:n="md",children:a}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${n} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:a})}function Et({title:s,titleId:t,...n},a){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:a,"aria-labelledby":t},n),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),l.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const yt=l.forwardRef(Et),Ie=yt;function K(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function Le({value:s,title:t,hideIcon:n}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const a=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const r="/runs/?q="+s.run_spec_names.map(c=>`^${c}$`).join("|");return encodeURI(r)}})();return a?e.jsx(ae,{to:a,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[K(s.value),!n&&e.jsx(Ie,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:K(s.value)}):e.jsx(e.Fragment,{children:K(s.value)})}return s.href?e.jsx(ae,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[K(s.value),!n&&e.jsx(Ie,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(J,{value:String(s.value)}):t?e.jsx("a",{title:t,children:K(s.value)}):e.jsx(e.Fragment,{children:K(s.value)})}function ge({schema:s,groupTable:t,numRowsToDisplay:n,sortColumnIndex:a=1,sortable:r=!0,displayColumnIndexes:i=void 0,miniStyle:c=!1}){const[d,x]=l.useState(1),[g,w]=l.useState(a);function j(h){return h.length>30?h.substring(0,27)+"...":h}const R=h=>{const p=["AIRBench 2024 -","-book"];if(h.value==="Model/adapter")return"Model";if(p.some(m=>h.value.includes(m))){let m=h.value;return p.forEach(v=>{m=m.replace(v,"")}),j(m)}else return j(h.value)},I=h=>{if(s){const p=s.models.find(m=>m.display_name===h);if(p){let m=p.description;return m.includes("/")&&(m=m.replace("/","_")),m}}return""},P=h=>{x(h===g?d*-1:h===0?-1:1),w(h)},u=h=>{if(s){const p=s.models.find(m=>m.display_name===h);if(p){let m=p.name;return m.includes("/")&&(m=m.replace("/","_")),m}}return""},U=()=>{const h=t.header[g].lower_is_better,p=d*(h?1:-1),m=t.rows.slice();return m.sort((v,_)=>{var ee,O;const T=(ee=v[g])==null?void 0:ee.value,L=(O=_[g])==null?void 0:O.value;return T!==void 0&&L===void 0?-1:L!==void 0&&T===void 0?1:typeof T=="number"&&typeof L=="number"?(T-L)*p:typeof T=="string"&&typeof L=="string"?p===1?T.localeCompare(L):L.localeCompare(T):0}),n>0?m.slice(0,n):m};function y(h){const p=h.lastIndexOf(" - ");return p===-1?h:h.substring(0,p)+"*"+h.substring(p+1)}const X=h=>{const m=y(h).split("*")[0].trim();if(s){const v=s.run_groups.find(_=>_.display_name===m||_.short_display_name===m);if(v)return v.name}return""};return e.jsxs("table",{className:c?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((h,p)=>i===void 0||i.includes(p)).map((h,p)=>e.jsx("th",{className:`${p===g?"bg-gray-100":"bg-white"} ${p===0?"left-0 z-40":""} ${h.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:h.description?h.description:"",children:e.jsxs("div",{className:c?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:R(h)}),r?e.jsx("button",{className:"link",onClick:()=>P(p),children:e.jsx(ct,{className:"w-6 h-6"})}):null]})},`$${p}`))})}),e.jsx("tbody",{children:U().map((h,p)=>e.jsx("tr",{children:h.filter((m,v)=>i===void 0||i.includes(v)).map((m,v)=>e.jsx("td",{className:`${v===0?"z-20 text-lg sticky left-0":"z-0"} ${p%2===0?"bg-gray-50":"bg-white"}`,children:v==1?e.jsx("div",{className:`${m&&m.style&&m.style["font-weight"]&&m.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(Le,{value:{...m,href:"/runs/?q="+u(String(h[0].value))},title:`Click value to see all predictions for: ${u(String(h[0].value))}`})}):e.jsx("div",{className:`${m&&m.style&&m.style["font-weight"]&&m.style["font-weight"]==="bold"?"font-bold":""} ${v===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(Le,{value:{...m},title:String(h[0].value)===m.value?I(String(h[0].value)):`Click value to see predictions for ${String(h[0].value)} for ${X(R(t.header[v]))}: ${u(String(h[0].value))}`})})},`${v}`))},`$${h[0].value}`))})]})}function Mt(){const[s,t]=l.useState(0),[n,a]=l.useState(),[r,i]=l.useState();return l.useEffect(()=>{const c=new AbortController;async function d(){const x=C(c.signal),g=_e(c.signal),w=await x;i(w);const j=await g;a(j)}return d(),()=>c.abort()},[]),n===void 0||r===void 0?e.jsx(H,{}):n.length===0?e.jsxs("div",{children:[e.jsx(k,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(k,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("div",{children:[n.length>1?e.jsx(fe,{children:n.map((c,d)=>e.jsx(re,{active:d===s,onClick:()=>t(d),children:c.title},d))}):null,e.jsx(ge,{schema:r,groupTable:n[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function Ve(s,t){try{return await(await fetch(E(`${Z()}/groups/${s}.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function ze({schema:s,runGroupName:t,numRowsToDisplay:n=-1}){const[a,r]=l.useState(),[i,c]=l.useState(0);return l.useEffect(()=>{const d=new AbortController;async function x(){const g=await Ve(t,d.signal);r(g)}return x(),()=>d.abort()},[s,t]),a===void 0||a.length===0?e.jsx(H,{}):a.length===0?e.jsx("div",{children:"Group currently has no tables."}):e.jsxs("div",{children:[a.length>1?e.jsx(fe,{children:a.map((d,x)=>e.jsx(re,{active:x===i,onClick:()=>c(x),children:d.title},x))}):null,e.jsx(ge,{schema:s,groupTable:a[i],numRowsToDisplay:n,sortColumnIndex:1},`${t}-${i}`)]})}function Rt(){const{groupName:s}=ke(),[t,n]=l.useState(void 0);l.useEffect(()=>{const i=new AbortController;async function c(){const x=await C(i.signal);n(x)}return c(),()=>i.abort()},[]);const r=(()=>{if(t!==void 0){for(const i of t.run_groups)if(i.name===s)return i}})();return t===void 0?e.jsx(H,{}):r===void 0?e.jsxs("div",{children:['Group "',s,'" not found.']}):e.jsxs(e.Fragment,{children:[e.jsx(k,{title:r.display_name,subtitle:r.description,markdown:!0,className:"mr-8"}),e.jsx(ze,{schema:t,runGroupName:r.name},r.name)]})}async function It(s){try{return await(await fetch(E(`${Z()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function me({currentPage:s,totalPages:t,onNextPage:n,onPrevPage:a,className:r}){let i="join";return r!==void 0&&(i=`join ${r}`),e.jsxs("div",{className:i,children:[e.jsx("button",{onClick:a,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:n,className:"join-item btn",children:"»"})]})}const ce=100;function Lt(){const[s,t]=Ce(),[n,a]=l.useState(),[r,i]=l.useState(Number(s.get("page")||1)),[c,d]=l.useState(!0),[x,g]=l.useState(s.get("q")||"");l.useEffect(()=>{const u=new AbortController;async function U(){const y=await It(u.signal);a(y)}return U(),()=>u.abort()},[]);const w=u=>{u.preventDefault();const y=u.target.q.value;g(y),t({q:y,page:"1"})};if(n===void 0)return e.jsx(H,{});const j=c?new RegExp(x):null,R=n.filter(u=>j?j.test(u.name):u.name.includes(x)),I=R.slice((r-1)*ce,r*ce),P=Math.ceil(R.length/ce);return e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:w,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:x,onChange:u=>g(u.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:c,onChange:()=>d(!c)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${R.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(mt,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / Task"})]})}),e.jsx("tbody",{children:I.map((u,U)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(ae,{to:`/runs/${u.name}`,children:u.name})}),e.jsx("td",{children:u.adapter_spec.model}),e.jsx("td",{children:u.groups.join(", ")}),e.jsx("td",{children:u.adapter_spec.method}),e.jsx("td",{children:u.scenario_spec.args.subject||u.scenario_spec.args.task||"-"})]},`${u.name}-${U}`))})]})}),P>0?e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(r+1,P);i(u),s.set("page",String(u)),t(s)},onPrevPage:()=>{const u=Math.max(r-1,1);i(u),s.set("page",String(u)),t(s)},currentPage:r,totalPages:P}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function W(){return window.SUITE!==void 0?window.SUITE:void 0}async function St(s,t,n){try{return await(await fetch(E(`/runs/${n||W()}/${s}/instances.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function kt(s,t,n){try{return await(await fetch(E(`/runs/${n||W()}/${s}/stats.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function Ct(s,t,n){try{return await(await fetch(E(`/runs/${n||W()}/${s}/display_requests.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function Pt(s,t,n){try{return await(await fetch(E(`/runs/${n||W()}/${s}/display_predictions.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name==="AbortError"&&console.log(a),[]}}async function Tt(s,t,n){try{return await(await fetch(E(`/runs/${n||W()}/${s}/scenario.json`),{signal:t})).json()}catch(a){a instanceof Error&&a.name!=="AbortError"&&console.log(a);return}}function We(s,t){return E(`/runs/${t||W()}/${s}/run_spec.json`)}async function Bt(s,t,n){try{return await(await fetch(We(s,n),{signal:t})).json()}catch(a){a instanceof Error&&a.name!=="AbortError"&&console.log(a);return}}function Dt(s,t){return E(`/runs/${t||W()}/${s}/scenario_state.json`)}function Ht(s){const n={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},a=Object.keys(s);for(const r of a)if(s[r]!==void 0&&n[r]!==void 0)return n[r]?s[r]<.5?[r,!0]:[r,!1]:s[r]>=.5?[r,!0]:[r,!1];return["",!1]}function Ut(s){const[t,n]=Ht(s.stats);return t===""?null:n?e.jsx(Ot,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(Ft,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function Ot({value:s}){return e.jsx(F,{icon:qs,color:"green",children:s})}function Ft({value:s}){return e.jsx(F,{icon:Ks,color:"red",children:s})}function z({value:s}){const[t,n]=l.useState(!1),[a,r]=l.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>n(!0),onMouseOut:()=>n(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>r(!0),children:e.jsx(nt,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:a,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>r(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function qe({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=E(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else if(s.content_type.includes("audio")){if(s.location===void 0)return null;const t=E(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsx("div",{children:e.jsx("audio",{controls:!0,src:t})})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ge({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(qe,{mediaObject:t}))})}function _t(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function Vt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(z,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ge,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(he,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,n)=>e.jsxs(xe,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:_t(s.request[t])}):"null"]},n+1))})]})}function zt(s){return e.jsx("div",{children:s.map((t,n)=>e.jsxs("div",{children:[t.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(z,{value:t.error})," "]}),t.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(z,{value:t.text})," "]}),t.media_object&&e.jsx(qe,{mediaObject:t.media_object})]},n))})}function Wt(s){return e.jsx("div",{children:Object.entries(s).map(([t,n])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),e.jsx(z,{value:n===null?"null":n.toString()})]}))})}function qt({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,n])=>e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white my-2",children:[e.jsx("summary",{className:"collapse-title",children:e.jsx(e.Fragment,{children:"View "+t+" annotations"})}),e.jsx("div",{className:"collapse-content",children:Array.isArray(n)?zt(n):Wt(n)})]},t)):null})}function Gt({predictions:s,requests:t,metricFieldMap:n}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((a,r)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",a.train_trial_index]}):null,e.jsx("div",{className:"mt-2 w-full",children:a.base64_images&&a.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),a.base64_images.map(i=>e.jsx("img",{src:"data:image;base64,"+i,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(Ut,{stats:a.stats})]}),e.jsx(z,{value:a.predicted_text}),a.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(z,{value:String(a.mapped_output)})]}):null]})}),e.jsx(qt,{predictionAnnotations:a.annotations}),e.jsxs("div",{className:"mx-1",children:[e.jsx("h3",{children:"Metrics"}),e.jsx(he,{children:Object.keys(a.stats).map((i,c)=>e.jsxs(xe,{children:[n[i]?e.jsx("span",{title:n[i].description,children:n[i].display_name}):e.jsx("span",{children:i}),e.jsx("span",{children:String(a.stats[i])})]},c))})]}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(Vt,{request:t[r]})})]})]},r))})})}const Qt="correct";function Kt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,n)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(a=>e.jsx(F,{className:"mx-2",color:a===Qt?"green":void 0,children:a}))]},n))})]})}function Jt({instance:s,requests:t,predictions:n,metricFieldMap:a}){return e.jsxs("div",{children:[e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ge,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('
0?e.jsx(Kt,{references:s.references}):null}),e.jsx("div",{children:n&&t?e.jsx(Gt,{predictions:n,requests:t,metricFieldMap:a}):null})]})}function Xt({stat:s,metricFieldMap:t}){const n=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),n]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),n]})}function Qe(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function Yt(s){try{return await(await fetch(E(`/releases/${Qe()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{}}}function $t(s,t){return Qe()?s[t]:window.SUITE}const te=10,ne=50;function Zt(){const{runName:s}=ke(),[t,n]=Ce(),[a,r]=l.useState(0),[i,c]=l.useState(),[d,x]=l.useState(),[g,w]=l.useState([]),[j,R]=l.useState([]),[I,P]=l.useState(),[u,U]=l.useState(),[y,X]=l.useState(1),[h,p]=l.useState(1),[m,v]=l.useState(1),[_,T]=l.useState(1),[L,ee]=l.useState(),[O,fs]=l.useState(),[pe,gs]=l.useState({}),[je,ps]=l.useState({}),[be,js]=l.useState(""),[we,bs]=l.useState(!0);l.useEffect(()=>{const o=new AbortController;async function B(){const b=o.signal;if(s===void 0)return()=>o.abort();const S=window.SUITE?window.SUITE:$t(await Yt(b),s);x(S);const[q,ve,Ne,Ns,As,Es,ie]=await Promise.all([Bt(s,b,S),St(s,b,S),kt(s,b,S),Tt(s,b,S),Pt(s,b,S),Ct(s,b,S),C(b)]);c(q),w(ve);const Ae=Math.ceil(ve.length/te),ys=Number(t.get("instancesPage")||1);p(Ae),X(Math.max(Math.min(ys,Ae),1)),R(Ne),fs(Ns);const Ee=Math.floor(Ne.length/ne),Ms=Number(t.get("metricsPage")||1);T(Ee),v(Math.max(Math.min(Ms,Ee),1));const G={};Es.forEach(A=>{var Y;const N=A.instance_id,V=((Y=A.perturbation)==null?void 0:Y.name)||"";G[N]===void 0&&(G[N]={}),G[N][V]===void 0&&(G[N][V]=[]),G[N][V].push(A)}),U(G);const Q={};As.forEach(A=>{var Y;const N=A.instance_id,V=((Y=A.perturbation)==null?void 0:Y.name)||"";Q[N]===void 0&&(Q[N]={}),Q[N][V]===void 0&&(Q[N][V]=[]),Q[N][V].push(A)}),P(Q),ps(ie.metrics.reduce((A,N)=>(A[N.name]=N,A),{})),gs(ie.adapter.reduce((A,N)=>(A[N.name]=N,A),{})),ee(ie.models.find(A=>A.name===(q==null?void 0:q.adapter_spec.model)))}return B(),()=>o.abort()},[s,t]);const se=g.slice((y-1)*te,(y-1)*te+te),ws=j.slice((m-1)*ne,(m-1)*ne+ne);if(l.useEffect(()=>{const o=t.get("instance");if(o&&we&&se.length>0){if(se.findIndex(b=>b.id===o)===-1)return;requestAnimationFrame(()=>{const b=document.getElementById(`instance-${o}`);b&&b.scrollIntoView({behavior:"smooth"})}),bs(!1)}},[we,t,y,n,se]),i===void 0||I===void 0||u===void 0||O===void 0)return e.jsx(H,{});const vs=o=>o.perturbation===void 0?`Instance id: ${o.id} [split: ${o.split}]`:`Instance id: ${o.id} [split: ${o.split}][perturbation: ${o.perturbation.name}]`;return e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[O.name,e.jsx("a",{href:"/#/groups/"+O.name,children:e.jsx(et,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(J,{value:O.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:i.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(J,{value:(L==null?void 0:L.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:O.tags.map(o=>e.jsx(F,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:o})}))})]})}),e.jsxs($,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Ys,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:We(i.name,d),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:Dt(i.name,d),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(he,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(i.adapter_spec).map(([o,B],b)=>e.jsxs(xe,{className:b<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:pe[o]?pe[o].description:void 0,children:`${o}: `}),e.jsx("span",{className:"overflow-x-auto",children:B})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(fe,{children:[e.jsx(re,{size:"lg",active:a===0,onClick:()=>r(0),children:"Instances + Predictions"}),e.jsx(re,{size:"lg",active:a===1,onClick:()=>r(1),children:"All metrics"})]})}),a===0?e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:se.map((o,B)=>{var b,S;return e.jsxs("div",{id:"instance-"+o.id,className:"border p-4",children:[e.jsxs("div",{className:"flex items-center justify-between",children:[e.jsx("h3",{className:"text-xl mb-4",children:vs(o)}),e.jsx("button",{className:"btn btn-sm normal-case px-2 py-1",onClick:()=>{const q=window.location.href+(window.location.href.includes("?")?"&instance=":"?instance=")+o.id;navigator.clipboard.writeText(q)},children:"Copy Link"})]}),e.jsx(Jt,{instance:o,requests:u[o.id][((b=o.perturbation)==null?void 0:b.name)||""],predictions:I[o.id][((S=o.perturbation)==null?void 0:S.name)||""],metricFieldMap:je},`${o.id}-${B}`)]})})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const o=Math.min(y+1,h);X(o),t.set("instancesPage",String(o)),n(t)},onPrevPage:()=>{const o=Math.max(y-1,1);X(o),t.set("instancesPage",String(o)),n(t)},currentPage:y,totalPages:h})]}):e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:o=>js(o.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:Object.keys(j[0]).map(o=>e.jsx("th",{children:o},o))})}),e.jsx("tbody",{children:ws.filter(o=>!be||o.name.name.toLowerCase().includes(be.toLowerCase())).map(o=>e.jsx("tr",{children:Object.entries(o).map(([B,b])=>B==="name"?e.jsx("td",{children:e.jsx(Xt,{stat:o,metricFieldMap:je})},B):e.jsx("td",{children:b}))}))})]})}),e.jsx(me,{className:"flex justify-center my-8",onNextPage:()=>{const o=Math.min(m+1,_);v(o),t.set("metricsPage",String(o)),n(t)},onPrevPage:()=>{const o=Math.max(m-1,1);v(o),t.set("metricsPage",String(o)),n(t)},currentPage:m,totalPages:_})]})]})}function en(){const[s,t]=l.useState(void 0),[n,a]=l.useState(void 0),[r,i]=l.useState(void 0);if(l.useEffect(()=>{const d=new AbortController;async function x(){const g=C(d.signal),w=_e(d.signal),j=await g;t(j);const R=await w,I=[];R.forEach(P=>{P.rows.forEach(u=>{I.push({title:String(u[0].value),name:u[0].href.replace("?group=","")})})}),a(I)}return x(),()=>d.abort()},[]),s===void 0||n===void 0)return e.jsx(H,{});if(n.length===0)return e.jsxs(e.Fragment,{children:[e.jsx(k,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]});const c=r!==void 0?r:n[0].name;return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(k,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",onChange:d=>i(d.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:n.map((d,x)=>e.jsx("option",{value:d.name,children:d.title},x))})]})]}),e.jsx(ze,{schema:s,runGroupName:c},c)]})}const sn=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,tn=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function nn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:sn,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:tn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function Ke({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,n)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},n):e.jsx(f,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},n)}))})]})}function Je({runGroups:s}){const t=new Map(s.filter(r=>r.metric_groups!==void 0&&(r.subgroups===void 0||r.subgroups.length===0)).map(r=>[r.name,r])),n=new Set,a=[];return s.forEach(r=>{const i=r.subgroups?r.subgroups:[],c=[];i.forEach(d=>{const x=t.get(d);x&&(c.push(x),n.add(x.name))}),c.length>0&&a.push([r,c])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," scenarios"]}),e.jsx("ul",{children:a.map(([r,i])=>e.jsxs("li",{className:"my-3",children:[e.jsx(f,{className:"text-black",to:"groups/"+r.name,children:e.jsx("h2",{children:r.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:i.map(c=>c.todo?e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name):e.jsx(f,{className:"text-black",to:"groups/"+c.name,children:e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name)}))})]},r.name))})]})}const Xe=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function M({runGroupName:s=void 0,tableIndexToDisplay:t=0,numRowsToDisplay:n=10,sortColumnIndex:a=1}){const[r,i]=l.useState(void 0),[c,d]=l.useState(void 0);return l.useEffect(()=>{const x=new AbortController;async function g(){const w=await C(x.signal);i(w);const j=w.run_groups;if(j.length===0)return;const R=s||j[0].name,I=await Ve(R,x.signal);d(I[t])}return g(),()=>x.abort()},[s,t]),r===void 0||c===void 0?e.jsx(H,{}):e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 overflow-x-auto",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx(ge,{schema:r,groupTable:c,numRowsToDisplay:n,sortColumnIndex:a,displayColumnIndexes:[0,1],sortable:!1,miniStyle:!0})})}function an(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Xe,alt:"HELM Hero",className:"object-contain w-96"})}),e.jsxs("div",{className:"py-2 rounded-xl bg-gray-100 h-full",children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-2 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Ye=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,$e=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,Ze=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,es=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,ss=""+new URL("cohere-3550c6cb.png",import.meta.url).href,ts=""+new URL("eleutherai-b9451114.png",import.meta.url).href,ns=""+new URL("google-06d997ad.png",import.meta.url).href,as=""+new URL("meta-5580e9f1.png",import.meta.url).href,rs=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,ls=""+new URL("mistral-18e1be23.png",import.meta.url).href,is=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,cs=""+new URL("openai-3f8653e4.png",import.meta.url).href,os=""+new URL("tii-24de195c.png",import.meta.url).href,ds=""+new URL("together-a665a35b.png",import.meta.url).href,ms=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,hs="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",xs=""+new URL("yandex-38e09d70.png",import.meta.url).href,us=""+new URL("01-694cb9b7.png",import.meta.url).href,rn=[Ye,$e,Ze,es,ss,ts,ns,as,rs,ls,is,cs,os,ds,ms,hs,xs,us];function Se(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function a(){const r=await C(n.signal);t(r)}return a(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(an,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:rn.map((n,a)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},a))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Ke,{models:s.models}),e.jsx(Je,{runGroups:s.run_groups})]})})]})]}):null}function ln(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(M,{})})]})]})}const cn=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;function on(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:cn,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const dn=""+new URL("scb10x-204bd786.png",import.meta.url).href,mn=""+new URL("scbx-71e53e72.jpg",import.meta.url).href;function hn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://scbx.com/",children:e.jsx("img",{src:mn,alt:"Logo",className:"inline h-32 mx-4 my-4"})}),e.jsx("a",{href:"https://scb10x.com/",children:e.jsx("img",{src:dn,alt:"Logo",className:"inline h-32 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const xn=""+new URL("wellsfargo-a86a6c4a.png",import.meta.url).href;function un(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{children:e.jsx("a",{href:"https://wellsfargo.com/",children:e.jsx("img",{src:xn,alt:"Logo",className:"mx-auto block my-4 w-48"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.wellsfargo.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Wells Fargo"}),", we introduce the ",e.jsx("span",{className:"font-bold",children:"HELM Finance"})," ","leaderboard for ecologically-valid evaluations of leading language models in the financial domain. The leaderboard evaluates the ability of language models to perform tasks from financial professions on publicly financial documents across a range of scenarios."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const fn=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function gn({metricFieldMap:s,metricGroups:t}){const n=new Set,a=[];return t.forEach(r=>{const i=[];r.metrics.forEach(c=>{const d=s[c.name];d&&(i.push(d),n.add(d.name))}),i.length>0&&a.push([r,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," metrics"]}),e.jsx("ul",{children:a.map(([r,i])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:r.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:i.map(c=>e.jsx("li",{className:"ml-4",children:c.display_name},c.name))})]},r.name))})]})}function pn(){const[s,t]=l.useState(void 0);l.useEffect(()=>{const a=new AbortController;async function r(){const i=await C(a.signal);t(i)}return r(),()=>a.abort()},[]);const n=s?s.metrics.reduce((a,r)=>(a[r.name]=r,a),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:fn,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&n?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(Ke,{models:s.models}),e.jsx(Je,{runGroups:s.run_groups}),e.jsx(gn,{metricFieldMap:n,metricGroups:s.metric_groups})]}):null]})}const jn=""+new URL("vhelm-framework-a1ca3f3f.png",import.meta.url).href,bn=""+new URL("vhelm-model-8afb7616.png",import.meta.url).href,wn=""+new URL("vhelm-aspects-1437d673.png",import.meta.url).href;function vn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Vision-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.07112",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Leaderboard"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Current benchmarks for assessing vision-language models (VLMs) often focus on their perception or problem-solving capabilities and neglect other critical aspects such as fairness, multilinguality, or toxicity. Furthermore, they differ in their evaluation procedures and the scope of the evaluation, making it difficult to compare models. To address these issues, we extend the HELM framework to VLMs to present the Holistic Evaluation of Vision Language Models (VHELM). To address these issues, we introduce VHELM, built on HELM for language models. VHELM aggregates various datasets to cover one or more of the 9 aspects:"," ",e.jsx("b",{children:"visual perception"}),", ",e.jsx("b",{children:"bias"}),", ",e.jsx("b",{children:"fairness"}),", ",e.jsx("b",{children:"knowledge"}),", ",e.jsx("b",{children:"multilinguality"}),", ",e.jsx("b",{children:"reasoning"}),", ",e.jsx("b",{children:"robustness"}),","," ",e.jsx("b",{children:"safety"}),", and ",e.jsx("b",{children:"toxicity"}),". In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. Our framework is designed to be lightweight and automatic so that evaluation runs are cheap and fast. For transparency, we release the raw model generations and complete results on this website."]}),e.jsx("p",{className:"my-4 font-bold",children:"VHELM is intended to be a living benchmark. We hope to continue adding new datasets, models and metrics over time, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:bn,alt:"A vision-lanuage model (VLM) takes in an image and a text prompt and generates text.",className:""}),e.jsx("img",{src:jn,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Omni), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(M,{}),e.jsx(f,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:wn,alt:"An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation and Safety. ",className:""})})]})}const Nn=""+new URL("accenture-6f97eeda.png",import.meta.url).href,An=""+new URL("cresta-9e22b983.png",import.meta.url).href;function En(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://www.accenture.com/",children:e.jsx("img",{src:Nn,alt:"Logo",className:"inline h-12 mx-4 my-4"})}),e.jsx("a",{href:"https://www.cresta.com/",children:e.jsx("img",{src:An,alt:"Logo",className:"inline h-8 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.accenture.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Accenture"})," ","and"," ",e.jsx("a",{href:"https://www.cresta.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Cresta"}),", we introduce the HELM"," ",e.jsx("span",{className:"font-bold",children:"Call Center"})," leaderboard. HELM Call Center is a leaderboard consisting of evaluations of leading language models on scenarios with realistic tasks from the call center context."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const yn=""+new URL("cuhk-8c5631e9.png",import.meta.url).href;function Mn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.cuhk.edu.hk/",children:e.jsx("img",{src:yn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Rn="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAoAAAAEBCAMAAADfF+TxAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAC0FBMVEUAAAAgcMIfcMEfcMEfccEfcMEfcMEfb8Efb8AfccAfcMEfbsEfbMEAAP8gcMIggL8fcMEfcMEfcMAfcMEeccMecMEecMIfcMIfcMAeccEndsQfbsEfcMAfb8EfccAeb8EXdLkgccEfcMEfcMEecMEhccEAgP8fb8EAVaoeb8IfcMIeccEaZswebsIfcMEgcMEac78fccIeacMfcMEfcMEecMEcccYrgNUfcMIeb8EfcMIfccEfcMEeb8AfcMIjdMUfcMEfcMEjcsEcccYecMAeccEfb8IfcMEgccIfcMEkbcgfb8IfccEfcMEdbb0fcMIkbcIfcMEba7wfcMIfcMEfcMEfcMEfb8IgccAfccAgcMEfcMEgcMEfb8EfccAfcMEfccAeb8MgcMEgc78ecMIfccEfcMEhb8IgccEfb8IfcMEecMEfcMIdcb8fcMEdbMQfcMEfcMEVar8fb8IfcMEgccIfcMAfcMEgcL8gcMEfcMEfcMEgccIicsMfcMIeb8AecMEfcMIfccEeb8AfcMIgccEgcMAgcL8gcL8fb8EfcMEfb8EfccEfcMEAgL8fcMEgb8Ifb8Igdb8gccIfcMIfcMEfb8Egb8EcccEfb8EeccAfcMEgccEfcMEdb8AgcMEgb8EzZswkbbYecMAgcr8hccIgcMEfccEgcMIgcMEfb8Edbr8fccEdb8Efb8Eccb0fb8AfcMEgccIfcMEfcMAhb8EecMEfcMEfcMAfcMIfcMAgcMEfcMEecMAfcMEid7sfcsAecb8ib8QfcMEfcMEfb8EfcMEeb78ecMEfcMEfcMAhb7wfcMAfcsEfccAfccEfcMEeb8AfcMAccb8fcMEdccAgcMIgcL8bbb8fb8EfcMEfcMEeb8EfccEgcMEecMAeb8EecMAeb8AebsMgccIeccQgcb8gcMEhb8QfcMIfccEecMEebcIfb8EfcMH////Dl82sAAAA7nRSTlMAcM/y7uPRvqeKa0ohAYkI8ciSWyLiVOCbVg06q/ezXguR9vCgRgLnA1zcbwpD6oAUmhH59Y8SBmR3S7zZVaIW+7YdCW1mU8Sq/g51zJ0jwxX6E31itN23YVGy5bG1o8t6TMooO9X8LoiF5tq7NP0ai/QMbN9YOaQgkO3kPyaWZbAyc36/eFlAEJXvjIN7BOlHnhhPpvPXVy2uTYRf1jWZZwUHnzg2ocV50qUsrE7HG47oaMCCPpjhco1JwttdlA9BRB5CdHy9PMnT2BevMWqcKUXQJOw9UDAcY63rh96phpd2bjNxK0i6JxlaqCrOBCMOsQAAAAFiS0dE77iw4qEAAAAHdElNRQfoBhEVHhJsM9kZAAAK9ElEQVR42u3d+Z/XRR3A8UEOS/gK2QqILCIL666AEATLEYfIsV/ABBQhFYSVgCXkKpCIU6UAFREKCwrkMlSSzMoslIRKDeUyCcUOOuyev6EUEWH3+92d2fl85j3T6/Wz85mZz/e5DxXmO6u0x+qpC7tIS6h+g4aNLv7Yxy9p3CSjHFQvlUVfqtzWtFkqy1YAzEvxE5d9siAMgJdn3AJsrgEopBYtr2glH6C+0i3A1gAUVGGbq9pKB3i1U3+t2gFQVkXtOxSLBnhNiUuApRqA4rq2YyfBAHVnlwDbAFBi13XpKhfgpxz669YdgDL7dI+eUgE2KHMHsJcGoNQa9C6TCVD3cQewLwAF95l+MgH2d+ZvwEAAiu76QRIB1r/BFcDBGoCyKxoiEKAe6grgMACKr7ynPIDljvwNygJQfg07iwNYONwNwBEagAGU7SINoC51A3AkAMPoxjJhANs48fdZDcBAummULICjx7gAeHOaAFt47JYqWx97a2CNq8VfzY1PcT2fcwHwthQXrIiIiIiIiIiIiIiIiIiIiIiIiIgo/G73WK8qqwnuQOr53VHtKx6f+jo62GqYkPpSOZLvtGo/+XqpL2Oipb+CbOpLBaDT7syIAGh7RcIkDcDAmywCoG5uB7ARAEMHWCEDYGsrf3dpAIYOUE8RAdDuioQuAAwfYLkIgHZXJHwegOEDnNpEBECbKxKmaQCGD1BPFwHQ5oqESgDGAHBGiQSAupc5wC8AMAaAVf8w2gvAmcb+7tYAjAJgfxEAZ802BTgHgHEAnDtPAkD9RUN/mS8BMA6Aer4IgDcZAlygARgJwHtEAMwuNAP4ZQDGAnBRiQSAeoSRv+KvADAWgBf+5l5PAEcaAVzs6V2pgR4bHCnAShEA9RITgEt9ASQiIiIiIiIiIiIiIiIiIiIiIiKi/4uWVVi0PKnVrLBZTcU8m6nuHWnbfaZT3T8yoe7sWdPUK5OaeuRXHX3kVpcmtUgK4NesDt6uspmqh/U539WmU61J7MhxTVckFDdM8SsXAEwJ4ANyAPatYeYEvwwCQG8AH5QDsKYrEpYCMEKAD8kBqAfnnXjtwwCMEOAQQQCH5Z14nQZghAAnCwKY/4qERwAYI8AVggDmvSKhbBEAYwS4XhLAfFckbNAAjBBgYYkkgPmuSPg6AGMEeJESBfDmnNN2+gYAYwQ4URbAjTmnfVQDMEaAK2UB1N/MNe23ABglwE3CAG7OMevwdgCMEWB2gDCAt2aqn/XbGoAxAvyOEgZQb6l+1usBGCXAreIAVn86p+toAEYJsFQcwNuLq5v0MR0GwLYDLBqTFMBtNqsZkLGZqlM3u9aaT7WqW7JlXO6vtm1XREREREREREREREREREREREREJKgNOyyak9RqlmyV207rXfVIY3nnLo3blcrr6OzoI4/hRHQqbbQ/gWl1Ijpr+s8XnJ3u8aSn4ki+h2ZNUekC/K7pgEkfzNa0yPRHaw0A5QNcrVIGuNv0Yo0nPphtqPHlHgCUD3DGqrQB1nvSdMRTZ2bbYzpuCQDFA8xOVqkD/J7dFQmtTC9EGKkAKB7gpSp9gMbXS565IuEq04lGAFA8wKe3eQCobrS6IsH0QoTsQgBKB3jdXuUDYAfTId9XFhciDFMAFA5w1jrlBWDJM6b/p1Si1A9M5xkMQOkA6/yqbT/g1aZjFiv1rOlP12wACgf4Q+UL4I9Mx/xYzZ5qOKSvAqBsgFuVN4CZ5wzHLCr7iek0ywAoG+Ada/0BVM+bDvppheGA0WMAKBrg2KbKI8BdpoN+ZnquYIICoGSAT25TPgGqfUlv8AUASgb4YrHyC7Ay4Q0W7gegYIDTHe3K/gOelvAO9yjvABf+3KKdSQHs9pKgdjn7sbKZ/cD7Qw8mu8WFdV0fERERERERERERERERERERERERCelgpUW/SGo1BdMFNanM0a5+afOOrzw3vqTS/eZecri+OsWJ6Nz9ap6bXRXV9cRxX/d7e9nl+gCYUK/0lAFwmfOdNSwGYAAA9auzRQDs1t31xp5XAAwBoP71IQkA1VjX+9oFwDAA6idWSQD4guNdPZcBYCAA9f3bBQA0vnfX6AunAJQMUL9W4h+g+c3j+XsdgOEA1JcJANjP6Y6uVQAMCKAe7x9g02YuN/QgAIMCWPi6d4DqHpcbegqAQQHU+/Z7B7jJ4XY2KgCGBVAvz/gGuG2Ru93cB8DQAOrDvgGqI+42cxSAwQGcW+Ab4DFnezmuABgcQD3ON0Dj3xqSs90ADBDgmctsPQJUL7rayRsADBHgbw55Bvimo41UKFEAp5VadCIpgKP6yW2J9a6usHnHd134lOLmbvbx24TWR0RERERERERERERERERERERERD7bctii0qRWM9tmNYetbjJdMMSuk+ZTvWWzqWnVP+vQEPsap7A+42I4EW11gVUPywPFFieBXZ44blpkfxZ6dwrrA2DSAN/2C1DtsPaXbQLACACeyvgF2MEa4DAFwAgAVvlad8oAS66xXfglAIwCYKlfgOoyy3UPPATAKAC+4xngAst1X68AGAXA2zwDVPtcf7MegEEB/J1vgJutll04HIBxAMw29QzwqNWyf68AGAdAXeAZoDpu87wTAIwF4Mu+Afa2eNya7QCMBWA/3wAHZc0f11IBMBaAHX0DVDeZP+4gAKMBeJ93gIeNnzajBIDRAPyDd4BdR5s+7XkFwGgA9vAOUE0wfdrdAIwH4B/9Ayw1fNg+JRbglNMWPZYUwEE2qzlt9fssO/e2rbHpVH+y2dSf8z1xnuGS16W8PiIiIiIiIiIiIiIiIiIiIiIiInLaBo9V/fbssX4xtOX8TRWk9/6qdKgWy+1QCyaNk9qD0h6remL2Ih1Dfzl/U/XSe39VGt6u5scsrQXAoqT2AMAE+qscgOptN9+iB2BAXZMRBLB5jU95LgPAuAC2V4IA1nxjea2+QgrAgHpUEkDVvqanPAXAuADObSsK4LoaHnJcATAugBOVKIAlz+R/yGkARgbwXVkAa7ixPLsQgHEBfHitMID5byx/VgEwLoCVShjA/DeWDwFgXABHnxQHMN+N5d3HADAugP2VOID5bixfrgAYF8At8gCqRrkf0Q+AcQGcqQQCXJnzCc1uAGBUALM7JQI8kPPG8h0KgFEBLFcSAea+sfwYAKMCOPWoTICHczyg6p9Zpg/wao8trrLPjtOD7q1qP7zF6b2/HHXNsd75tT44/05Se+A7CURERERERERERERERERERERERG56w2P7q6zmZEFoDa/5Fe9P7/3lrdVHln3AkEmTpPbAiei6Na4WAP2fiD7TR7+e2doQIEfyZXZ1iQoHoHr13NChAIwA4MDDtfrwxAB86MOR9W8AYPgAT9XyOxliABZ8+PXMxxUAgwf4t70qMIDqlbMj3wVg6ADbdcyo4ACeXcmpbQAMHGDF380/dv8A/zH1zMClCoBBA6y/cpsKEaC6+MzANwEYMsBZ/ddb/YtPAMBl74+7vBiAAQN8eqflf3kJANit+3vj5igABgvwlRNKhQtQtXlv3AoABgowO3OTxV+3SwL4z/8N26cAGCTAwpZ3WZ33kASw7VytuwAwQIBTh3XsanngSBJA9S+t/w3A0ADOGtZxvbJOFMAN+j8KgEEBvPXI/FF1OnIpCmDZohEADAXgrH2vbd5wqM5nfkUBVEcWygK4x2N9quzz/BtSl2416slyo8bOzNWe8kce2H3LsSXb3Rw675Pe+6tFe6328HhSe/gvDuj5ccZcNDsAAAAldEVYdGRhdGU6Y3JlYXRlADIwMjQtMDYtMTdUMjE6MzA6MTgrMDA6MDDt4fgHAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDI0LTA2LTE3VDIxOjMwOjE4KzAwOjAwnLxAuwAAAABJRU5ErkJggg==";function In(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Tables"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.ibm.com/",children:e.jsx("img",{src:Rn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," leaderboard on HELM. ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," is a holistic evaluation of leading language models that tests their capability to understand, process and analyze structured tabular input data."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Ln=({id:s,title:t,text:n})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:ue(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})})}));function Sn(){const[s,t]=l.useState();return l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(n=>n.json()).then(n=>{t(n)}).catch(n=>{console.error("Error fetching JSON:",n)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((n,a)=>n.id==="home"?null:e.jsx(Ln,{id:n.id,title:n.title,text:n.description},a))})})}function kn(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl - mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Xe,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const Cn=""+new URL("aisingapore-6dfc9acf.png",import.meta.url).href,Pn=[Ye,Cn,$e,Ze,es,ss,ts,ns,as,rs,ls,is,cs,os,ds,ms,hs,xs,us];function Tn(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const n=new AbortController;async function a(){const r=await C(n.signal);t(r)}return a(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(kn,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(Sn,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Pn.map((n,a)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},a))})})})]})})]}):null}const Bn=""+new URL("overview-74aea3d8.png",import.meta.url).href,Dn=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function Hn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.22456",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:Bn,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms from ArXiV papers."}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript. ..."}),e.jsx("li",{children:"Music sheets: crops of measures from IMSLP music sheets."})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(M,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:Dn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function Un(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function On(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Medical"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-2",children:"With the increasing scale and impact of language models, there has also been interest interest in using language models in the medical domain. However, the capabilities and risks of these models are not well-understood, and there is significant potential for harm in the medical setting."}),e.jsxs("p",{className:"my-2",children:["To address this, we present the"," ",e.jsx("a",{className:"font-bold",href:"https://arxiv.org/abs/2405.09605",children:"HELM Medical"})," ","leaderboard for evaluation of language models in the medical domain. The HELM Medical leaderboard presents evaluations of leading general-purpose language models as well as language models fine-tuned on the medical domain. These models are evaluated on a range of medical tasks based on the benchmarks used in"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2212.13138",children:"Singhal et al. 2022"}),". We hope that this leaderboard encourages further work in evaluating language models on tasks from the medical domain."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(f,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(M,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(f,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Fn=""+new URL("helm-safety-2907a7b6.png",import.meta.url).href;function _n(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"HELM Safety"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:Fn,alt:"Logo",className:"mx-auto p-0 block",style:{width:"300px"}}),e.jsx("p",{children:"Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized public benchmarking of such models is vital. While language models are routinely evaluated on standard capability benchmarks, comparable standardization for benchmarking safety risks lags behind. To address this gap, we introduce HELM-Safety as a collection of 5 safety benchmarks that span 6 risk categories (e.g. violence, fraud, discrimination, sexual, harassment, deception). We present evaluation results for recent leading open weights and closed models."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/11/08/helm-safety.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(M,{})})]})]})}function Vn(){return window.PROJECT_ID==="lite"?e.jsx(Se,{}):window.PROJECT_ID==="instruct"?e.jsx(nn,{}):window.PROJECT_ID==="image2struct"?e.jsx(Hn,{}):window.PROJECT_ID==="heim"?e.jsx(pn,{}):window.PROJECT_ID==="mmlu"?e.jsx(ln,{}):window.PROJECT_ID==="vhelm"?e.jsx(vn,{}):window.PROJECT_ID==="air-bench"?e.jsx(on,{}):window.PROJECT_ID==="thaiexam"?e.jsx(hn,{}):window.PROJECT_ID==="finance"?e.jsx(un,{}):window.PROJECT_ID==="call-center"?e.jsx(En,{}):window.PROJECT_ID==="cleva"?e.jsx(Mn,{}):window.PROJECT_ID==="tables"?e.jsx(In,{}):window.PROJECT_ID==="ewok"?e.jsx(Un,{}):window.PROJECT_ID==="medical"?e.jsx(On,{}):window.PROJECT_ID==="safety"?e.jsx(_n,{}):window.PROJECT_ID==="home"?e.jsx(Tn,{}):e.jsx(Se,{})}function zn(){return e.jsx(Ss,{children:e.jsx(ks,{children:e.jsxs(D,{path:"/",element:e.jsx(gt,{}),children:[e.jsx(D,{index:!0,element:e.jsx(Vn,{})}),e.jsx(D,{path:"leaderboard",element:e.jsx(en,{})}),e.jsx(D,{path:"models",element:e.jsx(vt,{})}),e.jsx(D,{path:"scenarios",element:e.jsx(Nt,{})}),e.jsx(D,{path:"groups",element:e.jsx(Mt,{})}),e.jsx(D,{path:"groups/:groupName",element:e.jsx(Rt,{})}),e.jsx(D,{path:"runs",element:e.jsx(Lt,{})}),e.jsx(D,{path:"runs/:runName",element:e.jsx(Zt,{})})]})})})}de.createRoot(document.getElementById("root")).render(e.jsx(Cs.StrictMode,{children:e.jsx(zn,{})})); diff --git a/src/helm/benchmark/static_build/assets/index-aeb31514.js b/src/helm/benchmark/static_build/assets/index-aeb31514.js new file mode 100644 index 0000000000..e27697fa0f --- /dev/null +++ b/src/helm/benchmark/static_build/assets/index-aeb31514.js @@ -0,0 +1,10 @@ +import{r as l,a as $e,L as p,O as Ye,d as Ze,u as xe,f as Y,H as es,h as ss,i as B,R as ts}from"./react-d4a0b69b.js";import{g as H,b as V,m as X,s as ue,a as ns,d as le,y as as,c as ie,e as Z,l as ee}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const r of document.querySelectorAll('link[rel="modulepreload"]'))n(r);new MutationObserver(r=>{for(const i of r)if(i.type==="childList")for(const c of i.addedNodes)c.tagName==="LINK"&&c.rel==="modulepreload"&&n(c)}).observe(document,{childList:!0,subtree:!0});function a(r){const i={};return r.integrity&&(i.integrity=r.integrity),r.referrerPolicy&&(i.referrerPolicy=r.referrerPolicy),r.crossOrigin==="use-credentials"?i.credentials="include":r.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function n(r){if(r.ep)return;r.ep=!0;const i=a(r);fetch(r.href,i)}})();var fe={exports:{}},K={};/** + * @license React + * react-jsx-runtime.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var rs=l,ls=Symbol.for("react.element"),is=Symbol.for("react.fragment"),cs=Object.prototype.hasOwnProperty,os=rs.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,ds={key:!0,ref:!0,__self:!0,__source:!0};function ge(s,t,a){var n,r={},i=null,c=null;a!==void 0&&(i=""+a),t.key!==void 0&&(i=""+t.key),t.ref!==void 0&&(c=t.ref);for(n in t)cs.call(t,n)&&!ds.hasOwnProperty(n)&&(r[n]=t[n]);if(s&&s.defaultProps)for(n in t=s.defaultProps,t)r[n]===void 0&&(r[n]=t[n]);return{$$typeof:ls,type:s,key:i,ref:c,props:r,_owner:os.current}}K.Fragment=is;K.jsx=ge;K.jsxs=ge;fe.exports=K;var e=fe.exports,$={},ce=$e;$.createRoot=ce.createRoot,$.hydrateRoot=ce.hydrateRoot;function ms({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const hs=l.forwardRef(ms),pe=hs;function xs({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const us=l.forwardRef(xs),fs=us;function gs({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const ps=l.forwardRef(gs),js=ps,je=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,be=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function bs({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const ws=l.forwardRef(bs),vs=ws;function Ns({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const As=l.forwardRef(Ns),Es=As;function ys({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const Ms=l.forwardRef(ys),Rs=Ms;function Is({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const Ls=l.forwardRef(Is),we=Ls;function Ss({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const ks=l.forwardRef(Ss),Cs=ks;function Ts({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const Ps=l.forwardRef(Ts),Bs=Ps;function se(s,t){return t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function ve(){const[s,t]=l.useState([]),[a,n]=l.useState();return l.useEffect(()=>{if(a&&a.title&&a.title!=="All Leaderboards"){const r=a.title==="Lite"||a.title==="Classic"?"HELM "+a.title:a.title;document.title=r+" - Holistic Evaluation of Language Models (HELM)"}},[a]),l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(r=>r.json()).then(r=>{if(t(r),window.PROJECT_ID){const i=r.find(c=>c.id===window.PROJECT_ID);n(i)}else{const i=r.find(c=>c.id==="lite");n(i)}}).catch(r=>{console.error("Error fetching JSON:",r)})},[]),a===void 0||a.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[a.title," ",e.jsx(we,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((r,i)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:se(void 0,r.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:a.title===r.title?"underline":"",children:r.title}),": ",r.description]})},i))})]})}function L(s){return s.startsWith("http://")||s.startsWith("https://")?s:`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function z(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function Ds(s){try{return await(await fetch(L(`${z()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function Hs(){const[s,t]=l.useState({release:void 0,suites:void 0,suite:void 0,date:""}),[a,n]=l.useState();l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(u=>u.json()).then(u=>{if(window.PROJECT_ID){const v=u.find(j=>j.id===window.PROJECT_ID);n(v)}else{const v=u.find(j=>j.id==="lite");n(v)}}).catch(u=>{console.error("Error fetching JSON:",u)})},[]),l.useEffect(()=>{const u=new AbortController;async function v(){const j=await Ds(u.signal);t(j)}return v(),()=>u.abort()},[]);const r=a!==void 0&&a.releases!==void 0?a.releases:["v1.0.0"],i=s.release||s.suite||null;if(!i)return null;const c=`Release ${i} (${s.date})`;if(r.length<=1)return e.jsx("div",{children:c});const o=r.indexOf(i),h=o<0?e.jsx(H,{color:"blue",children:"preview"}):o===0?e.jsx(H,{color:"blue",children:"latest"}):e.jsx(H,{color:"yellow",children:"stale"});return e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[c," ",h," ",e.jsx(we,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[50] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:r.map(u=>e.jsx("li",{children:e.jsx("a",{href:se(u,a?a.id:"lite"),className:"block",role:"menuitem",children:u})},u))})]})}function Us(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(pe,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 shadow",children:[e.jsx("li",{children:e.jsx(p,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(p,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(p,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(p,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(p,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:je,className:"object-contain"})}),e.jsx(p,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:be,className:"object-contain"})}),e.jsx(ve,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(p,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(p,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(p,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(p,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(p,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(Hs,{})})]})})]})}function Os(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(pe,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(p,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:je,className:"object-contain"})}),e.jsx(p,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:be,className:"object-contain"})}),e.jsx(ve,{})]})]})}function Fs(){return e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(Os,{}):e.jsx(Us,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(Ye,{})})})]})}async function P(s){try{return await(await fetch(L(`${z()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function _s({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function _({value:s}){return e.jsx("span",{children:e.jsx(Ze,{components:{a:_s},children:s})})}function T({title:s,subtitle:t,markdown:a=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),a&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(_,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const Vs={open:"green",limited:"yellow",closed:"red"},zs={open:"Open",limited:"Limited",closed:"Closed"};function Ws({level:s}){return e.jsx(H,{color:Vs[s],children:zs[s]})}function k(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function qs(){const[s,t]=l.useState([]);l.useEffect(()=>{const c=new AbortController;async function o(){const h=await P(c.signal);t(h.models)}return o(),()=>c.abort()},[]);const[a,n,r]=s.reduce((c,o)=>{switch(o.access){case"open":c[0]+=1;break;case"limited":c[1]+=1;break;case"closed":c[2]+=1;break}return c},[0,0,0]),i=Object.values(s.reduce((c,o)=>{const h=o.creator_organization;return c[h]===void 0?(c[h]={name:h,models:1},c):(c[h].models+=1,c)},{}));return s.length===0?e.jsx(k,{}):e.jsxs(e.Fragment,{children:[e.jsx(T,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(c=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:c.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:c.display_name}),e.jsx("br",{}),e.jsx("span",{children:c.name})]}),e.jsx("td",{children:e.jsx(_,{value:c.description})}),e.jsx("td",{children:e.jsx(Ws,{level:c.access})})]}))})]}),e.jsx(T,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(V,{className:"flex flex-col justify-between",children:[e.jsx(X,{children:"Models"}),e.jsx(ue,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(ns,{values:[a,n,r],colors:["green","yellow","red"]}),e.jsx(le,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(V,{className:"md:col-span-2",children:[e.jsx(X,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(as,{data:i,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(le,{categories:i.map(c=>c.name),className:"basis-7/12"})]})]})]})]})]})}function G({to:s,children:t,inTable:a=!1,title:n=""}){return a?e.jsx(p,{className:"link link-hover",to:s,title:n,children:t}):e.jsx(p,{className:"link link-primary link-hover",to:s,children:t})}function Gs(){const[s,t]=l.useState([]);l.useEffect(()=>{const n=new AbortController;async function r(){const i=await P(n.signal);t(i.run_groups.filter(c=>!c.todo&&c.taxonomy&&!c.display_name.includes("CLEVA")))}return r(),()=>n.abort()},[]);const a=Object.values(s.reduce((n,r)=>{var c;const i=((c=r.taxonomy)==null?void 0:c.task)||"Unknown";return n[i]===void 0?(n[i]={name:i,value:1},n):(n[i].value+=1,n)},{}));return s.length===0?e.jsx(k,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(T,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(n=>{var r,i,c,o,h;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(G,{to:`/groups/${n.name}`,children:e.jsx("span",{className:"text-lg",children:n.display_name})}),e.jsx("span",{className:"block",children:n.name})]}),e.jsx("td",{children:((r=n.taxonomy)==null?void 0:r.task)||""}),e.jsx("td",{children:((i=n.taxonomy)==null?void 0:i.what)||""}),e.jsx("td",{children:((c=n.taxonomy)==null?void 0:c.who)||""}),e.jsx("td",{children:((o=n.taxonomy)==null?void 0:o.when)||""}),e.jsx("td",{children:((h=n.taxonomy)==null?void 0:h.language)||""}),e.jsx("td",{children:e.jsx(_,{value:n.description})})]})})})]}),e.jsx(T,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(V,{className:"flex flex-col",children:[e.jsx(X,{children:"Total scenarios"}),e.jsx(ue,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(V,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ie,{data:a.slice(0,Math.floor(a.length/2))}),e.jsx(ie,{data:a.slice(Math.ceil(a.length/2))})]})})]})]})]}))}function Qs(){return L(`${z()}/groups.json`)}async function Ne(s){try{return await(await fetch(Qs(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function te({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function Q({active:s=!1,onClick:t=()=>{},size:a="md",children:n}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${a} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:n})}function Ks({title:s,titleId:t,...a},n){return l.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:n,"aria-labelledby":t},a),s?l.createElement("title",{id:t},s):null,l.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),l.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const Js=l.forwardRef(Ks),oe=Js;function F(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function de({value:s,title:t,hideIcon:a}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const n=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const r="/runs/?q="+s.run_spec_names.map(c=>`^${c}$`).join("|");return encodeURI(r)}})();return n?e.jsx(G,{to:n,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[F(s.value),!a&&e.jsx(oe,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:F(s.value)}):e.jsx(e.Fragment,{children:F(s.value)})}return s.href?e.jsx(G,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[F(s.value),!a&&e.jsx(oe,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(_,{value:String(s.value)}):t?e.jsx("a",{title:t,children:F(s.value)}):e.jsx(e.Fragment,{children:F(s.value)})}function ne({schema:s,groupTable:t,numRowsToDisplay:a,sortColumnIndex:n=1,sortable:r=!0,displayColumnIndexes:i=void 0,miniStyle:c=!1}){const[o,h]=l.useState(1),[u,v]=l.useState(n);function j(d){return d.length>30?d.substring(0,27)+"...":d}const E=d=>{const b=["AIRBench 2024 -","-book"];if(d.value==="Model/adapter")return"Model";if(b.some(x=>d.value.includes(x))){let x=d.value;return b.forEach(w=>{x=x.replace(w,"")}),j(x)}else return j(d.value)},g=d=>{if(s){const b=s.models.find(x=>x.display_name===d);if(b){let x=b.description;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},y=d=>{h(d===u?o*-1:d===0?-1:1),v(d)},m=d=>{if(s){const b=s.models.find(x=>x.display_name===d);if(b){let x=b.name;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},f=()=>{const d=t.header[u].lower_is_better,b=o*(d?1:-1),x=t.rows.slice();return x.sort((w,R)=>{var C,D;const I=(C=w[u])==null?void 0:C.value,N=(D=R[u])==null?void 0:D.value;return I!==void 0&&N===void 0?-1:N!==void 0&&I===void 0?1:typeof I=="number"&&typeof N=="number"?(I-N)*b:typeof I=="string"&&typeof N=="string"?b===1?I.localeCompare(N):N.localeCompare(I):0}),a>0?x.slice(0,a):x};function A(d){const b=d.lastIndexOf(" - ");return b===-1?d:d.substring(0,b)+"*"+d.substring(b+1)}const M=d=>{const x=A(d).split("*")[0].trim();if(s){const w=s.run_groups.find(R=>R.display_name===x||R.short_display_name===x);if(w)return w.name}return""};return e.jsxs("table",{className:c?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((d,b)=>i===void 0||i.includes(b)).map((d,b)=>e.jsx("th",{className:`${b===u?"bg-gray-100":"bg-white"} ${b===0?"left-0 z-40":""} ${d.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky top-0`,title:d.description?d.description:"",children:e.jsxs("div",{className:c?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:E(d)}),r?e.jsx("button",{className:"link",onClick:()=>y(b),children:e.jsx(Cs,{className:"w-6 h-6"})}):null]})},`$${b}`))})}),e.jsx("tbody",{children:f().map((d,b)=>e.jsx("tr",{children:d.filter((x,w)=>i===void 0||i.includes(w)).map((x,w)=>e.jsx("td",{className:`${w===0?"z-20 text-lg sticky left-0":"z-0"} ${b%2===0?"bg-gray-50":"bg-white"}`,children:w==1?e.jsx("div",{className:`${x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(de,{value:{...x,href:"/runs/?q="+m(String(d[0].value))},title:`Click value to see all predictions for: ${m(String(d[0].value))}`})}):e.jsx("div",{className:`${x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":""} ${w===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(de,{value:{...x},title:String(d[0].value)===x.value?g(String(d[0].value)):`Click value to see predictions for ${String(d[0].value)} for ${M(E(t.header[w]))}: ${m(String(d[0].value))}`})})},`${w}`))},`$${d[0].value}`))})]})}function Xs(){const[s,t]=l.useState(0),[a,n]=l.useState(),[r,i]=l.useState();return l.useEffect(()=>{const c=new AbortController;async function o(){const h=P(c.signal),u=Ne(c.signal),v=await h;i(v);const j=await u;n(j)}return o(),()=>c.abort()},[]),a===void 0||r===void 0?e.jsx(k,{}):a.length===0?e.jsxs("div",{children:[e.jsx(T,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(T,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("div",{children:[a.length>1?e.jsx(te,{children:a.map((c,o)=>e.jsx(Q,{active:o===s,onClick:()=>t(o),children:c.title},o))}):null,e.jsx(ne,{schema:r,groupTable:a[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function Ae(s,t){try{return await(await fetch(L(`${z()}/groups/${s}.json`),{signal:t})).json()}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}function Ee({schema:s,runGroupName:t,numRowsToDisplay:a=-1}){const[n,r]=l.useState(),[i,c]=l.useState(0);return l.useEffect(()=>{const o=new AbortController;async function h(){const u=await Ae(t,o.signal);r(u)}return h(),()=>o.abort()},[s,t]),n===void 0||n.length===0?e.jsx(k,{}):n.length===0?e.jsx("div",{children:"Group currently has no tables."}):e.jsxs("div",{children:[n.length>1?e.jsx(te,{children:n.map((o,h)=>e.jsx(Q,{active:h===i,onClick:()=>c(h),children:o.title},h))}):null,e.jsx(ne,{schema:s,groupTable:n[i],numRowsToDisplay:a,sortColumnIndex:1},`${t}-${i}`)]})}function $s(){const{groupName:s}=xe(),[t,a]=l.useState(void 0);l.useEffect(()=>{const i=new AbortController;async function c(){const h=await P(i.signal);a(h)}return c(),()=>i.abort()},[]);const r=(()=>{if(t!==void 0){for(const i of t.run_groups)if(i.name===s)return i}})();return t===void 0?e.jsx(k,{}):r===void 0?e.jsxs("div",{children:['Group "',s,'" not found.']}):e.jsxs(e.Fragment,{children:[e.jsx(T,{title:r.display_name,subtitle:r.description,markdown:!0,className:"mr-8"}),e.jsx(Ee,{schema:t,runGroupName:r.name},r.name)]})}async function Ys(s){try{return await(await fetch(L(`${z()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function ae({currentPage:s,totalPages:t,onNextPage:a,onPrevPage:n,className:r}){let i="join";return r!==void 0&&(i=`join ${r}`),e.jsxs("div",{className:i,children:[e.jsx("button",{onClick:n,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:a,className:"join-item btn",children:"»"})]})}const J=100;function Zs(){const[s,t]=Y(),[a,n]=l.useState(),[r,i]=l.useState(Number(s.get("page")||1)),[c,o]=l.useState(!0),[h,u]=l.useState(s.get("q")||"");l.useEffect(()=>{const m=new AbortController;async function f(){const A=await Ys(m.signal);n(A)}return f(),()=>m.abort()},[]);const v=m=>{m.preventDefault();const A=m.target.q.value;u(A),t({q:A,page:"1"})};if(a===void 0)return e.jsx(k,{});const j=c?new RegExp(h):null,E=a.filter(m=>j?j.test(m.name):m.name.includes(h)),g=E.slice((r-1)*J,r*J),y=Math.ceil(E.length/J);return e.jsxs(e.Fragment,{children:[e.jsx(T,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:v,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:h,onChange:m=>u(m.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:c,onChange:()=>o(!c)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${E.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(Bs,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / Task"})]})}),e.jsx("tbody",{children:g.map((m,f)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(G,{to:`/runs/${m.name}`,children:m.name})}),e.jsx("td",{children:m.adapter_spec.model}),e.jsx("td",{children:m.groups.join(", ")}),e.jsx("td",{children:m.adapter_spec.method}),e.jsx("td",{children:m.scenario_spec.args.subject||m.scenario_spec.args.task||"-"})]},`${m.name}-${f}`))})]})}),y>0?e.jsx(ae,{className:"flex justify-center my-8",onNextPage:()=>{const m=Math.min(r+1,y);i(m),s.set("page",String(m)),t(s)},onPrevPage:()=>{const m=Math.max(r-1,1);i(m),s.set("page",String(m)),t(s)},currentPage:r,totalPages:y}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function O(){return window.SUITE!==void 0?window.SUITE:void 0}async function et(s,t,a){try{return await(await fetch(L(`/runs/${a||O()}/${s}/scenario.json`),{signal:t})).json()}catch(n){n instanceof Error&&n.name!=="AbortError"&&console.log(n);return}}function ye(s,t){return L(`/runs/${t||O()}/${s}/run_spec.json`)}async function st(s,t,a){try{return await(await fetch(ye(s,a),{signal:t})).json()}catch(n){n instanceof Error&&n.name!=="AbortError"&&console.log(n);return}}function tt(s,t){return L(`/runs/${t||O()}/${s}/scenario_state.json`)}function Me(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function nt(s){try{return await(await fetch(L(`/releases/${Me()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{}}}function at(s,t){return Me()?s[t]:window.SUITE}function rt(s){const a={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},n=Object.keys(s);for(const r of n)if(s[r]!==void 0&&a[r]!==void 0)return a[r]?s[r]<.5?[r,!0]:[r,!1]:s[r]>=.5?[r,!0]:[r,!1];return["",!1]}function lt(s){const[t,a]=rt(s.stats);return t===""?null:a?e.jsx(it,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(ct,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function it({value:s}){return e.jsx(H,{icon:fs,color:"green",children:s})}function ct({value:s}){return e.jsx(H,{icon:js,color:"red",children:s})}function U({value:s}){const[t,a]=l.useState(!1),[n,r]=l.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>a(!0),onMouseOut:()=>a(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>r(!0),children:e.jsx(Rs,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:n,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>r(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function Re({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=L(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else if(s.content_type.includes("audio")){if(s.location===void 0)return null;const t=L(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsx("div",{children:e.jsx("audio",{controls:!0,src:t})})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function Ie({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(Re,{mediaObject:t}))})}function ot(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function dt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," Chars)"]}),e.jsx(U,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(Ie,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(Z,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,a)=>e.jsxs(ee,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:ot(s.request[t])}):"null"]},a+1))})]})}function mt(s){return e.jsx("div",{children:s.map((t,a)=>e.jsxs("div",{children:[t.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(U,{value:t.error})," "]}),t.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(U,{value:t.text})," "]}),t.media_object&&e.jsx(Re,{mediaObject:t.media_object})]},a))})}function ht(s){return e.jsx("div",{children:Object.entries(s).map(([t,a])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),e.jsx(U,{value:a===null?"null":a.toString()})]}))})}function xt({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,a])=>e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white my-2",children:[e.jsx("summary",{className:"collapse-title",children:e.jsx(e.Fragment,{children:"View "+t+" annotations"})}),e.jsx("div",{className:"collapse-content",children:Array.isArray(a)?mt(a):ht(a)})]},t)):null})}function ut({predictions:s,requests:t,metricFieldMap:a}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((n,r)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",n.train_trial_index]}):null,e.jsx("div",{className:"mt-2 w-full",children:n.base64_images&&n.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction image"}),n.base64_images.map(i=>e.jsx("img",{src:"data:image;base64,"+i,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(lt,{stats:n.stats})]}),e.jsx(U,{value:n.predicted_text}),n.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(U,{value:String(n.mapped_output)})]}):null]})}),e.jsx(xt,{predictionAnnotations:n.annotations}),e.jsxs("div",{className:"mx-1",children:[e.jsx("h3",{children:"Metrics"}),e.jsx(Z,{children:Object.keys(n.stats).map((i,c)=>e.jsxs(ee,{children:[a[i]?e.jsx("span",{title:a[i].description,children:a[i].display_name}):e.jsx("span",{children:i}),e.jsx("span",{children:String(n.stats[i])})]},c))})]}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(dt,{request:t[r]})})]})]},r))})})}const ft="correct";function gt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,a)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(n=>e.jsx(H,{className:"mx-2",color:n===ft?"green":void 0,children:n}))]},a))})]})}function pt({instance:s,requests:t,predictions:a,metricFieldMap:n}){return e.jsxs("div",{children:[e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(Ie,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('
0?e.jsx(gt,{references:s.references}):null}),e.jsx("div",{children:a&&t?e.jsx(ut,{predictions:a,requests:t,metricFieldMap:n}):null})]})}async function jt(s,t,a){try{return await(await fetch(L(`/runs/${a||O()}/${s}/instances.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}async function bt(s,t,a){try{return await(await fetch(L(`/runs/${a||O()}/${s}/display_predictions.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name==="AbortError"&&console.log(n),[]}}async function wt(s,t,a){try{return await(await fetch(L(`/runs/${a||O()}/${s}/display_requests.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}const W=10;function vt({runName:s,suite:t,metricFieldMap:a}){const[n,r]=Y(),[i,c]=l.useState([]),[o,h]=l.useState(),[u,v]=l.useState(),[j,E]=l.useState(1);l.useEffect(()=>{const f=new AbortController;async function A(){const M=f.signal,[d,b,x]=await Promise.all([jt(s,M,t),bt(s,M,t),wt(s,M,t)]);c(d);const w={};x.forEach(I=>{var D;const N=I.instance_id,C=((D=I.perturbation)==null?void 0:D.name)||"";w[N]===void 0&&(w[N]={}),w[N][C]===void 0&&(w[N][C]=[]),w[N][C].push(I)}),v(w);const R={};b.forEach(I=>{var D;const N=I.instance_id,C=((D=I.perturbation)==null?void 0:D.name)||"";R[N]===void 0&&(R[N]={}),R[N][C]===void 0&&(R[N][C]=[]),R[N][C].push(I)}),h(R)}return A(),()=>f.abort()},[s,t]);const g=i.slice((j-1)*W,(j-1)*W+W),y=Math.ceil(i.length/W);l.useEffect(()=>{const f=n.get("instance");if(f&&!window.helmHasScrolledToInstance&&g.length>0){if(g.findIndex(M=>M.id===f)===-1)return;requestAnimationFrame(()=>{const M=document.getElementById(`instance-${f}`);M&&M.scrollIntoView({behavior:"smooth"})}),window.helmHasScrolledToInstance=!0}},[n,j,r,g]);const m=f=>f.perturbation===void 0?`Instance id: ${f.id} [split: ${f.split}]`:`Instance id: ${f.id} [split: ${f.split}][perturbation: ${f.perturbation.name}]`;return o===void 0||u===void 0?e.jsx(k,{}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:g.map((f,A)=>{var M,d;return e.jsxs("div",{id:"instance-"+f.id,className:"border p-4",children:[e.jsxs("div",{className:"flex items-center justify-between",children:[e.jsx("h3",{className:"text-xl mb-4",children:m(f)}),e.jsx("button",{className:"btn btn-sm normal-case px-2 py-1",onClick:()=>{const b=window.location.href+(window.location.href.includes("?")?"&instance=":"?instance=")+f.id;navigator.clipboard.writeText(b)},children:"Copy Link"})]}),e.jsx(pt,{instance:f,requests:u[f.id][((M=f.perturbation)==null?void 0:M.name)||""],predictions:o[f.id][((d=f.perturbation)==null?void 0:d.name)||""],metricFieldMap:a},`${f.id}-${A}`)]})})}),e.jsx(ae,{className:"flex justify-center my-8",onNextPage:()=>{const f=Math.min(j+1,y);E(f),n.set("instancesPage",String(f)),r(n)},onPrevPage:()=>{const f=Math.max(j-1,1);E(f),n.set("instancesPage",String(f)),r(n)},currentPage:j,totalPages:y})]})}async function Nt(s,t,a){try{return await(await fetch(L(`/runs/${a||O()}/${s}/stats.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function At({stat:s,metricFieldMap:t}){const a=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),a]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),a]})}const q=50,me=["name","mean","min","max","sum","sum_squared","variance","stddev"];function Et({runName:s,suite:t,metricFieldMap:a}){const[n,r]=Y(),[i,c]=l.useState(),[o,h]=l.useState(1),[u,v]=l.useState("");if(l.useEffect(()=>{const g=new AbortController;async function y(){const m=g.signal,f=await Nt(s,m,t);c(f)}return y(),()=>g.abort()},[s,t]),i===void 0||i.length===0)return e.jsx(k,{});const j=Math.floor(i.length/q),E=i.slice((o-1)*q,(o-1)*q+q);return e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:g=>v(g.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:me.map(g=>e.jsx("th",{children:g},g))})}),e.jsx("tbody",{children:E.filter(g=>!u||g.name.name.toLowerCase().includes(u.toLowerCase())).map(g=>e.jsx("tr",{children:me.map(y=>{const m=g[y];return typeof m=="number"?e.jsx("td",{children:m}):e.jsx("td",{children:e.jsx(At,{stat:g,metricFieldMap:a})},y)})}))})]})}),e.jsx(ae,{className:"flex justify-center my-8",onNextPage:()=>{const g=Math.min(o+1,j);h(g),n.set("metricsPage",String(g)),r(n)},onPrevPage:()=>{const g=Math.max(o-1,1);h(g),n.set("metricsPage",String(g)),r(n)},currentPage:o,totalPages:j})]})}function yt(){const{runName:s}=xe(),[t,a]=l.useState(0),[n,r]=l.useState(),[i,c]=l.useState(),[o,h]=l.useState(),[u,v]=l.useState(),[j,E]=l.useState({}),[g,y]=l.useState({});return l.useEffect(()=>{const m=new AbortController;async function f(){const A=m.signal;if(s===void 0)return()=>m.abort();const M=window.SUITE?window.SUITE:at(await nt(A),s);c(M);const[d,b,x]=await Promise.all([st(s,A,M),et(s,A,M),P(A)]);r(d),v(b),y(x.metrics.reduce((w,R)=>(w[R.name]=R,w),{})),E(x.adapter.reduce((w,R)=>(w[R.name]=R,w),{})),h(x.models.find(w=>w.name===(d==null?void 0:d.adapter_spec.model)))}return f(),()=>m.abort()},[s]),n===void 0||u===void 0||s===void 0||i===void 0||g===void 0?e.jsx(k,{}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[u.name,e.jsx("a",{href:"/#/groups/"+u.name,children:e.jsx(Es,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(_,{value:u.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:n.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(_,{value:(o==null?void 0:o.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:u.tags.map(m=>e.jsx(H,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:m})}))})]})}),e.jsxs(V,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(vs,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:ye(n.name,i),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:tt(n.name,i),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(Z,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(n.adapter_spec).map(([m,f],A)=>e.jsxs(ee,{className:A<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:j[m]?j[m].description:void 0,children:`${m}: `}),e.jsx("span",{className:"overflow-x-auto",children:f})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(te,{children:[e.jsx(Q,{size:"lg",active:t===0,onClick:()=>a(0),children:"Instances + Predictions"}),e.jsx(Q,{size:"lg",active:t===1,onClick:()=>a(1),children:"All metrics"})]})}),t===0?e.jsx(vt,{runName:s,suite:i,metricFieldMap:g}):e.jsx(Et,{runName:s,suite:i,metricFieldMap:g})]})}function Mt(){const[s,t]=l.useState(void 0),[a,n]=l.useState(void 0),[r,i]=l.useState(void 0);if(l.useEffect(()=>{const o=new AbortController;async function h(){const u=P(o.signal),v=Ne(o.signal),j=await u;t(j);const E=await v,g=[];E.forEach(y=>{y.rows.forEach(m=>{g.push({title:String(m[0].value),name:m[0].href.replace("?group=","")})})}),n(g)}return h(),()=>o.abort()},[]),s===void 0||a===void 0)return e.jsx(k,{});if(a.length===0)return e.jsxs(e.Fragment,{children:[e.jsx(T,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]});const c=r!==void 0?r:a[0].name;return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(T,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",onChange:o=>i(o.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:a.map((o,h)=>e.jsx("option",{value:o.name,children:o.title},h))})]})]}),e.jsx(Ee,{schema:s,runGroupName:c},c)]})}const Rt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,It=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Lt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Rt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:It,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function Le({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,a)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},a):e.jsx(p,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},a)}))})]})}function Se({runGroups:s}){const t=new Map(s.filter(r=>r.metric_groups!==void 0&&(r.subgroups===void 0||r.subgroups.length===0)).map(r=>[r.name,r])),a=new Set,n=[];return s.forEach(r=>{const i=r.subgroups?r.subgroups:[],c=[];i.forEach(o=>{const h=t.get(o);h&&(c.push(h),a.add(h.name))}),c.length>0&&n.push([r,c])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[a.size," scenarios"]}),e.jsx("ul",{children:n.map(([r,i])=>e.jsxs("li",{className:"my-3",children:[e.jsx(p,{className:"text-black",to:"groups/"+r.name,children:e.jsx("h2",{children:r.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:i.map(c=>c.todo?e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name):e.jsx(p,{className:"text-black",to:"groups/"+c.name,children:e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name)}))})]},r.name))})]})}const ke=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function S({runGroupName:s=void 0,tableIndexToDisplay:t=0,numRowsToDisplay:a=10,sortColumnIndex:n=1}){const[r,i]=l.useState(void 0),[c,o]=l.useState(void 0);return l.useEffect(()=>{const h=new AbortController;async function u(){const v=await P(h.signal);i(v);const j=v.run_groups;if(j.length===0)return;const E=s||j[0].name,g=await Ae(E,h.signal);o(g[t])}return u(),()=>h.abort()},[s,t]),r===void 0||c===void 0?e.jsx(k,{}):e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 overflow-x-auto",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx(ne,{schema:r,groupTable:c,numRowsToDisplay:a,sortColumnIndex:n,displayColumnIndexes:[0,1],sortable:!1,miniStyle:!0})})}function St(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:ke,alt:"HELM Hero",className:"object-contain w-96"})}),e.jsxs("div",{className:"py-2 rounded-xl bg-gray-100 h-full",children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-2 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Ce=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Te=""+new URL("aisingapore-6dfc9acf.png",import.meta.url).href,Pe=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,Be=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,De=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,He=""+new URL("cohere-3550c6cb.png",import.meta.url).href,Ue=""+new URL("eleutherai-b9451114.png",import.meta.url).href,Oe=""+new URL("google-06d997ad.png",import.meta.url).href,re="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAoAAAAEBCAMAAADfF+TxAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAC0FBMVEUAAAAgcMIfcMEfcMEfccEfcMEfcMEfb8Efb8AfccAfcMEfbsEfbMEAAP8gcMIggL8fcMEfcMEfcMAfcMEeccMecMEecMIfcMIfcMAeccEndsQfbsEfcMAfb8EfccAeb8EXdLkgccEfcMEfcMEecMEhccEAgP8fb8EAVaoeb8IfcMIeccEaZswebsIfcMEgcMEac78fccIeacMfcMEfcMEecMEcccYrgNUfcMIeb8EfcMIfccEfcMEeb8AfcMIjdMUfcMEfcMEjcsEcccYecMAeccEfb8IfcMEgccIfcMEkbcgfb8IfccEfcMEdbb0fcMIkbcIfcMEba7wfcMIfcMEfcMEfcMEfb8IgccAfccAgcMEfcMEgcMEfb8EfccAfcMEfccAeb8MgcMEgc78ecMIfccEfcMEhb8IgccEfb8IfcMEecMEfcMIdcb8fcMEdbMQfcMEfcMEVar8fb8IfcMEgccIfcMAfcMEgcL8gcMEfcMEfcMEgccIicsMfcMIeb8AecMEfcMIfccEeb8AfcMIgccEgcMAgcL8gcL8fb8EfcMEfb8EfccEfcMEAgL8fcMEgb8Ifb8Igdb8gccIfcMIfcMEfb8Egb8EcccEfb8EeccAfcMEgccEfcMEdb8AgcMEgb8EzZswkbbYecMAgcr8hccIgcMEfccEgcMIgcMEfb8Edbr8fccEdb8Efb8Eccb0fb8AfcMEgccIfcMEfcMAhb8EecMEfcMEfcMAfcMIfcMAgcMEfcMEecMAfcMEid7sfcsAecb8ib8QfcMEfcMEfb8EfcMEeb78ecMEfcMEfcMAhb7wfcMAfcsEfccAfccEfcMEeb8AfcMAccb8fcMEdccAgcMIgcL8bbb8fb8EfcMEfcMEeb8EfccEgcMEecMAeb8EecMAeb8AebsMgccIeccQgcb8gcMEhb8QfcMIfccEecMEebcIfb8EfcMH////Dl82sAAAA7nRSTlMAcM/y7uPRvqeKa0ohAYkI8ciSWyLiVOCbVg06q/ezXguR9vCgRgLnA1zcbwpD6oAUmhH59Y8SBmR3S7zZVaIW+7YdCW1mU8Sq/g51zJ0jwxX6E31itN23YVGy5bG1o8t6TMooO9X8LoiF5tq7NP0ai/QMbN9YOaQgkO3kPyaWZbAyc36/eFlAEJXvjIN7BOlHnhhPpvPXVy2uTYRf1jWZZwUHnzg2ocV50qUsrE7HG47oaMCCPpjhco1JwttdlA9BRB5CdHy9PMnT2BevMWqcKUXQJOw9UDAcY63rh96phpd2bjNxK0i6JxlaqCrOBCMOsQAAAAFiS0dE77iw4qEAAAAHdElNRQfoBhEVHhJsM9kZAAAK9ElEQVR42u3d+Z/XRR3A8UEOS/gK2QqILCIL666AEATLEYfIsV/ABBQhFYSVgCXkKpCIU6UAFREKCwrkMlSSzMoslIRKDeUyCcUOOuyev6EUEWH3+92d2fl85j3T6/Wz85mZz/e5DxXmO6u0x+qpC7tIS6h+g4aNLv7Yxy9p3CSjHFQvlUVfqtzWtFkqy1YAzEvxE5d9siAMgJdn3AJsrgEopBYtr2glH6C+0i3A1gAUVGGbq9pKB3i1U3+t2gFQVkXtOxSLBnhNiUuApRqA4rq2YyfBAHVnlwDbAFBi13XpKhfgpxz669YdgDL7dI+eUgE2KHMHsJcGoNQa9C6TCVD3cQewLwAF95l+MgH2d+ZvwEAAiu76QRIB1r/BFcDBGoCyKxoiEKAe6grgMACKr7ynPIDljvwNygJQfg07iwNYONwNwBEagAGU7SINoC51A3AkAMPoxjJhANs48fdZDcBAummULICjx7gAeHOaAFt47JYqWx97a2CNq8VfzY1PcT2fcwHwthQXrIiIiIiIiIiIiIiIiIiIiIiIiIgo/G73WK8qqwnuQOr53VHtKx6f+jo62GqYkPpSOZLvtGo/+XqpL2Oipb+CbOpLBaDT7syIAGh7RcIkDcDAmywCoG5uB7ARAEMHWCEDYGsrf3dpAIYOUE8RAdDuioQuAAwfYLkIgHZXJHwegOEDnNpEBECbKxKmaQCGD1BPFwHQ5oqESgDGAHBGiQSAupc5wC8AMAaAVf8w2gvAmcb+7tYAjAJgfxEAZ802BTgHgHEAnDtPAkD9RUN/mS8BMA6Aer4IgDcZAlygARgJwHtEAMwuNAP4ZQDGAnBRiQSAeoSRv+KvADAWgBf+5l5PAEcaAVzs6V2pgR4bHCnAShEA9RITgEt9ASQiIiIiIiIiIiIiIiIiIiIiIiKi/4uWVVi0PKnVrLBZTcU8m6nuHWnbfaZT3T8yoe7sWdPUK5OaeuRXHX3kVpcmtUgK4NesDt6uspmqh/U539WmU61J7MhxTVckFDdM8SsXAEwJ4ANyAPatYeYEvwwCQG8AH5QDsKYrEpYCMEKAD8kBqAfnnXjtwwCMEOAQQQCH5Z14nQZghAAnCwKY/4qERwAYI8AVggDmvSKhbBEAYwS4XhLAfFckbNAAjBBgYYkkgPmuSPg6AGMEeJESBfDmnNN2+gYAYwQ4URbAjTmnfVQDMEaAK2UB1N/MNe23ABglwE3CAG7OMevwdgCMEWB2gDCAt2aqn/XbGoAxAvyOEgZQb6l+1usBGCXAreIAVn86p+toAEYJsFQcwNuLq5v0MR0GwLYDLBqTFMBtNqsZkLGZqlM3u9aaT7WqW7JlXO6vtm1XREREREREREREREREREREREREJKgNOyyak9RqlmyV207rXfVIY3nnLo3blcrr6OzoI4/hRHQqbbQ/gWl1Ijpr+s8XnJ3u8aSn4ki+h2ZNUekC/K7pgEkfzNa0yPRHaw0A5QNcrVIGuNv0Yo0nPphtqPHlHgCUD3DGqrQB1nvSdMRTZ2bbYzpuCQDFA8xOVqkD/J7dFQmtTC9EGKkAKB7gpSp9gMbXS565IuEq04lGAFA8wKe3eQCobrS6IsH0QoTsQgBKB3jdXuUDYAfTId9XFhciDFMAFA5w1jrlBWDJM6b/p1Si1A9M5xkMQOkA6/yqbT/g1aZjFiv1rOlP12wACgf4Q+UL4I9Mx/xYzZ5qOKSvAqBsgFuVN4CZ5wzHLCr7iek0ywAoG+Ada/0BVM+bDvppheGA0WMAKBrg2KbKI8BdpoN+ZnquYIICoGSAT25TPgGqfUlv8AUASgb4YrHyC7Ay4Q0W7gegYIDTHe3K/gOelvAO9yjvABf+3KKdSQHs9pKgdjn7sbKZ/cD7Qw8mu8WFdV0fERERERERERERERERERERERERCelgpUW/SGo1BdMFNanM0a5+afOOrzw3vqTS/eZecri+OsWJ6Nz9ap6bXRXV9cRxX/d7e9nl+gCYUK/0lAFwmfOdNSwGYAAA9auzRQDs1t31xp5XAAwBoP71IQkA1VjX+9oFwDAA6idWSQD4guNdPZcBYCAA9f3bBQA0vnfX6AunAJQMUL9W4h+g+c3j+XsdgOEA1JcJANjP6Y6uVQAMCKAe7x9g02YuN/QgAIMCWPi6d4DqHpcbegqAQQHU+/Z7B7jJ4XY2KgCGBVAvz/gGuG2Ru93cB8DQAOrDvgGqI+42cxSAwQGcW+Ab4DFnezmuABgcQD3ON0Dj3xqSs90ADBDgmctsPQJUL7rayRsADBHgbw55Bvimo41UKFEAp5VadCIpgKP6yW2J9a6usHnHd134lOLmbvbx24TWR0RERERERERERERERERERERERD7bctii0qRWM9tmNYetbjJdMMSuk+ZTvWWzqWnVP+vQEPsap7A+42I4EW11gVUPywPFFieBXZ44blpkfxZ6dwrrA2DSAN/2C1DtsPaXbQLACACeyvgF2MEa4DAFwAgAVvlad8oAS66xXfglAIwCYKlfgOoyy3UPPATAKAC+4xngAst1X68AGAXA2zwDVPtcf7MegEEB/J1vgJutll04HIBxAMw29QzwqNWyf68AGAdAXeAZoDpu87wTAIwF4Mu+Afa2eNya7QCMBWA/3wAHZc0f11IBMBaAHX0DVDeZP+4gAKMBeJ93gIeNnzajBIDRAPyDd4BdR5s+7XkFwGgA9vAOUE0wfdrdAIwH4B/9Ayw1fNg+JRbglNMWPZYUwEE2qzlt9fssO/e2rbHpVH+y2dSf8z1xnuGS16W8PiIiIiIiIiIiIiIiIiIiIiIiInLaBo9V/fbssX4xtOX8TRWk9/6qdKgWy+1QCyaNk9qD0h6remL2Ih1Dfzl/U/XSe39VGt6u5scsrQXAoqT2AMAE+qscgOptN9+iB2BAXZMRBLB5jU95LgPAuAC2V4IA1nxjea2+QgrAgHpUEkDVvqanPAXAuADObSsK4LoaHnJcATAugBOVKIAlz+R/yGkARgbwXVkAa7ixPLsQgHEBfHitMID5byx/VgEwLoCVShjA/DeWDwFgXABHnxQHMN+N5d3HADAugP2VOID5bixfrgAYF8At8gCqRrkf0Q+AcQGcqQQCXJnzCc1uAGBUALM7JQI8kPPG8h0KgFEBLFcSAea+sfwYAKMCOPWoTICHczyg6p9Zpg/wao8trrLPjtOD7q1qP7zF6b2/HHXNsd75tT44/05Se+A7CURERERERERERERERERERERERG56w2P7q6zmZEFoDa/5Fe9P7/3lrdVHln3AkEmTpPbAiei6Na4WAP2fiD7TR7+e2doQIEfyZXZ1iQoHoHr13NChAIwA4MDDtfrwxAB86MOR9W8AYPgAT9XyOxliABZ8+PXMxxUAgwf4t70qMIDqlbMj3wVg6ADbdcyo4ACeXcmpbQAMHGDF380/dv8A/zH1zMClCoBBA6y/cpsKEaC6+MzANwEYMsBZ/ddb/YtPAMBl74+7vBiAAQN8eqflf3kJANit+3vj5igABgvwlRNKhQtQtXlv3AoABgowO3OTxV+3SwL4z/8N26cAGCTAwpZ3WZ33kASw7VytuwAwQIBTh3XsanngSBJA9S+t/w3A0ADOGtZxvbJOFMAN+j8KgEEBvPXI/FF1OnIpCmDZohEADAXgrH2vbd5wqM5nfkUBVEcWygK4x2N9quzz/BtSl2416slyo8bOzNWe8kce2H3LsSXb3Rw675Pe+6tFe6328HhSe/gvDuj5ccZcNDsAAAAldEVYdGRhdGU6Y3JlYXRlADIwMjQtMDYtMTdUMjE6MzA6MTgrMDA6MDDt4fgHAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDI0LTA2LTE3VDIxOjMwOjE4KzAwOjAwnLxAuwAAAABJRU5ErkJggg==",Fe=""+new URL("meta-5580e9f1.png",import.meta.url).href,_e=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,Ve=""+new URL("mistral-18e1be23.png",import.meta.url).href,ze=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,We=""+new URL("openai-3f8653e4.png",import.meta.url).href,qe=""+new URL("tii-24de195c.png",import.meta.url).href,Ge=""+new URL("together-a665a35b.png",import.meta.url).href,Qe=""+new URL("tsinghua-keg-97d4b395.png",import.meta.url).href,Ke="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",Je=""+new URL("yandex-38e09d70.png",import.meta.url).href,Xe=""+new URL("01-694cb9b7.png",import.meta.url).href,kt=[Ce,Te,Pe,Be,De,He,Ue,Oe,re,Fe,_e,Ve,ze,We,qe,Ge,Qe,Ke,Je,Xe];function he(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const a=new AbortController;async function n(){const r=await P(a.signal);t(r)}return n(),()=>a.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(St,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:kt.map((a,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:a,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Le,{models:s.models}),e.jsx(Se,{runGroups:s.run_groups})]})})]})]}):null}function Ct(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(S,{})})]})]})}const Tt=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;function Pt(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:Tt,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Bt=""+new URL("scb10x-204bd786.png",import.meta.url).href,Dt=""+new URL("scbx-71e53e72.jpg",import.meta.url).href;function Ht(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://scbx.com/",children:e.jsx("img",{src:Dt,alt:"Logo",className:"inline h-32 mx-4 my-4"})}),e.jsx("a",{href:"https://scb10x.com/",children:e.jsx("img",{src:Bt,alt:"Logo",className:"inline h-32 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(p,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Ut=""+new URL("wellsfargo-a86a6c4a.png",import.meta.url).href;function Ot(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{children:e.jsx("a",{href:"https://wellsfargo.com/",children:e.jsx("img",{src:Ut,alt:"Logo",className:"mx-auto block my-4 w-48"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.wellsfargo.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Wells Fargo"}),", we introduce the ",e.jsx("span",{className:"font-bold",children:"HELM Finance"})," ","leaderboard for ecologically-valid evaluations of leading language models in the financial domain. The leaderboard evaluates the ability of language models to perform tasks from financial professions on publicly financial documents across a range of scenarios."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(p,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Ft=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function _t({metricFieldMap:s,metricGroups:t}){const a=new Set,n=[];return t.forEach(r=>{const i=[];r.metrics.forEach(c=>{const o=s[c.name];o&&(i.push(o),a.add(o.name))}),i.length>0&&n.push([r,i])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[a.size," metrics"]}),e.jsx("ul",{children:n.map(([r,i])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:r.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:i.map(c=>e.jsx("li",{className:"ml-4",children:c.display_name},c.name))})]},r.name))})]})}function Vt(){const[s,t]=l.useState(void 0);l.useEffect(()=>{const n=new AbortController;async function r(){const i=await P(n.signal);t(i)}return r(),()=>n.abort()},[]);const a=s?s.metrics.reduce((n,r)=>(n[r.name]=r,n),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Ft,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&a?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(Le,{models:s.models}),e.jsx(Se,{runGroups:s.run_groups}),e.jsx(_t,{metricFieldMap:a,metricGroups:s.metric_groups})]}):null]})}const zt=""+new URL("vhelm-framework-a1ca3f3f.png",import.meta.url).href,Wt=""+new URL("vhelm-model-8afb7616.png",import.meta.url).href,qt=""+new URL("vhelm-aspects-1437d673.png",import.meta.url).href;function Gt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Vision-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.07112",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Leaderboard"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Current benchmarks for assessing vision-language models (VLMs) often focus on their perception or problem-solving capabilities and neglect other critical aspects such as fairness, multilinguality, or toxicity. Furthermore, they differ in their evaluation procedures and the scope of the evaluation, making it difficult to compare models. To address these issues, we extend the HELM framework to VLMs to present the Holistic Evaluation of Vision Language Models (VHELM). To address these issues, we introduce VHELM, built on HELM for language models. VHELM aggregates various datasets to cover one or more of the 9 aspects:"," ",e.jsx("b",{children:"visual perception"}),", ",e.jsx("b",{children:"bias"}),", ",e.jsx("b",{children:"fairness"}),", ",e.jsx("b",{children:"knowledge"}),", ",e.jsx("b",{children:"multilinguality"}),", ",e.jsx("b",{children:"reasoning"}),", ",e.jsx("b",{children:"robustness"}),","," ",e.jsx("b",{children:"safety"}),", and ",e.jsx("b",{children:"toxicity"}),". In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. Our framework is designed to be lightweight and automatic so that evaluation runs are cheap and fast. For transparency, we release the raw model generations and complete results on this website."]}),e.jsx("p",{className:"my-4 font-bold",children:"VHELM is intended to be a living benchmark. We hope to continue adding new datasets, models and metrics over time, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:Wt,alt:"A vision-lanuage model (VLM) takes in an image and a text prompt and generates text.",className:""}),e.jsx("img",{src:zt,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Omni), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(S,{}),e.jsx(p,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:qt,alt:"An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation and Safety. ",className:""})})]})}const Qt=""+new URL("accenture-6f97eeda.png",import.meta.url).href,Kt=""+new URL("cresta-9e22b983.png",import.meta.url).href;function Jt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://www.accenture.com/",children:e.jsx("img",{src:Qt,alt:"Logo",className:"inline h-12 mx-4 my-4"})}),e.jsx("a",{href:"https://www.cresta.com/",children:e.jsx("img",{src:Kt,alt:"Logo",className:"inline h-8 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.accenture.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Accenture"})," ","and"," ",e.jsx("a",{href:"https://www.cresta.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Cresta"}),", we introduce the HELM"," ",e.jsx("span",{className:"font-bold",children:"Call Center"})," leaderboard. HELM Call Center is a leaderboard consisting of evaluations of leading language models on scenarios with realistic tasks from the call center context."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(p,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Xt=""+new URL("cuhk-8c5631e9.png",import.meta.url).href;function $t(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.cuhk.edu.hk/",children:e.jsx("img",{src:Xt,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function Yt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Tables"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.ibm.com/",children:e.jsx("img",{src:re,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," leaderboard on HELM. ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," is a holistic evaluation of leading language models that tests their capability to understand, process and analyze structured tabular input data."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(p,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Zt=({id:s,title:t,text:a})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:se(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:a})]})})}));function en(){const[s,t]=l.useState();return l.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(a=>a.json()).then(a=>{t(a)}).catch(a=>{console.error("Error fetching JSON:",a)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((a,n)=>a.id==="home"?null:e.jsx(Zt,{id:a.id,title:a.title,text:a.description},n))})})}function sn(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl + mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:ke,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const tn=[Ce,Te,Pe,Be,De,He,Ue,Oe,re,Fe,_e,Ve,ze,We,qe,Ge,Qe,Ke,Je,Xe];function nn(){const[s,t]=l.useState(void 0);return l.useEffect(()=>{const a=new AbortController;async function n(){const r=await P(a.signal);t(r)}return n(),()=>a.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(sn,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(en,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:tn.map((a,n)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:a,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},n))})})})]})})]}):null}const an=""+new URL("overview-74aea3d8.png",import.meta.url).href,rn=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function ln(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.22456",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:an,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms from ArXiV papers."}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript. ..."}),e.jsx("li",{children:"Music sheets: crops of measures from IMSLP music sheets."})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(S,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:rn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function cn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function on(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Medical"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-2",children:"With the increasing scale and impact of language models, there has also been interest interest in using language models in the medical domain. However, the capabilities and risks of these models are not well-understood, and there is significant potential for harm in the medical setting."}),e.jsxs("p",{className:"my-2",children:["To address this, we present the"," ",e.jsx("a",{className:"font-bold",href:"https://arxiv.org/abs/2405.09605",children:"HELM Medical"})," ","leaderboard for evaluation of language models in the medical domain. The HELM Medical leaderboard presents evaluations of leading general-purpose language models as well as language models fine-tuned on the medical domain. These models are evaluated on a range of medical tasks based on the benchmarks used in"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2212.13138",children:"Singhal et al. 2022"}),". We hope that this leaderboard encourages further work in evaluating language models on tasks from the medical domain."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(p,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(p,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const dn=""+new URL("helm-safety-2907a7b6.png",import.meta.url).href;function mn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"HELM Safety"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:dn,alt:"Logo",className:"mx-auto p-0 block",style:{width:"300px"}}),e.jsx("p",{children:"Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized public benchmarking of such models is vital. While language models are routinely evaluated on standard capability benchmarks, comparable standardization for benchmarking safety risks lags behind. To address this gap, we introduce HELM-Safety as a collection of 5 safety benchmarks that span 6 risk categories (e.g. violence, fraud, discrimination, sexual, harassment, deception). We present evaluation results for recent leading open weights and closed models."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/11/08/helm-safety.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(S,{})})]})]})}function hn(){return window.PROJECT_ID==="lite"?e.jsx(he,{}):window.PROJECT_ID==="instruct"?e.jsx(Lt,{}):window.PROJECT_ID==="image2struct"?e.jsx(ln,{}):window.PROJECT_ID==="heim"?e.jsx(Vt,{}):window.PROJECT_ID==="mmlu"?e.jsx(Ct,{}):window.PROJECT_ID==="vhelm"?e.jsx(Gt,{}):window.PROJECT_ID==="air-bench"?e.jsx(Pt,{}):window.PROJECT_ID==="thaiexam"?e.jsx(Ht,{}):window.PROJECT_ID==="finance"?e.jsx(Ot,{}):window.PROJECT_ID==="call-center"?e.jsx(Jt,{}):window.PROJECT_ID==="cleva"?e.jsx($t,{}):window.PROJECT_ID==="tables"?e.jsx(Yt,{}):window.PROJECT_ID==="ewok"?e.jsx(cn,{}):window.PROJECT_ID==="medical"?e.jsx(on,{}):window.PROJECT_ID==="safety"?e.jsx(mn,{}):window.PROJECT_ID==="home"?e.jsx(nn,{}):e.jsx(he,{})}function xn(){return e.jsx(es,{children:e.jsx(ss,{children:e.jsxs(B,{path:"/",element:e.jsx(Fs,{}),children:[e.jsx(B,{index:!0,element:e.jsx(hn,{})}),e.jsx(B,{path:"leaderboard",element:e.jsx(Mt,{})}),e.jsx(B,{path:"models",element:e.jsx(qs,{})}),e.jsx(B,{path:"scenarios",element:e.jsx(Gs,{})}),e.jsx(B,{path:"groups",element:e.jsx(Xs,{})}),e.jsx(B,{path:"groups/:groupName",element:e.jsx($s,{})}),e.jsx(B,{path:"runs",element:e.jsx(Zs,{})}),e.jsx(B,{path:"runs/:runName",element:e.jsx(yt,{})})]})})})}$.createRoot(document.getElementById("root")).render(e.jsx(ts.StrictMode,{children:e.jsx(xn,{})})); diff --git a/src/helm/benchmark/static_build/index.html b/src/helm/benchmark/static_build/index.html index 40bce719f1..48e8bbfbaa 100644 --- a/src/helm/benchmark/static_build/index.html +++ b/src/helm/benchmark/static_build/index.html @@ -7,7 +7,7 @@ Holistic Evaluation of Language Models (HELM) - +