diff --git a/.github/workflows/manage-python-cache.yml b/.github/workflows/manage-python-cache.yml index fab51b4efa..4ef3ef16cc 100644 --- a/.github/workflows/manage-python-cache.yml +++ b/.github/workflows/manage-python-cache.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Check out repository uses: actions/checkout@v4 @@ -31,7 +31,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Check out repository uses: actions/checkout@v4 diff --git a/.github/workflows/test-daily-integration.yml b/.github/workflows/test-daily-integration.yml deleted file mode 100644 index feb0f7e255..0000000000 --- a/.github/workflows/test-daily-integration.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Daily Integration Tests -on: - schedule: - - cron: "30 15 * * *" - -jobs: - test_openvino: - name: Test OpenVINO on Optimum Intel - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11"] - steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - cache: pip - - run: python3 -m pip install --upgrade build - - run: python3 -m build - - run: python3 -m pip install "$(ls dist/crfm_helm-*.whl)[openvino]" - - run: helm-run --run-entries boolq:model=hf-internal-testing/tiny-random-MistralForCausalLM --enable-huggingface-models hf-internal-testing/tiny-random-MistralForCausalLM --suite v1 --max-eval-instances 10 --openvino diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f7845fc43e..ed88ed0448 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Clear free space run: | diff --git a/.github/workflows/test_scenarios.yml b/.github/workflows/test_scenarios.yml index 81f5a820c3..dfbcdd2ee3 100644 --- a/.github/workflows/test_scenarios.yml +++ b/.github/workflows/test_scenarios.yml @@ -7,6 +7,8 @@ on: pull_request: paths: - 'src/helm/benchmark/scenarios/test_*_scenario.py' + schedule: + - cron: "30 15 * * *" jobs: test: @@ -14,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - name: Clear free space run: | @@ -29,4 +31,6 @@ jobs: - name: Install HELM run: ./install-dev.sh && ./pre-commit.sh - name: Run scenario tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: python3 -m pytest -m scenarios diff --git a/CHANGELOG.md b/CHANGELOG.md index aaae5a512b..7058d24e55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,56 @@ ## [Upcoming] +## [v0.5.4] - 2024-10-09 + +### Breaking Changes + +- Python 3.8 is no longer supported - please use Python 3.9 to 3.11 instead.(#2978) + +### Scenarios + +- Fix prompt for BANKING77 (#3009) +- Split up LINDSEA scenario (#2938) +- Normalize lpips and ssim for image2struct (#3020) + +### Models + +- Add o1 models (#2989) +- Add Palmyra-X-004 model (#2990) +- Add 
Palmyra-Med and Palmyra-Fin models (#3028) +- Add Llama 3.2 Turbo models on Together AI (#3029) +- Add Llama 3 Instruct Lite / Turbo on Together AI (#3031) +- Add Llama 3 CPT SEA-Lion v2 models (#3036) +- Add vision support to Together AI client (#3041) + +### Frontend + +- Display null annotator values correctly in the frontend (#3003) + +### Framework + +- Add support for Python 3.11 (#2922) +- Fix incorrect handling of ties in win rate computation (#3001, #2008) +- Add mean row aggregation to HELM summarize (#2997, #3030) + +### Developer Workflow + +- Move pre-commit to pre-push (#3013) +- Improve local frontend pre-commit (#3012) + +### Contributors + +Thank you to the following contributors for your work on this HELM release! + +- @brianwgoldman +- @chiheem +- @farzaank +- @JoelNiklaus +- @liamjxu +- @teetone +- @weiqipedia +- @yifanmai + ## [v0.5.3] - 2024-09-06 ### Breaking Changes @@ -627,7 +677,8 @@ Thank you to the following contributors for your contributions to this HELM rele - Initial release -[upcoming]: https://github.com/stanford-crfm/helm/compare/v0.5.3...HEAD +[upcoming]: https://github.com/stanford-crfm/helm/compare/v0.5.4...HEAD +[v0.5.3]: https://github.com/stanford-crfm/helm/releases/tag/v0.5.4 [v0.5.3]: https://github.com/stanford-crfm/helm/releases/tag/v0.5.3 [v0.5.2]: https://github.com/stanford-crfm/helm/releases/tag/v0.5.2 [v0.5.1]: https://github.com/stanford-crfm/helm/releases/tag/v0.5.1 diff --git a/CITATION.bib b/CITATION.bib new file mode 100644 index 0000000000..2ebc5f4c66 --- /dev/null +++ b/CITATION.bib @@ -0,0 +1,10 @@ +@article{ +liang2023holistic, +title={Holistic Evaluation of Language Models}, +author={Percy Liang and Rishi Bommasani and Tony Lee and Dimitris Tsipras and Dilara Soylu and Michihiro Yasunaga and Yian Zhang and Deepak Narayanan and Yuhuai Wu and Ananya Kumar and Benjamin Newman and Binhang Yuan and Bobby Yan and Ce Zhang and Christian Alexander Cosgrove and Christopher D Manning and Christopher Re and Diana Acosta-Navas and Drew Arad Hudson and Eric Zelikman and Esin Durmus and Faisal Ladhak and Frieda Rong and Hongyu Ren and Huaxiu Yao and Jue WANG and Keshav Santhanam and Laurel Orr and Lucia Zheng and Mert Yuksekgonul and Mirac Suzgun and Nathan Kim and Neel Guha and Niladri S. Chatterji and Omar Khattab and Peter Henderson and Qian Huang and Ryan Andrew Chi and Sang Michael Xie and Shibani Santurkar and Surya Ganguli and Tatsunori Hashimoto and Thomas Icard and Tianyi Zhang and Vishrav Chaudhary and William Wang and Xuechen Li and Yifan Mai and Yuhui Zhang and Yuta Koreeda}, +journal={Transactions on Machine Learning Research}, +issn={2835-8856}, +year={2023}, +url={https://openreview.net/forum?id=iO4LZibEqW}, +note={Featured Certification, Expert Certification} +} diff --git a/README.md b/README.md index 1808f831a4..5fded05017 100644 --- a/README.md +++ b/README.md @@ -17,35 +17,28 @@ Welcome! The **`crfm-helm`** Python package contains code used in the **Holistic To get started, refer to [the documentation on Read the Docs](https://crfm-helm.readthedocs.io/) for how to install and run the package. -# Holistic Evaluation of Text-To-Image Models - - - -Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as -input and generate images. As these models are widely used in real-world applications, there is an urgent need to -comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text -alignment and image quality. 
To address this limitation, we introduce a new benchmark, -**Holistic Evaluation of Text-To-Image Models (HEIM)**. - -We identify 12 different aspects that are important in real-world model deployment, including: - -- image-text alignment -- image quality -- aesthetics -- originality -- reasoning -- knowledge -- bias -- toxicity -- fairness -- robustness -- multilinguality -- efficiency - -By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. -Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all -models across all aspects. Our results reveal that no single model excels in all aspects, with different models -demonstrating strengths in different aspects. - -This repository contains the code used to produce the [results on the website](https://crfm.stanford.edu/heim/latest/) -and [paper](https://arxiv.org/abs/2311.04287). +## Papers + +This repository contains code used to produce results for the following papers: + +- **Holistic Evaluation of Vision-Language Models (VHELM)** - [paper](https://arxiv.org/abs/2410.07112), [leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/vhelm/) +- **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/) + +The HELM Python package can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [main Reproducing Leaderboards documentation](https://crfm-helm.readthedocs.io/en/latest/reproducing_leaderboards/). + +## Citation + +If you use this software in your research, please cite the [Holistic Evaluation of Language Models paper](https://openreview.net/forum?id=iO4LZibEqW) as below. + +```bibtex +@article{ +liang2023holistic, +title={Holistic Evaluation of Language Models}, +author={Percy Liang and Rishi Bommasani and Tony Lee and Dimitris Tsipras and Dilara Soylu and Michihiro Yasunaga and Yian Zhang and Deepak Narayanan and Yuhuai Wu and Ananya Kumar and Benjamin Newman and Binhang Yuan and Bobby Yan and Ce Zhang and Christian Alexander Cosgrove and Christopher D Manning and Christopher Re and Diana Acosta-Navas and Drew Arad Hudson and Eric Zelikman and Esin Durmus and Faisal Ladhak and Frieda Rong and Hongyu Ren and Huaxiu Yao and Jue WANG and Keshav Santhanam and Laurel Orr and Lucia Zheng and Mert Yuksekgonul and Mirac Suzgun and Nathan Kim and Neel Guha and Niladri S. 
Chatterji and Omar Khattab and Peter Henderson and Qian Huang and Ryan Andrew Chi and Sang Michael Xie and Shibani Santurkar and Surya Ganguli and Tatsunori Hashimoto and Thomas Icard and Tianyi Zhang and Vishrav Chaudhary and William Wang and Xuechen Li and Yifan Mai and Yuhui Zhang and Yuta Koreeda}, +journal={Transactions on Machine Learning Research}, +issn={2835-8856}, +year={2023}, +url={https://openreview.net/forum?id=iO4LZibEqW}, +note={Featured Certification, Expert Certification} +} +``` \ No newline at end of file diff --git a/docs/adding_new_models.md b/docs/adding_new_models.md index 79c66a637f..47c9b25539 100644 --- a/docs/adding_new_models.md +++ b/docs/adding_new_models.md @@ -60,7 +60,8 @@ Examples of common arguments within `args`: - Revision: `revision: my_revision` - Quantization: `load_in_8bit: true` - Model precision: `torch_dtype: torch.float16` -- Running remote code: `trust_remote_code: true` +- Model device: `device: cpu` or `device: cuda:0` +- Allow running remote code: `trust_remote_code: true` - Multi-GPU: `device_map: auto` diff --git a/docs/custom_scenarios.md b/docs/adding_new_scenarios.md similarity index 99% rename from docs/custom_scenarios.md rename to docs/adding_new_scenarios.md index 91348b6f1e..1e01b585bc 100644 --- a/docs/custom_scenarios.md +++ b/docs/adding_new_scenarios.md @@ -1,4 +1,4 @@ -# Custom Scenarios +# Adding New Scenarios HELM comes with more than a hundred built-in scenarios. However, you may want to run HELM on a scenario that is not built into HELM yet, or you may want to run HELM on scenarios that use your private datasets. Because HELM is a modular framework with a plug-in architecture, you can run evaluations with your custom scenarios on HELM without needing to modify HELM code. diff --git a/docs/custom_tokenizers.md b/docs/adding_new_tokenizers.md similarity index 98% rename from docs/custom_tokenizers.md rename to docs/adding_new_tokenizers.md index bb52864be9..26f7e0665d 100644 --- a/docs/custom_tokenizers.md +++ b/docs/adding_new_tokenizers.md @@ -1,4 +1,4 @@ -# Custom Tokenizers +# Adding New Tokenizers HELM comes with many built-in tokenizers, but in some cases, you may need to add your own custom tokenizer for your custom model. diff --git a/docs/code.md b/docs/code.md index f2acfa99c5..6f3f1b97a0 100644 --- a/docs/code.md +++ b/docs/code.md @@ -1,5 +1,9 @@ # Code Structure +**Warning** — The document is stale and was last modified more than ten months ago. The information below may be outdated and incorrect. Please proceed with caution! + +## Birds-Eye View + Here's a birds-eye view of how the benchmarking process interacts with the main classes (see `benchmark`): @@ -8,7 +12,7 @@ classes (see `benchmark`): an input (e.g., question) and a set of `Reference` outputs (e.g., multiple choice answers). -- A `DataPreprocessor` takes in a `Scenario` and produces a list of `Instance`s +- A `DataPreprocessor` takes in a `Scenario` and produces a list of `Instance`s. Each `Instance` is given a unique ID. The set of `Instance`s is augmented according to `DataAugmenterSpec`. @@ -45,9 +49,9 @@ There are three types of classes: In order to implement new scenarios: -1. Create a new file as a new Python scenario file in the `scenarios` folder. -2. Within the scenario file, create a `Scenario` class, e.g. `YourScenario`. -3. `YourScenario` should implement `get_instances`, a method that downloads the +1. Create a new Python file in the `scenarios` folder. +1. Within the scenario file, create a `Scenario` class, e.g. `YourScenario`. +1. 
`YourScenario` should implement `get_instances`, a method that downloads the dataset files if they don't already exist and returns a list of `Instance`s. Each `Instance` must have a list of (potentially one) `Reference` answers: a correct answer may be indicated with a `CORRECT_TAG` in @@ -57,48 +61,52 @@ In order to implement new scenarios: 1. For `Scenario`s with datasets that cannot be publicly shared, place a copy of the dataset at path `restricted/` and read from that path. See `NewsQAScenario` and `ICEScenario` for some examples. -4. Note that you need not enumerate every possible correct answer (nor must +1. Note that you need not enumerate every possible correct answer (nor must there even necessarily be a correct answer). -5. Make sure to document your scenario well with a clear docstring. -6. In addition, specify its `name`, `description`, and `tags`. -7. Define a function `get_specname_spec` in `run_specs.py` to retrieve a `ScenarioSpec` +1. Make sure to document your scenario well with a clear docstring. +1. In addition, specify its `name`, `description`, and `tags`. +1. Identify the appropriate metric for your task in one of the `*_metrics.py` files. + If the metric you'd like to use does not exist, follow the directions in [Adding new metrics](#adding-new-metrics). + Many will be in `basic_metrics.py`. +1. Define a function in `run_specs.py` annotated with `run_spec_function` to: + 1. Construct a `ScenarioSpec` for your scenario using a class name corresponding to the Python path of the class (e.g. `helm.benchmark.scenarios.your_scenario.YourScenario`) and any arguments which must be passed as a dictionary of `args`. -8. Have the `get_specname_spec` function retrieve an `AdapterSpec` for your + 1. Construct an `AdapterSpec` for your scenario specifying the type of language model generation which must be performed for the task. -9. Identify the appropriate metric for your task in one of the `*_metrics.py` files. - If the metric you'd like to use does not exist, follow the directions in [Adding new metrics](#adding-new-metrics). - Many will be in `basic_metrics.py`. -10. Have a `get_metric_spec` function retrieve one or more `MetricSpec` + 1. Construct one or more `MetricSpec` objects for your task, specifying the classname with the Python path of the object, with the same arguments as the `ScenarioSpec` constructor. -11. Have the `get_specname_spec` function return a `RunSpec` object, with a + 1. Construct and return `RunSpec` object, with a `name` corresponding to the scenario name and any patterns to match in curly braces, a `scenario_spec`, an `adapter_spec`, `metric_specs`, and `groups`. -12. Attempt to run your task with - `venv/bin/helm-run -r yourscenarioname:arg=value` where - `yourscenarioname` matches the `name` specified in YourScenario -13. Add the spec to dictionary `CANONICAL_RUN_SPEC_FUNCS` in `src/helm/benchmark/run_specs.py`. -14. Update `src/helm/proxy/static/contamination.yaml` with models that we trained on your scenario (i.e. contaminated). -15. Add a schema to `src/helm/benchmark/static/schema.yaml` and add the scenario to `subgroups` as needed. +1. Attempt to run your task with + `venv/bin/helm-run -r yourscenarioname:arg=value` where + `yourscenarioname` matches the `name` specified in YourScenario +1. Update `src/helm/benchmark/static/contamination.yaml` with models that were trained on your scenario (i.e. contaminated). +1. Add a schema to `src/helm/benchmark/static/schema.yaml` and add the scenario to `subgroups` as needed. 
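For concreteness, here is a minimal sketch of the kind of scenario class the steps above describe. The class name, tags, and toy question-answer pairs are purely illustrative, and the exact `Scenario`/`Instance` constructor signatures may differ slightly between HELM versions, so treat this as a starting point rather than a definitive implementation:

```python
# Illustrative only: a toy scenario following the steps described above.
from typing import List

from helm.benchmark.scenarios.scenario import (
    CORRECT_TAG,
    TEST_SPLIT,
    Input,
    Instance,
    Output,
    Reference,
    Scenario,
)


class YourScenario(Scenario):
    """A toy question-answering scenario used to illustrate the structure."""

    name = "your_scenario"
    description = "Toy QA scenario with a handful of hard-coded questions."
    tags = ["question_answering"]

    def get_instances(self, output_path: str) -> List[Instance]:
        # A real scenario would download its dataset into output_path (if it
        # is not already cached) and parse it; here we hard-code two examples.
        examples = [("What is 2 + 2?", "4"), ("What color is the sky?", "blue")]
        return [
            Instance(
                input=Input(text=question),
                # A correct answer is marked with CORRECT_TAG; multiple
                # references (including incorrect ones) are also allowed.
                references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
                split=TEST_SPLIT,
            )
            for question, answer in examples
        ]
```

A matching `run_spec_function` entry would then reference this class by its full Python path, e.g. `helm.benchmark.scenarios.your_scenario.YourScenario`, as described in the steps above.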
## Adding new metrics -To add a new metric: +To add a new metric, first determine if your metric is generic and likely to be widely used, or specific to your task. -1. If the metric is task-specific, create a new `yourtask_metrics.py` file. - Otherwise, if the metric is generic and likely to be widely used, add it - to `basic_metrics.py`. -2. If you are creating a task-specific metric, create a `YourTaskMetric` +* For generic metrics: + 1. Add a method to `basic_metrics.py` which takes two arguments: the `gold` answer and the model's `pred`iction. + 1. Add your method to the `metric_fn_mapping` lookup. +* For task specific metrics: + 1. Create a new `yourtask_metrics.py` file for class `YourTaskMetric` which inherits from `Metric` in `metric.py`. -3. Define methods `__init__` and `evaluate_generation` returning a list of `Stat` objects. -4. Each `Stat` should correspond to a distinct aggregate measurement over the generated examples. + 1. Define methods `__init__` and `evaluate_generation` returning a list of `Stat` objects. + +Your metric is responsible for producing `Stat` objects: + +* Each `Stat` should correspond to a distinct aggregate measurement over the generated examples. Some may have one metric (e.g. accuracy), while others may quantify multiple aspects (e.g. multiple distance metrics). -5. For each `value` generated for a `Stat`, add it to `yourstat` using `yourstat.add(value)`. +* For each `value` generated for a `Stat`, add it to `yourstat` using `yourstat.add(value)`. Usually, there will only be one value for each `Stat`, but multiple can be used, e.g. to show variance. ## Data augmentations diff --git a/docs/developer_adding_new_models.md b/docs/developer_adding_new_models.md index ac606c861b..b91e8e027d 100644 --- a/docs/developer_adding_new_models.md +++ b/docs/developer_adding_new_models.md @@ -1,4 +1,6 @@ -# Adding new models +# Adding New Clients + +**Warning** — The document is stale. The information below may be outdated and incorrect. Please proceed with caution! ## Overview of the process To add a new model you need to define 3 objects: diff --git a/docs/developer_setup.md b/docs/developer_setup.md index cf3ae43a26..81b2ba2ec7 100644 --- a/docs/developer_setup.md +++ b/docs/developer_setup.md @@ -105,3 +105,37 @@ flake8 src scripts # Type checker mypy src scripts ``` + +## Executing helm commands with local modifications + +The recommended way to execute `helm-run`, `helm-summarize`, `helm-server`, etc, with your local version of the repository is to do an editable install, using the following steps: + +1. Activate your virtual environment. +1. Change directory to the repository root (contains setup.cfg). +1. Make sure you don't have an existing helm installation for that environment with `pip uninstall crfm-helm` +1. Run `pip install -e .` + +Now calling `helm-run` while the environment is activated will read from your local source. + +### Without installing + +If you have a compelling reason not to do an editable install, you can execute commands by: + +1. Change directory to `src` +1. Execute the module you want with a command like: `python -m helm.benchmark.run` + +## Checking in code + +The HELM repository does not allow direct modifications of the main branch. Instead, developers create a Pull Request which must then be approved by a different person before merging into main. Here is an example workflow: + +1. `git checkout main` to start from the main branch. +1. `git pull origin main` to get up to date. +1. 
Make whatever changes you'll like to group into a single review. +1. Run tests. +1. Make a new branch with `git checkout -b //` to upload to github. +1. Loading any HELM github page should now prompt you about creating a new pull request. If not, you can also find your branch on [the branches page](https://github.com/stanford-crfm/helm/branches) to create one. +1. Update the title and description as necessary, then create the pull request. +1. Once the reviewer is satisfied, they can approve and either of you can then `Squash and Merge` the branch into main. diff --git a/docs/downloading_raw_results.md b/docs/downloading_raw_results.md index 028826c9f2..d01f631b39 100644 --- a/docs/downloading_raw_results.md +++ b/docs/downloading_raw_results.md @@ -1,16 +1,16 @@ # Downloading Raw Results -All of HELM's raw result data is stored in Google Cloud Storage (GCS) in the public `crfm-helm-public` bucket. If you wish to download the raw result data, you can use Google Cloud's tools to do so. The following walks through how to use the `gsutil` tool ([documentation](https://cloud.google.com/storage/docs/gsutil)) to download the data. +All of HELM's raw result data is stored in Google Cloud Storage (GCS) in the public `crfm-helm-public` bucket. If you wish to download the raw result data, you can use the Google Cloud Platform (GCP) tools to do so. The following walks through how to use the `gcloud storage` command line tool ([documentation](https://cloud.google.com/sdk/gcloud/reference/storage)) to download the data. ## Setup -1. Follow [Google's installation instructions](https://cloud.google.com/storage/docs/gsutil_install) to install `gsutil`. When prompted "Would you like to log in (Y/n)?", respond with no, because the HELM GCS bucket is public and does not require credentials. +1. Follow [Google's installation instructions](https://cloud.google.com/sdk/docs/install) to install `gcloud`. If the installer prompts you to log in, you may skip this step because the HELM GCS bucket allows public unauthenticated access. 2. Create a local directory to store the data: ```sh export LOCAL_BENCHMARK_OUTPUT_PATH=./benchmark_output mkdir $LOCAL_BENCHMARK_OUTPUT_PATH ``` -3. Set the Google Cloud Storage path to the appropriate path: +3. Set the GCS path to the appropriate path: ```sh export GCS_BENCHMARK_OUTPUT_PATH=gs://crfm-helm-public/lite/benchmark_output ``` @@ -28,13 +28,13 @@ Locations of the `benchmark_output` folders for each project: Warning: Downloading a whole HELM project requires a very large amounts of disk space - a few hundred GB for most projects, and more than 1 TB for Classic. Ensure that you have enough local disk space before downloading these projects. -1. (Optional) Use the `gsutil du` ([documentation](https://cloud.google.com/storage/docs/gsutil/commands/du)) command to compute the size of the download and ensure you have enough space on your local disk: +1. (Optional) Use the `gcloud storage du` ([documentation](https://cloud.google.com/sdk/gcloud/reference/storage/du)) command to compute the size of the download and ensure you have enough space on your local disk: ```sh -gsutil du -sh $GCS_BENCHMARK_OUTPUT_PATH +gcloud storage du -sh $GCS_BENCHMARK_OUTPUT_PATH ``` -2. Run `gsutil rsync` ([documentation](https://cloud.google.com/storage/docs/gsutil/commands/rsync)) to download the data to the folder created in the previous step: +2. 
Run `gcloud storage rsync` ([documentation](https://cloud.google.com/sdk/gcloud/reference/storage/rsync)) to download the data to the folder created in the previous step: ```sh -gsutil -m rsync -r $GCS_BENCHMARK_OUTPUT_PATH $LOCAL_BENCHMARK_OUTPUT_PATH +gcloud storage rsync -r $GCS_BENCHMARK_OUTPUT_PATH $LOCAL_BENCHMARK_OUTPUT_PATH ``` ## Download a specific version @@ -56,9 +56,9 @@ export RELEASE_VERSION=v1.0.0 ```sh mkdir $LOCAL_BENCHMARK_OUTPUT_PATH/releases/$RELEASE_VERSION ``` -3. Run `gsutil rsync` ([documentation](https://cloud.google.com/storage/docs/gsutil/commands/rsync)) to download the data to the folder created in the previous step: +3. Run `gcloud storage rsync` ([documentation](https://cloud.google.com/sdk/gcloud/reference/storage/du)) to download the data to the folder created in the previous step: ```sh -gsutil -m rsync -r $GCS_BENCHMARK_OUTPUT_PATH/releases/$RELEASE_VERSION $LOCAL_BENCHMARK_OUTPUT_PATH/releases/$RELEASE_VERSION +gcloud storage rsync -r $GCS_BENCHMARK_OUTPUT_PATH/releases/$RELEASE_VERSION $LOCAL_BENCHMARK_OUTPUT_PATH/releases/$RELEASE_VERSION ``` 4. Inspect the file contents of `$LOCAL_BENCHMARK_OUTPUT_PATH/releases/$RELEASE_VERSION/summary.json`. For _each_ suite listed in the `suites` array field, repeat the steps in **Download a specific suite** for that suite. @@ -72,11 +72,17 @@ export SUITE_VERSION=v1.0.0 ```sh mkdir $LOCAL_BENCHMARK_OUTPUT_PATH/runs/$SUITE_VERSION ``` -3. Run `gsutil rsync` ([documentation](https://cloud.google.com/storage/docs/gsutil/commands/rsync)) to download the data to the folder created in the previous step: +3. Run `gcloud storage rsync` ([documentation](https://cloud.google.com/sdk/gcloud/reference/storage/du)) to download the data to the folder created in the previous step: ```sh -gsutil -m rsync -r $GCS_BENCHMARK_OUTPUT_PATH/runs/$SUITE_VERSION $LOCAL_BENCHMARK_OUTPUT_PATH/runs/$SUITE_VERSION +gcloud storage rsync -r $GCS_BENCHMARK_OUTPUT_PATH/runs/$SUITE_VERSION $LOCAL_BENCHMARK_OUTPUT_PATH/runs/$SUITE_VERSION ``` +## Troubleshooting + +If you are on an older version of `gcloud`, you may encounter the error messages `(gcloud) Invalid choice: 'du'.` or `(gcloud) Invalid choice: 'rsync'.`. If so, you should either upgrade your `gcloud` installation to the latest version, or you may use the deprecated `gsutil` CLI tool ([documentation](https://cloud.google.com/storage/docs/gsutil)) instead. + +To use `gsutil`, install gsutil following [Google's instructions](https://cloud.google.com/storage/docs/gsutil_install), then use the above command with `gcloud storage du` replaced with `gsutil du` ([documentation](https://cloud.google.com/storage/docs/gsutil/commands/du)) and `gcloud storage rsync` replaced with `gsutil rsync` ([documentation](https://cloud.google.com/storage/docs/gsutil/commands/rsync)). + ## GCS browser -If you wish to explore the raw data files in the web browser without downloading it, you can use the [GCS browser](https://console.cloud.google.com/storage/browser/crfm-helm-public). Note that this requires logging into any Google account and agreeing to the Google Cloud Platform Terms of Service. +If you wish to explore the raw data files in the web browser without downloading it, you can use the [GCS browser](https://console.cloud.google.com/storage/browser/crfm-helm-public). Note that this requires logging into any Google account and agreeing to the GCP Terms of Service. 
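For completeness, downloads can also be scripted. The sketch below is an assumption rather than part of the documented HELM workflow: it uses the third-party `google-cloud-storage` Python package with anonymous access to the public bucket, and the suite name shown is only an example.

```python
# Sketch: anonymously download one HELM suite from the public GCS bucket.
# Assumes `pip install google-cloud-storage`; not an official HELM tool.
import os

from google.cloud import storage


def download_suite(suite: str, local_dir: str = "benchmark_output/runs") -> None:
    client = storage.Client.create_anonymous_client()  # public bucket, no credentials needed
    bucket = client.bucket("crfm-helm-public")
    prefix = f"lite/benchmark_output/runs/{suite}/"
    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.endswith("/"):  # skip directory placeholder objects
            continue
        target = os.path.join(local_dir, suite, os.path.relpath(blob.name, prefix))
        os.makedirs(os.path.dirname(target), exist_ok=True)
        blob.download_to_filename(target)


if __name__ == "__main__":
    download_suite("v1.0.0")  # example suite name; see the project list above
```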
diff --git a/docs/editing_documentation.md b/docs/editing_documentation.md new file mode 100644 index 0000000000..4e2942a353 --- /dev/null +++ b/docs/editing_documentation.md @@ -0,0 +1,23 @@ +# Editing Documentation
+
+The documentation that you are reading now is an invaluable resource for newcomers and experienced users alike. Contributions to the documentation are very welcome.
+
+We currently use [MkDocs](https://www.mkdocs.org/) as our static site generator and [ReadTheDocs](https://readthedocs.org/) as our web host.
+
+To edit the documentation, first clone the repository locally, then install HELM from the repository by following the [Developer Setup](developer_setup.md) instructions. After that, install the MkDocs dependencies by running the following from the root of the repository:
+
+```sh
+pip install -r docs/requirements.txt
+```
+
+You should now be able to run MkDocs from the root of the repository:
+
+```sh
+mkdocs serve
+```
+
+Then navigate to http://localhost:8000/ to view your locally-built documentation.
+
+The source Markdown files for the documentation are stored in the `docs/` folder. By default, MkDocs watches the source directories for changes and automatically re-renders the web pages when it detects changes.
+
+We make heavy use of plugins and macros for auto-generating documentation from code and docstrings. For more information, please refer to the documentation for these plugins, e.g. [mkdocs-macros](https://mkdocs-macros-plugin.readthedocs.io/en/latest/), [mkdocstrings](https://mkdocstrings.github.io/python/) and [mkdocstrings-python](https://mkdocstrings.github.io/python/). diff --git a/docs/get_helm_rank.md b/docs/get_helm_rank.md index d4aed1e27e..f1d20635ea 100644 --- a/docs/get_helm_rank.md +++ b/docs/get_helm_rank.md @@ -1,6 +1,8 @@ -# Get Your Model's Leaderboard Rank +# Efficient-HELM
-This tutorial will show you how to locally add your model into the HELM leaderboard, with in 3 steps:
+This tutorial will show you how to locally add your model into the HELM Classic leaderboard at a fraction of the cost of performing a full run, using a technique described in [Efficient Benchmarking (of Language Models)](https://arxiv.org/pdf/2308.11696.pdf), a paper from IBM Research.
+
+**Warning** — The tutorial will currently only work for the HELM Classic leaderboard. Other leaderboards are not yet supported. ## Download HELM leaderboard results @@ -17,7 +19,7 @@ curl -O https://storage.googleapis.com/crfm-helm-public/benchmark_output/archive mkdir -p benchmark_output/runs/$LEADERBOARD_VERSION && unzip run_stats.zip -d benchmark_output/runs/$LEADERBOARD_VERSION ```
-now that the files are in your results directory, all HELM models will be shown in your UI along with your model.
+Now that the files are in your results directory, all HELM models will be shown in your UI along with your model. ## Run Efficient-HELM diff --git a/docs/heim.md b/docs/heim.md index f5aaba19bf..9b2317154f 100644 --- a/docs/heim.md +++ b/docs/heim.md @@ -1,16 +1,68 @@ -# HEIM Quick Start (text-to-image evaluation) +# HEIM (Text-to-image Model Evaluation)
-To run HEIM, follow these steps:
+**Holistic Evaluation of Text-To-Image Models (HEIM)** is an extension of the HELM framework for evaluating **text-to-image models**.
-1. Create a run specs configuration file.
For example, to evaluate -[Stable Diffusion v1.4](https://huggingface.co/CompVis/stable-diffusion-v1-4) against the -[MS-COCO scenario](https://github.com/stanford-crfm/heim/blob/main/src/helm/benchmark/scenarios/image_generation/mscoco_scenario.py), run:
+## Holistic Evaluation of Text-To-Image Models
+
+
+
+Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as
+input and generate images. As these models are widely used in real-world applications, there is an urgent need to
+comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text
+alignment and image quality. To address this limitation, we introduce a new benchmark,
+**Holistic Evaluation of Text-To-Image Models (HEIM)**.
+
+We identify 12 different aspects that are important in real-world model deployment, including:
+
+- image-text alignment
+- image quality
+- aesthetics
+- originality
+- reasoning
+- knowledge
+- bias
+- toxicity
+- fairness
+- robustness
+- multilinguality
+- efficiency
+
+By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark.
+Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all
+models across all aspects. Our results reveal that no single model excels in all aspects, with different models
+demonstrating strengths in different aspects.
+
+## References
+
+- [Leaderboard](https://crfm.stanford.edu/helm/heim/latest/)
+- [Paper](https://arxiv.org/abs/2311.04287)
+
+## Installation
+
+First, follow the [installation instructions](installation.md) to install the base HELM Python package.
+
+To install the additional dependencies to run HEIM, run:
+
+```sh
+pip install "crfm-helm[heim]"
```
-echo 'entries: [{description: "mscoco:model=huggingface/stable-diffusion-v1-4", priority: 1}]' > run_entries.conf
+
+Some models (e.g., DALLE-mini/mega) and metrics (`DetectionMetric`) require extra dependencies that are
+not available on PyPI. To install these dependencies, download and run the
+[extra install script](https://github.com/stanford-crfm/helm/blob/main/install-heim-extras.sh):
+
+```sh
+bash install-heim-extras.sh
```
-2. Run the benchmark with certain number of instances (e.g., 10 instances):
-`helm-run --conf-paths run_entries.conf --suite heim_v1 --max-eval-instances 10`
-Examples of run specs configuration files can be found [here](https://github.com/stanford-crfm/helm/tree/main/src/helm/benchmark/presentation).
-We used [this configuration file](https://github.com/stanford-crfm/helm/blob/main/src/helm/benchmark/presentation/run_entries_heim.conf)
-to produce results of the paper.
+## Getting Started
+
+The following is an example of evaluating [Stable Diffusion v1.4](https://huggingface.co/CompVis/stable-diffusion-v1-4) on the [MS-COCO scenario](https://github.com/stanford-crfm/heim/blob/main/src/helm/benchmark/scenarios/image_generation/mscoco_scenario.py) using 10 instances.
+
+```sh
+helm-run --run-entries mscoco:model=huggingface/stable-diffusion-v1-4 --suite my-heim-suite --max-eval-instances 10
+```
+
+## Reproducing the Leaderboard
+
+To reproduce the [entire HEIM leaderboard](https://crfm.stanford.edu/helm/heim/latest/), refer to the instructions for HEIM on the [Reproducing Leaderboards](reproducing_leaderboards.md) documentation.
diff --git a/docs/huggingface_models.md b/docs/huggingface_models.md index 4511cce2fa..f781e4ae2a 100644 --- a/docs/huggingface_models.md +++ b/docs/huggingface_models.md @@ -1,18 +1,25 @@ # Hugging Face Model Hub Integration -HELM can be used to evaluate `AutoModelForCausalLM` models (e.g. [`BioMedLM`](https://huggingface.co/stanford-crfm/BioMedLM)) on [Hugging Face Model Hub](https://huggingface.co/models). +HELM can be used to evaluate `AutoModelForCausalLM` models (e.g. [`BioMedLM`](https://huggingface.co/stanford-crfm/BioMedLM)) on [Hugging Face Model Hub](https://huggingface.co/models) or local disk. Note that only `AutoModelForCausalLM` models are supported; other classes such as `AutoModelForSeq2SeqLM` may be supported in the future. + +## Using `model_deployments.yaml` + +You can add Hugging Face models using the method discussed in [Adding New Models](adding_new_models.md). This can be used for both models on Hugging Face Hub and local disk. Please refer to that page for instructions for how to do so. + +## Using command-line flags + +In some cases, you can use command-line flags with `helm-run` to evaluating Hugging Face models. This provides a more convenient way to use Hugging Face models that does not require configuration files. To use `AutoModelForCausalLM` models from Hugging Face Model Hub, add the Hugging Face model IDs to the `--enable-huggingface-models` flags to `helm-run`. This will make the corresponding Hugging Face models available to use in your run spec descriptions. In the run spec description, use the Hugging Face model ID as the model name. To use a revision of a model other than the default main revision, append a `@` followed by the revision name to the model ID passed to the `--enable-huggingface-models` flag. -Current restrictions: +Current restrictions with command-line flags: -- Only `AutoModelForCausalLM` is supported; other classes such as `AutoModelForSeq2SeqLM` may be supported in the future. - Models without a namespace are not supported (e.g. `bert-base-uncased`). -- Models at local file paths are not supported. +- The model must have `model_max_length` set in the tokenizer configuration. -Examples: +Example model on Hugging Face Hub: ```bash # Run boolq on stanford-crfm/BioMedLM at the default main revision @@ -30,16 +37,13 @@ helm-run \ --max-eval-instances 10 ``` -To use Optimum Intel, add `--openvino` flag to `helm-run`. Optimum Intel provides a simple interface to optimize Transformer models and convert them to OpenVINO™ Intermediate Representation format to accelerate end-to-end pipelines on Intel® architectures using OpenVINO™ runtime. It runs the model on the CPU. 
- -Examples: +Example model on local disk: ```bash -# Run boolq on stanford-crfm/BioMedLM optimized by Optimum Intel OpenNIVO +# Run boolq on stanford-crfm/BioMedLM at the default main revision helm-run \ - --run-entries boolq:model=stanford-crfm/BioMedLM \ - --enable-huggingface-models stanford-crfm/BioMedLM \ + --run-entries boolq:model=your-org/your-model \ + --enable-local-huggingface-models path/to/your-org/your-model \ --suite v1 \ - --max-eval-instances 10 \ - --openvino + --max-eval-instances 10 ``` diff --git a/docs/index.md b/docs/index.md index 0071f90eed..5075f628af 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,6 +18,11 @@ To add new models and scenarios, refer to the Developer Guide's chapters: - [Developer Setup](developer_setup.md) - [Code Structure](code.md) +## Papers -We also support evaluating text-to-image models as introduced in **Holistic Evaluation of Text-to-Image Models (HEIM)** -([paper](https://arxiv.org/abs/2311.04287), [website](https://crfm.stanford.edu/heim/latest)). +This repository contains code used to produce results for the following papers: + +- **Holistic Evaluation of Vision-Language Models (VHELM)** - [paper](https://arxiv.org/abs/2410.07112), [leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/vhelm/) +- **Holistic Evaluation of Text-To-Image Models (HEIM)** - [paper](https://arxiv.org/abs/2311.04287), [leaderboard](https://crfm.stanford.edu/helm/heim/latest/), [documentation](https://crfm-helm.readthedocs.io/en/latest/heim/) + +The HELM Python package can be used to reproduce the published model evaluation results from these papers. To get started, refer to the documentation links above for the corresponding paper, or the [main Reproducing Leaderboards documentation](https://crfm-helm.readthedocs.io/en/latest/reproducing_leaderboards/). \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index 21e1d3358c..3af5d55f6d 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -35,18 +35,9 @@ Within this virtual environment, run: pip install crfm-helm ``` -### For HEIM (text-to-image evaluation) +## Install Multimodal Support -To install the additional dependencies to run HEIM, run: +Additional steps are required for multimodal evaluations: -``` -pip install "crfm-helm[heim]" -``` - -Some models (e.g., DALLE-mini/mega) and metrics (`DetectionMetric`) require extra dependencies that are -not available on PyPI. To install these dependencies, download and run the -[extra install script](https://github.com/stanford-crfm/helm/blob/main/install-heim-extras.sh): - -``` -bash install-heim-extras.sh -``` +- **HEIM (Text-to-image Model Evaluation)** - to install the additional dependencies to run HEIM (text-to-image evaluation), refer to the [HEIM documentation](heim.md). +- **VHELM (Vision-Language Models)** - To install the additional dependencies to run VHELM (Vision-Language Models), refer to the [VHELM documentation](vhelm.md). 
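As an optional sanity check after installing the base package and any extras, the short snippet below prints which `crfm-helm` version is visible to the active environment; it uses only the Python standard library and is a sketch rather than an official verification step:

```python
# Optional check that crfm-helm is installed in the active environment.
from importlib.metadata import PackageNotFoundError, version

try:
    print("crfm-helm version:", version("crfm-helm"))
except PackageNotFoundError:
    print("crfm-helm is not installed in this environment")
```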
diff --git a/docs/mkdocs_macros.py b/docs/mkdocs_macros.py index 3bd3d05c8e..12a669a191 100644 --- a/docs/mkdocs_macros.py +++ b/docs/mkdocs_macros.py @@ -1,6 +1,6 @@ from typing import Dict, List, Type -from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA, TEXT_MODEL_TAG, CODE_MODEL_TAG, ModelMetadata +from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA, TEXT_MODEL_TAG, CODE_MODEL_TAG, DEPRECATED_MODEL_TAG, ModelMetadata from helm.benchmark.run_expander import RUN_EXPANDERS, RunExpander @@ -11,6 +11,8 @@ def models_by_organization_with_tag(tag: str) -> Dict[str, List[ModelMetadata]]: result: Dict[str, List[ModelMetadata]] = {} for model_metadata in ALL_MODELS_METADATA: + if DEPRECATED_MODEL_TAG in model_metadata.tags: + continue if tag not in model_metadata.tags: continue if model_metadata.creator_organization == "simple": diff --git a/docs/models.md b/docs/models.md index ce4ed6a06f..f87d62d667 100644 --- a/docs/models.md +++ b/docs/models.md @@ -18,6 +18,47 @@ {% endfor %} {% endfor %} -## HEIM (text-to-image evaluation) +## Vision-Language Models -For a list of text-to-image models, please visit the [models page](https://crfm.stanford.edu/heim/latest/?models) of the HEIM results website. +{% for organization, models in models_by_organization_with_tag("VISION_LANGUAGE_MODEL_TAG").items() %} + +### {{ organization }} + +{% for model in models %} + +#### {{ model.display_name }} — `{{ model.name }}` + +{{ model.description }} + +{% endfor %} +{% endfor %} + +## Text-to-image Models + +{% for organization, models in models_by_organization_with_tag("TEXT_TO_IMAGE_MODEL_TAG").items() %} + +### {{ organization }} + +{% for model in models %} + +#### {{ model.display_name }} — `{{ model.name }}` + +{{ model.description }} + +{% endfor %} +{% endfor %} + +## Audio-Language Models + +{% for organization, models in models_by_organization_with_tag("AUDIO_LANGUAGE_MODEL_TAG").items() %} + +### {{ organization }} + +{% for model in models %} + +#### {{ model.display_name }} — `{{ model.name }}` + +{{ model.description }} + +{% endfor %} +{% endfor %} \ No newline at end of file diff --git a/docs/proxy-server.md b/docs/proxy_server.md similarity index 88% rename from docs/proxy-server.md rename to docs/proxy_server.md index 818b98b72f..42dadf9b84 100644 --- a/docs/proxy-server.md +++ b/docs/proxy_server.md @@ -1,4 +1,6 @@ -# Proxy Access to Language Models +# Proxy Server + +**Warning** — The document is stale. The information below may be outdated and incorrect. Please proceed with caution! We provide a single unified entry point into accessing large language models (e.g., GPT-3, Jurassic). This provides both a web interface and a REST API. diff --git a/docs/quick_start.md b/docs/quick_start.md index 3a1fb2f561..c9785256f1 100644 --- a/docs/quick_start.md +++ b/docs/quick_start.md @@ -3,24 +3,14 @@ Run the following: ``` -# Create a run specs configuration -echo 'entries: [{description: "mmlu:subject=philosophy,model=openai/gpt2", priority: 1}]' > run_entries.conf - # Run benchmark -helm-run --conf-paths run_entries.conf --suite v1 --max-eval-instances 10 +helm-run --run-entries mmlu:subject=philosophy,model=openai/gpt2 --suite my-suite --max-eval-instances 10 # Summarize benchmark results -helm-summarize --suite v1 +helm-summarize --suite my-suite # Start a web server to display benchmark results -helm-server +helm-server --suite my-suite ``` Then go to http://localhost:8000/ in your browser. 
- - -## Next steps - -Click [here](get_helm_rank.md) to find out how to run the full benchmark and get your model's leaderboard rank. - -For the quick start page for HEIM, visit [here](heim.md). \ No newline at end of file diff --git a/docs/reproducing_leaderboards.md b/docs/reproducing_leaderboards.md index 77b517911b..72b0336edf 100644 --- a/docs/reproducing_leaderboards.md +++ b/docs/reproducing_leaderboards.md @@ -94,6 +94,16 @@ export NUM_EVAL_INSTANCES=1000 export PRIORITY=2 ``` +### VHELM >=v2.0.0 + +```bash +export RUN_ENTRIES_CONF_PATH=run_entries_vhelm.conf +export SCHEMA_PATH=schema_vhelm.yaml +export NUM_TRAIN_TRIALS=1 +export NUM_EVAL_INSTANCES=1000 +export PRIORITY=2 +``` + ### AIR-Bench ```bash diff --git a/docs/run_entries_configuration_files.md b/docs/run_entries_configuration_files.md index 09cb069ea6..22d92a917e 100644 --- a/docs/run_entries_configuration_files.md +++ b/docs/run_entries_configuration_files.md @@ -28,7 +28,7 @@ You would then use this file with `helm-run` with this command: helm-run --conf-file tutorial_run_entries.conf --suite tutorial --max-eval-instances 10 ``` -## Model run expander wildcards +## Model Run Expander Wildcards It is very common to use run entries configuration file with a **model run expander wildcards** e.g. `model=text`. For instance, diff --git a/docs/tutorial.md b/docs/tutorial.md index c000825d9a..77620560e7 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -8,34 +8,26 @@ We will run two runs using the `mmlu` scenario on the `openai/gpt2` model. The ` `helm-run` is a command line tool for running benchmarks. -To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describes the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=openai/gpt2` (for anatomy) and `mmlu:subject=philosophy,model=openai/gpt2` (for philosophy). +To run this benchmark using the HELM command-line tools, we need to specify **run entries** that describes the desired runs. For this example, the run entries are `mmlu:subject=anatomy,model=openai/gpt2` (for anatomy) and `mmlu:subject=philosophy,model=openai/gpt2` (for philosophy). -Next, we need to create a **run spec configuration file** containing these run spec descriptions. A run spec configuration file is a text file containing `RunEntries` serialized to JSON, where each entry in `RunEntries` contains a run spec description. The `description` field of each entry should be a **run spec description**. Create a text file named `run_entries.conf` with the following contents: +We will now use `helm-run` to execute the runs. Run this command: -``` -entries: [ - {description: "mmlu:subject=anatomy,model=openai/gpt2", priority: 1}, - {description: "mmlu:subject=philosophy,model=openai/gpt2", priority: 1}, -] -``` - -We will now use `helm-run` to execute the runs that have been specified in this run spec configuration file. Run this command: - -``` -helm-run --conf-paths run_entries.conf --suite v1 --max-eval-instances 10 +```sh +helm-run --run-entries mmlu:subject=anatomy,model=openai/gpt2 mmlu:subject=philosophy,model=openai/gpt2 --suite my-suite --max-eval-instances 10 ``` -The meaning of the additional arguments are as follows: +The meaning of the arguments are as follows: +- `--run-entries` specifies the run entries from the desired runs. - `--suite` specifies a subdirectory under the output directory in which all the output will be placed. -- `--max-eval-instances` limits evaluation to only the first *N* inputs (i.e. 
instances) from the benchmark.
+- `--max-eval-instances` limits evaluation to only *N* instances (i.e. items) from the benchmark, using a randomly shuffled order of instances. `helm-run` creates an environment directory and an output directory by default. - The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory. - The output directory is `benchmark_output/` by default and can be set using `--output-path`.
-After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`. Note that the names of these sub-directories are based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
+After running this command, navigate to the `benchmark_output/runs/my-suite/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`. Note that the names of these sub-directories are based on the run entries we used earlier, but with `/` replaced with `_`. Each output sub-directory will contain several JSON files that were generated during the corresponding run: @@ -45,60 +37,35 @@ Each output sub-directory will contain several JSON files that were generated du - `per_instance_stats.json` contains a serialized list of `PerInstanceStats`, which contains the statistics produced for the metrics for each instance (i.e. input). - `stats.json` contains a serialized list of `PerInstanceStats`, which contains the statistics produced for the metrics, aggregated across all instances (i.e. inputs).
-`helm-run` provides additional arguments that can be used to filter out `--models-to-run`, `--groups-to-run` and `--priority`. It can be convenient to create a large `run_entries.conf` file containing every run spec description of interest, and then use these flags to filter down the RunSpecs to actually run. As an example, the main `run_specs.conf` file used for the HELM benchmarking paper can be found [here](https://github.com/stanford-crfm/helm/blob/main/src/helm/benchmark/presentation/run_specs.conf).
-
-**Using model or model_deployment:** Some models have several deployments (for example `eleutherai/gpt-j-6b` is deployed under `huggingface/gpt-j-6b`, `gooseai/gpt-j-6b` and `together/gpt-j-6b`). Since the results can differ depending on the deployment, we provide a way to specify the deployment instead of the model. Instead of using `model=eleutherai/gpt-j-6b`, use `model_deployment=huggingface/gpt-j-6b`. If you do not, a deployment will be arbitrarily chosen. This can still be used for models that have a single deployment and is a good practice to follow to avoid any ambiguity. ## Using `helm-summarize` The `helm-summarize` command reads the output files of `helm-run` and computes aggregate statistics across runs.
Run the following:
-```
-helm-summarize --suite v1
+```sh
+helm-summarize --suite my-suite
```
-This reads the pre-existing files in `benchmark_output/runs/v1/` that were written by `helm-run` previously, and writes the following new files back to `benchmark_output/runs/v1/`:
+This reads the pre-existing files in `benchmark_output/runs/my-suite/` that were written by `helm-run` previously, and writes the following new files back to `benchmark_output/runs/my-suite/`: - `summary.json` contains a serialized `ExecutiveSummary` with a date and suite name.
-- `run_specs.json` contains the run spec descriptions for all the runs.
+- `run_specs.json` contains the run entries for all the runs. - `runs.json` contains a serialized list of `Run`, which contains the run path, run spec, adapter spec, and statistics for each run. - `groups.json` contains a serialized list of `Table`, each containing information about groups in a group category. - `groups_metadata.json` contains a list of all the groups along with a human-readable description and a taxonomy.
-Additionally, for each group and group-relevant metric, it will output a pair of files: `benchmark_output/runs/v1/groups/latex/_.tex` and `benchmark_output/runs/v1/groups/json/_.json`. These files contain the statistics for that metric from each run within the group.
-
-
+Additionally, for each group and group-relevant metric, it will output a pair of files: `benchmark_output/runs/my-suite/groups/latex/_.tex` and `benchmark_output/runs/my-suite/groups/json/_.json`. These files contain the statistics for that metric from each run within the group. ## Using `helm-server` Finally, the `helm-server` command launches a web server to visualize the output files of `helm-run` and `helm-summarize`. Run:
+```sh
+helm-server --suite my-suite
```
-helm-server
-```
-
-Open a browser and go to http://localhost:8000/ to view the visualization. You should see a view similar to the [live website for the paper](https://crfm.stanford.edu/helm/v1.0/), but for the data from your benchmark runs. The website has three main sections:
-
-- **Models** contains a list of available models.
-- **Scenarios** contains a list of available scenarios.
-- **Results** contains results from the runs, organized into groups and categories of groups.
-- **Raw Runs** contains a searchable list of runs.
-## Other Tips
+Open a browser and go to http://localhost:8000/ to view the visualization. You should see a view similar to the [live website for the paper](https://crfm.stanford.edu/helm/classic/latest/), but for the data from your benchmark runs. The website has the following sections accessible from the top menu bar:
-- The suite name can be used as a versioning mechanism to separate runs using different versions of scenarios or models.
-- Tools such as [`jq`](https://stedolan.github.io/jq/) are useful for examining the JSON output files on the command line.
+- **Leaderboards** contains the leaderboards with aggregate metrics.
+- **Models** contains a list of models and their descriptions.
+- **Scenarios** contains a list of scenarios and their descriptions.
+- **Predictions** contains a searchable list of runs. diff --git a/docs/vhelm.md b/docs/vhelm.md new file mode 100644 index 0000000000..59fb31fdaf --- /dev/null +++ b/docs/vhelm.md @@ -0,0 +1,44 @@ +# VHELM (Vision-Language Models)
+
+**Holistic Evaluation of Vision-Language Models (VHELM)** is an extension of the HELM framework for evaluating **Vision-Language Models (VLMs)**.
+ +VHELM aggregates various datasets to cover one or more of the 9 aspects: visual perception, bias, fairness, knowledge, multilinguality, reasoning, robustness, safety, and toxicity. In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. + +## References + +- [Leaderboard](https://crfm.stanford.edu/helm/vhelm/v2.0.1/) +- Paper (TBD) + +## Installation + +First, follow the [installation instructions](installation.md) to install the base HELM Python page. + +To install the additional dependencies to run VHELM, run: + +```sh +pip install "crfm-helm[vlm]" +``` + +## Quick Start + +The following is an example of evaluating `openai/gpt-4o-mini-2024-07-18` on 10 instance from the Accounting subset of MMMU. + +```sh +# Download schema_vhelm.yaml +wget https://raw.githubusercontent.com/stanford-crfm/helm/refs/heads/main/src/helm/benchmark/static/schema_vhelm.yaml + +# Run benchmark +helm-run --run-entries mmmu:subject=Accounting,model=openai/gpt-4o-mini-2024-07-18 --suite my-vhelm-suite --max-eval-instances 10 + +# Summarize benchmark results +helm-summarize --suite my-vhelm-suite --schema-path schema_vhelm.yaml + +# Start a web server to display benchmark results +helm-server --suite my-vhelm-suite +``` + +Then go to http://localhost:8000/ in your browser. + +## Reproducing the Leaderboard + +To reproduce the [entire VHELM leaderboard](https://crfm.stanford.edu/helm/vhelm/latest/), refer to the instructions for VHELM on the [Reproducing Leaderboards](reproducing_leaderboards.md) documentation. diff --git a/helm-frontend/package.json b/helm-frontend/package.json index 08fcd90d81..4339eae32d 100644 --- a/helm-frontend/package.json +++ b/helm-frontend/package.json @@ -25,7 +25,7 @@ "react-router-dom": "^6.14.2", "react-spinners": "^0.13.8", "recharts": "^2.7.2", - "serve-static": "^1.16.0" + "serve-static": "^1.16.2" }, "devDependencies": { "@testing-library/jest-dom": "^5.17.0", diff --git a/helm-frontend/project_metadata.json b/helm-frontend/project_metadata.json index 779fac7ec0..dfaf297d16 100644 --- a/helm-frontend/project_metadata.json +++ b/helm-frontend/project_metadata.json @@ -3,7 +3,7 @@ "title": "Lite", "description": "Lightweight, broad evaluation of the capabilities of language models using in-context learning", "id": "lite", - "releases": ["v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] + "releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] }, { "title": "Classic", @@ -27,24 +27,30 @@ "title": "MMLU", "description": "Massive Multitask Language Understanding (MMLU) evaluations using standardized prompts", "id": "mmlu", - "releases": ["v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] + "releases": ["v1.10.0", "v1.9.0", "v1.8.0", "v1.7.0", "v1.6.0", "v1.5.0", "v1.4.0", "v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"] }, { "title": "VHELM", "description": "Holistic Evaluation of Vision-Language Models", "id": "vhelm", - "releases": ["v1.0.0"] + "releases": ["v2.1.0", "v2.0.1", "v2.0.0", "v1.0.0"] }, { "title": "Image2Struct", "description": "Evaluations of Vision-Language Models on extracting structured information from images", "id": "image2struct", - "releases": ["v1.0.1", "v1.0.0"] + "releases": ["v1.0.2", "v1.0.1", "v1.0.0"] 
   },
   {
     "title": "AIR-Bench",
     "description": "Safety benchmark based on emerging government regulations and company policies",
     "id": "air-bench",
+    "releases": ["v1.3.0", "v1.2.0", "v1.1.0", "v1.0.0"]
+  },
+  {
+    "title": "Safety",
+    "description": "Safety benchmark that aggregates popular safety benchmarks across 6 risk vectors",
+    "id": "safety",
     "releases": ["v1.0.0"]
   },
   {
diff --git a/helm-frontend/public/config.js b/helm-frontend/public/config.js
index 0d0df029cb..bee793a085 100644
--- a/helm-frontend/public/config.js
+++ b/helm-frontend/public/config.js
@@ -1,4 +1,4 @@
-window.RELEASE = "v1.0.0";
+window.RELEASE = "v1.9.0";
 window.BENCHMARK_OUTPUT_BASE_URL =
   "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/";
 window.PROJECT_ID = "lite";
diff --git a/helm-frontend/src/assets/helm-safety.png b/helm-frontend/src/assets/helm-safety.png
new file mode 100644
index 0000000000..ada2adc817
Binary files /dev/null and b/helm-frontend/src/assets/helm-safety.png differ
diff --git a/helm-frontend/src/components/Hero.tsx b/helm-frontend/src/components/Hero.tsx
index b6f9bcfa11..3587d356f5 100644
--- a/helm-frontend/src/components/Hero.tsx
+++ b/helm-frontend/src/components/Hero.tsx
@@ -15,34 +15,25 @@ export default function Hero() {
       {/* Container for Image and Leaderboard */}
-
+
{/* Image section */}
HELM Hero
{/* Leaderboard section */} -
-
- -
- - - -
+
+ +
+ + +
diff --git a/helm-frontend/src/components/InstanceData.tsx b/helm-frontend/src/components/InstanceData.tsx index 36d763eb52..3d51352761 100644 --- a/helm-frontend/src/components/InstanceData.tsx +++ b/helm-frontend/src/components/InstanceData.tsx @@ -20,14 +20,8 @@ export default function InstanceData({ predictions, metricFieldMap, }: Props) { - const renderInstanceId = (instance: Instance): string => { - return instance.perturbation === undefined - ? `Instance id: ${instance.id} [split: ${instance.split}]` - : `Instance id: ${instance.id} [split: ${instance.split}][perturbation: ${instance.perturbation.name}]`; - }; return ( -
-

{renderInstanceId(instance)}

+

Input

{instance.input.multimedia_content !== undefined ? ( ([]); + const [displayPredictionsMap, setDisplayPredictionsMap] = useState< + undefined | Record> + >(); + const [displayRequestsMap, setDisplayRequestsMap] = useState< + undefined | Record> + >(); + const [currentInstancesPage, setCurrentInstancesPage] = useState(1); + + useEffect(() => { + const controller = new AbortController(); + async function fetchData() { + const signal = controller.signal; + + const [instancesResp, displayPredictions, displayRequests] = + await Promise.all([ + getInstances(runName, signal, suite), + getDisplayPredictionsByName(runName, signal, suite), + getDisplayRequestsByName(runName, signal, suite), + ]); + setInstances(instancesResp); + + const tempDisplayRequestsMap: { + [key: string]: { [key: string]: DisplayRequest[] }; + } = {}; + displayRequests.forEach((displayRequest) => { + const instanceId = displayRequest.instance_id; + const perturbationName = displayRequest.perturbation?.name || ""; + if (tempDisplayRequestsMap[instanceId] === undefined) { + tempDisplayRequestsMap[instanceId] = {}; + } + if ( + tempDisplayRequestsMap[instanceId][perturbationName] === undefined + ) { + tempDisplayRequestsMap[instanceId][perturbationName] = []; + } + tempDisplayRequestsMap[instanceId][perturbationName].push( + displayRequest, + ); + }); + setDisplayRequestsMap(tempDisplayRequestsMap); + + const tempDisplayPredictionsMap: { + [key: string]: { [key: string]: DisplayPrediction[] }; + } = {}; + displayPredictions.forEach((displayPrediction) => { + const instanceId = displayPrediction.instance_id; + const perturbationName = displayPrediction.perturbation?.name || ""; + if (tempDisplayPredictionsMap[instanceId] === undefined) { + tempDisplayPredictionsMap[instanceId] = {}; + } + if ( + tempDisplayPredictionsMap[instanceId][perturbationName] === undefined + ) { + tempDisplayPredictionsMap[instanceId][perturbationName] = []; + } + tempDisplayPredictionsMap[instanceId][perturbationName].push( + displayPrediction, + ); + }); + setDisplayPredictionsMap(tempDisplayPredictionsMap); + } + + void fetchData(); + + return () => controller.abort(); + }, [runName, suite]); + + const pagedInstances = instances.slice( + (currentInstancesPage - 1) * INSTANCES_PAGE_SIZE, + (currentInstancesPage - 1) * INSTANCES_PAGE_SIZE + INSTANCES_PAGE_SIZE, + ); + const totalInstancesPages = Math.ceil(instances.length / INSTANCES_PAGE_SIZE); + + // Handle scrolling to anchored instance + useEffect(() => { + const anchoredInstance = searchParams.get("instance"); + if ( + anchoredInstance && + !window.helmHasScrolledToInstance && + pagedInstances.length > 0 + ) { + // Find the index of the anchored instance + const instanceIndex = pagedInstances.findIndex( + (i) => i.id === anchoredInstance, + ); + if (instanceIndex === -1) return; + + // Wait for the DOM to be updated with the correct page + requestAnimationFrame(() => { + const element = document.getElementById(`instance-${anchoredInstance}`); + if (element) { + element.scrollIntoView({ behavior: "smooth" }); + } + }); + + window.helmHasScrolledToInstance = true; + } + }, [searchParams, currentInstancesPage, setSearchParams, pagedInstances]); + + const renderInstanceId = (instance: Instance): string => { + return instance.perturbation === undefined + ? 
`Instance id: ${instance.id} [split: ${instance.split}]` + : `Instance id: ${instance.id} [split: ${instance.split}][perturbation: ${instance.perturbation.name}]`; + }; + + if (displayPredictionsMap === undefined || displayRequestsMap === undefined) { + return ; + } + + return ( + <> +
+ {pagedInstances.map((instance, idx) => ( +
+
+

{renderInstanceId(instance)}

+ +
+ +
+ ))} +
+ { + const nextInstancePage = Math.min( + currentInstancesPage + 1, + totalInstancesPages, + ); + setCurrentInstancesPage(nextInstancePage); + searchParams.set("instancesPage", String(nextInstancePage)); + setSearchParams(searchParams); + }} + onPrevPage={() => { + const prevInstancePage = Math.max(currentInstancesPage - 1, 1); + setCurrentInstancesPage(prevInstancePage); + searchParams.set("instancesPage", String(prevInstancePage)); + setSearchParams(searchParams); + }} + currentPage={currentInstancesPage} + totalPages={totalInstancesPages} + /> + + ); +} diff --git a/helm-frontend/src/components/Landing/Image2StructLanding.tsx b/helm-frontend/src/components/Landing/Image2StructLanding.tsx index d4315d6f67..5686b26fc2 100644 --- a/helm-frontend/src/components/Landing/Image2StructLanding.tsx +++ b/helm-frontend/src/components/Landing/Image2StructLanding.tsx @@ -10,7 +10,10 @@ export default function Image2StructLanding() { Extracting Structured Information from Images
- + Paper We introduce 3 tasks:

  • - LaTex: equations, tables, plots and algorithms form ArXiV papers + LaTex: equations, tables, plots and algorithms from ArXiV papers.
  • - Webpages: pages from GitHub written in HTML, CSS and Javascript, + Webpages: pages from GitHub written in HTML, CSS and Javascript. ...
  • -
  • - Music sheets: crops of measures from music sheets from IMSLP -
  • +
  • Music sheets: crops of measures from IMSLP music sheets.
diff --git a/helm-frontend/src/components/Landing/SafetyLanding.tsx b/helm-frontend/src/components/Landing/SafetyLanding.tsx index 5e2d43be45..69714378ef 100644 --- a/helm-frontend/src/components/Landing/SafetyLanding.tsx +++ b/helm-frontend/src/components/Landing/SafetyLanding.tsx @@ -1,4 +1,5 @@ import MiniLeaderboard from "@/components/MiniLeaderboard"; +import helmSafety from "@/assets/helm-safety.png"; export default function SafetyLanding() { return ( @@ -7,6 +8,12 @@ export default function SafetyLanding() {
+ Logo

Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized @@ -20,12 +27,12 @@ export default function SafetyLanding() { closed models.

- {/* Blog Post - */} + Full Leaderboard diff --git a/helm-frontend/src/components/MediaObjectDisplay.tsx b/helm-frontend/src/components/MediaObjectDisplay.tsx index 469fcd1bd7..a92b131886 100644 --- a/helm-frontend/src/components/MediaObjectDisplay.tsx +++ b/helm-frontend/src/components/MediaObjectDisplay.tsx @@ -21,6 +21,20 @@ export default function MediaObjectDisplay({ mediaObject }: Props) {
); + } else if (mediaObject.content_type.includes("audio")) { + if (mediaObject.location === undefined) { + return null; + } + const url = getBenchmarkEndpoint( + mediaObject.location + .replace("benchmark_output/", "") + .replace("prod_env/", "../"), + ); + return ( +
+ +
+ ); } else { if ( mediaObject.text && diff --git a/helm-frontend/src/components/NavDropdown.tsx b/helm-frontend/src/components/NavDropdown.tsx index d1b8ae07aa..486edd593e 100644 --- a/helm-frontend/src/components/NavDropdown.tsx +++ b/helm-frontend/src/components/NavDropdown.tsx @@ -9,6 +9,22 @@ function NavDropdown() { ProjectMetadata | undefined >(); + useEffect(() => { + if ( + currProjectMetadata && + currProjectMetadata.title && + currProjectMetadata.title !== "All Leaderboards" + ) { + const titlePrefix = + currProjectMetadata.title === "Lite" || + currProjectMetadata.title === "Classic" + ? "HELM " + currProjectMetadata.title + : currProjectMetadata.title; + document.title = + titlePrefix + " - Holistic Evaluation of Language Models (HELM)"; + } + }, [currProjectMetadata]); + useEffect(() => { fetch( "https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json", diff --git a/helm-frontend/src/components/ReleaseDropdown.tsx b/helm-frontend/src/components/ReleaseDropdown.tsx index 7e648b56ea..6258de2ee3 100644 --- a/helm-frontend/src/components/ReleaseDropdown.tsx +++ b/helm-frontend/src/components/ReleaseDropdown.tsx @@ -1,3 +1,4 @@ +import { Badge } from "@tremor/react"; import { useEffect, useState } from "react"; import getReleaseSummary from "@/services/getReleaseSummary"; import ReleaseSummary from "@/types/ReleaseSummary"; @@ -40,13 +41,6 @@ function ReleaseDropdown() { }); }, []); - function getReleases(): string[] { - return currProjectMetadata !== undefined && - currProjectMetadata.releases !== undefined - ? currProjectMetadata.releases - : ["v1.0.0"]; - } - useEffect(() => { const controller = new AbortController(); async function fetchData() { @@ -58,20 +52,34 @@ function ReleaseDropdown() { return () => controller.abort(); }, []); - const releases = getReleases(); + const releases = + currProjectMetadata !== undefined && + currProjectMetadata.releases !== undefined + ? currProjectMetadata.releases + : ["v1.0.0"]; - if (!summary.release && !summary.suite) { + const currentVersion = summary.release || summary.suite || null; + + if (!currentVersion) { return null; } - const releaseInfo = `Release ${summary.release || summary.suite} (${ - summary.date - })`; + const releaseInfo = `Release ${currentVersion} (${summary.date})`; if (releases.length <= 1) { return
{releaseInfo}
; } + const indexOfCurrentVersion = releases.indexOf(currentVersion); + + const badge = + indexOfCurrentVersion < 0 ? ( + preview + ) : indexOfCurrentVersion === 0 ? ( + latest + ) : ( + stale + ); return (
- {releaseInfo}  + {releaseInfo} {badge}