# setup.cfg

# Distribution metadata for the crfm-helm package.
[metadata]
name = crfm-helm
version = 0.5.4
author = Stanford CRFM
author_email = contact-crfm@stanford.edu
description = Benchmark for language models
# `file:` reads the long description from the listed files, in the order given.
long_description = file: README.md, docs/tutorial.md
long_description_content_type = text/markdown
keywords = language models benchmarking
license = Apache License 2.0
# NOTE(review): comments under [options.extras_require] refer to Python 3.12
# as a supported version, but no 3.12 classifier is listed here - confirm.
classifiers =
    Programming Language :: Python :: 3
    Programming Language :: Python :: 3 :: Only
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
    License :: OSI Approved :: Apache Software License
url = https://github.com/stanford-crfm/helm

# Build options: packages live under src/ and are discovered automatically
# (see [options.packages.find]).
[options]
python_requires = >=3.9
package_dir =
    =src
packages = find:
zip_safe = False
include_package_data = True

# Dependencies installed with every crfm-helm install. Optional groups are
# declared in [options.extras_require].
install_requires =
    # Common
    cattrs~=22.2
    dacite~=1.6
    importlib-resources~=5.10
    Mako~=1.2
    # NOTE(review): `~=1.26` already implies <2.0, so the `<3` bound below never
    # takes effect. Use `>=1.26,<3` instead if NumPy 2.x is meant to be allowed.
    numpy~=1.26,<3
    pandas~=2.0
    pyhocon~=0.3.59
    retrying~=1.3
    spacy~=3.5
    tqdm~=4.64
    zstandard~=0.18.0
    # sqlitedict==2.0.0 is slow! https://github.com/RaRe-Technologies/sqlitedict/issues/152
    # Require >=2.1.0 so that the slow 2.0.0 release is never selected.
    sqlitedict>=2.1.0,<3.0
    bottle~=0.12.23

    # Basic Scenarios
    datasets~=2.17
    pyarrow>=11.0.0  # Pinned transitive dependency for datasets; workaround for #1026
    pyarrow-hotfix~=0.6  # Hotfix for CVE-2023-47248

    # Basic metrics
    nltk~=3.7,!=3.9.0  # Cannot use 3.9.0 due to https://github.com/nltk/nltk/issues/3308
    rouge-score~=0.1.2
    scipy~=1.10
    uncertainty-calibration~=0.1.4
    scikit-learn~=1.1

    # Models and Metrics Extras
    transformers~=4.40  # For anthropic_client, vision_language.huggingface_vlm_client, huggingface_client, huggingface_tokenizer, test_openai_token_cost_estimator, model_summac (via summarization_metrics)
    # TODO: Upgrade torch - we need > 2.0.0 for newer versions of transformers
    torch>=1.13.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)
    torchvision>=0.14.1,<3.0.0  # For huggingface_client, yalm_tokenizer, model_summac (via summarization_metrics)

# Optional dependency groups. Each key is an extra that can be requested as
# crfm-helm[<key>]; an extra may include another by listing crfm-helm[<other>].
[options.extras_require]
proxy-server =
    gunicorn>=20.1

human-evaluation =
    scaleapi~=2.13.0
    surge-api~=1.1.0

scenarios =
    gdown~=5.1  # For disinformation_scenario, med_mcqa_scenario, med_qa_scenario: used by ensure_file_downloaded()
    sympy~=1.11.1  # For numeracy_scenario
    xlrd~=2.0.1  # For ice_scenario: used by pandas.read_excel()

metrics =
    google-api-python-client~=2.64  # For perspective_api_client via toxicity_metrics
    numba~=0.56  # For copyright_metrics
    pytrec_eval==0.5  # For ranking_metrics
    sacrebleu~=2.2.1  # For disinformation_metrics, machine_translation_metrics
    langdetect~=1.0.9  # For ifeval_metrics
    immutabledict~=4.2.0  # For ifeval_metrics
    gradio_client~=1.3  # For bigcodebench_metrics

summarization =
    summ-eval~=0.892  # For summarization_metrics

plots =
    colorcet~=3.0.1
    matplotlib~=3.6.0
    seaborn~=0.11.0

decodingtrust =
    fairlearn~=0.9.0

slurm =
    simple-slurm~=0.2.6

cleva =
    unidecode~=1.3
    pypinyin~=0.49.0
    jieba~=0.42.1
    opencc~=1.1
    langdetect~=1.0

# Includes the `accelerate` extra defined under "Model extras".
images =
    crfm-helm[accelerate]
    pillow~=10.2

mongo =
    pymongo~=4.2

unitxt =
    evaluate~=0.4.1

# Not part of the `all` extra; see the note there about pyonmttok and Python 3.12.
bhasa = 
    pythainlp==5.0.0
    pyonmttok==1.37.0
    sacrebleu~=2.2.1

# Model extras
# Every extra in this group is also pulled in by the aggregate `models` extra.
accelerate =
    accelerate~=0.25

aleph-alpha =
    aleph-alpha-client~=2.14.0
    tokenizers>=0.13.3

allenai =
    ai2-olmo~=0.2

amazon = 
    boto3~=1.28.57
    awscli~=1.29.57
    botocore~=1.31.57

anthropic =
    anthropic~=0.17,<0.39  # TODO(#3212): Limit anthropic to >=0.39 after resolving #3212.
    websocket-client~=1.3.2  # For legacy stanford-online-all-v4-s3

cohere =
    cohere~=5.3

mistral =
    mistralai~=1.1

openai =
    openai~=1.52
    tiktoken~=0.7
    pydantic~=2.0  # For model_dump(mode="json") - openai only requires pydantic>=1.9.0

google =
    google-cloud-aiplatform~=1.48

together =
    together~=1.1

yandex =
    sentencepiece~=0.2.0

# Aggregate extra: installs the model-related extras listed below.
models =
    # TODO: add crfm-helm[ai21] once an `ai21` extra is defined in this file;
    # referencing an undefined extra only produces an installer warning.
    crfm-helm[accelerate]
    crfm-helm[aleph-alpha]
    crfm-helm[allenai]
    crfm-helm[amazon]
    crfm-helm[anthropic]
    crfm-helm[cohere]
    crfm-helm[google]
    crfm-helm[mistral]
    crfm-helm[openai]
    crfm-helm[reka]
    crfm-helm[together]
    crfm-helm[yandex]
    crfm-helm[ibm-enterprise-scenarios]

# Also included by the `models` and `vlm` extras.
reka = 
    reka-api~=2.0.0

# Vision-language extra: combines the openai, reka, images and image2struct
# extras with the model- and metric-specific packages listed below.
vlm =
    crfm-helm[openai]

    # For OpenFlamingo
    einops~=0.7.0
    einops-exts~=0.0.4
    open-clip-torch~=2.24

    # For IDEFICS
    torch~=2.1

    # For Qwen: https://github.com/QwenLM/Qwen-VL/blob/master/requirements.txt
    transformers_stream_generator~=0.0.4
    scipy~=1.10
    torchvision>=0.14.1,<3.0.0

    # For Reka AI
    crfm-helm[reka]
    
    # VLM scenarios
    crfm-helm[images]
    crfm-helm[image2struct]

    # For metrics
    pycocoevalcap~=1.2

# Also included by the `models` extra.
ibm-enterprise-scenarios = 
    openpyxl~=3.1

image2struct =
    crfm-helm[images]

    # Latex
    # You will need to install LaTeX separately.
    # You can run `sudo apt-get install texlive-full` on Ubuntu.
    latex~=0.7.0
    pdf2image~=1.16.3

    # Webpage
    # You will need to install Jekyll separately.
    selenium~=4.17.2
    html2text~=2024.2.26

    # Metrics
    opencv-python>=4.7.0.68,<4.8.2.0
    lpips~=0.1.4
    imagehash~=4.3.1 # for caching

# HEIM extra: scenario, model, perturbation and metric dependencies, plus the
# openai and images extras.
heim =
    # HEIM scenarios
    gdown~=5.1

    # HEIM models
    diffusers~=0.24.0
    icetk~=0.0.4
    jax~=0.4.13
    jaxlib~=0.4.13
    crfm-helm[openai]

    # For model, kakaobrain/mindall-e
    einops~=0.7.0
    omegaconf~=2.3.0
    pytorch-lightning~=2.0.5

    # For model, craiyon/dalle-mini and craiyon/dalle-mega
    flax~=0.6.11
    ftfy~=6.1.1
    Unidecode~=1.3.6
    wandb~=0.16

    # HEIM perturbations
    google-cloud-translate~=3.11.2

    # HEIM metrics
    autokeras~=1.0.20
    clip-anytorch~=2.5.0
    google-cloud-storage~=2.9
    lpips~=0.1.4
    multilingual-clip~=1.0.10
    NudeNet~=2.0.9
    opencv-python>=4.7.0.68,<4.8.2.0
    pytorch-fid~=0.3.0
    tensorflow~=2.11
    timm~=0.6.12
    torch-fidelity~=0.3.0
    torchmetrics~=0.11.1

    # Transitive dependency of NudeNet
    # This needs to be a version that provides wheels for all Python versions
    # supported by crfm-helm i.e. Python 3.9, 3.10, 3.11, 3.12
    # Disallow version 0.23.* because it has no Python 3.9 wheels.
    scikit-image>=0.22,==0.*,!=0.23.*

    # Shared image dependencies
    crfm-helm[images]

# Audio language model extra: builds on the openai and google extras.
audiolm =
    crfm-helm[openai]
    crfm-helm[google]

    # For HuggingFace audio datasets
    soundfile~=0.12
    librosa~=0.10

    # For LLaMA-Omni
    openai-whisper==20240930

    # For Qwen2-Audio
    # Narrows the base install's transformers~=4.40 to the 4.45.x series (>=4.45.1).
    transformers~=4.45.1
    transformers_stream_generator~=0.0.4
    scipy~=1.10
    torchvision>=0.14.1,<3.0.0

    # For metrics
    pycocoevalcap~=1.2
    jiwer~=3.0
    rapidfuzz~=3.10
    jieba~=0.42.1

# Install everything
# (except the extras called out as excluded at the end of the list)
all =
    crfm-helm[proxy-server]
    crfm-helm[human-evaluation]
    crfm-helm[scenarios]
    crfm-helm[metrics]
    crfm-helm[plots]
    crfm-helm[decodingtrust]
    crfm-helm[slurm]
    crfm-helm[cleva]
    crfm-helm[images]
    crfm-helm[models]
    crfm-helm[mongo]
    crfm-helm[heim]
    crfm-helm[vlm]
    crfm-helm[audiolm]
    # crfm-helm[bhasa] is excluded because pyonmttok does not support Python 3.12
    # crfm-helm[dev] is excluded because end-users don't need it.
    # crfm-helm[summarization] is excluded because it requires torch<2.0
    # TODO(#2280): Add crfm-helm[summarization] back.
    # NOTE(review): crfm-helm[unitxt] is also absent from this list with no
    # stated reason - confirm whether that is intentional.

# Development only
# Do not include in all
dev =
    pytest~=7.2.0
    pre-commit~=2.20.0
    # Errors produced by type checkers and linters are very version-specific
    # so they are pinned to an exact version.
    black==24.3.0
    mypy==1.5.1
    flake8==5.0.4

# Command-line executables installed with the package.
# Each entry has the form `command-name = module.path:function`.
[options.entry_points]
console_scripts = 
    helm-run = helm.benchmark.run:main
    helm-summarize = helm.benchmark.presentation.summarize:main
    helm-server = helm.benchmark.server:main
    helm-create-plots = helm.benchmark.presentation.create_plots:main
    crfm-proxy-server = helm.proxy.server:main
    crfm-proxy-cli = helm.proxy.cli:main

# Settings for `packages = find:` in [options]: search under src/ and leave
# out any package whose name matches tests*.
[options.packages.find]
where = src
exclude =
    tests*

# Settings for Flake8: Tool For Style Guide Enforcement
[flake8]
max-line-length = 120
# Paths that flake8 does not check. The three client directories correspond
# to the names listed in the [mypy] `exclude` pattern.
exclude =
    venv/*
    src/helm/clients/image_generation/dalle_mini/*
    src/helm/clients/image_generation/mindalle/*
    src/helm/clients/vision_language/open_flamingo/*

# Ignore completely:
# E203 - White space before ':', (conflicts with black)
# E231 - Missing whitespace after ',', ';', or ':'
# E731 - do not assign a lambda expression, use a def
# W503 - line break before binary operator, (conflicts with black)
# W605 - invalid escape sequence '\', (causes failures)
ignore = E203,E231,E731,W503,W605

# Settings for Mypy: static type checker for Python 3
[mypy]
ignore_missing_imports = True
check_untyped_defs = True
# TODO: Remove disable_error_code
disable_error_code = annotation-unchecked
# TODO: Change disallow_untyped_defs to True
disallow_untyped_defs = False
# Regular expression of paths to skip; these names correspond to the client
# directories listed in the [flake8] `exclude` setting.
exclude = dalle_mini|mindalle|open_flamingo

# Settings for pytest: default marker selection and the custom markers it uses.
[tool:pytest]
addopts =
    # By default:
    # - we don't test models because doing so will
    #   make real requests and spend real money
    # - we don't test scenarios because these will
    #   download files, which is slow, consumes disk
    #   space, and increases the chance of spurious
    #   test failures due to failed downloads.
    #
    # For more documentation on pytest markers, see:
    # - https://docs.pytest.org/en/latest/how-to/mark.html#mark
    # - https://docs.pytest.org/en/latest/example/markers.html#mark-examples
    -m 'not models and not scenarios'
# Custom markers referenced by the `-m` expression in addopts above.
markers =
    # Marker for model tests that make real model requests
    models
    # Marker for scenario tests that download files
    scenarios