Skip to content

TorchBench Optim Regression Detector on A100 #659

TorchBench Optim Regression Detector on A100

TorchBench Optim Regression Detector on A100 #659

name: TorchBench Optim Regression Detector on A100
on:
  # Scheduled trigger: run at 4 AM UTC = midnight ET.
  schedule:
    - cron: "0 4 * * *"
  # Manual trigger; the two inputs below can be supplied when dispatching.
  workflow_dispatch:
    inputs:
      userbenchmark_name:
        description: 'Name of the user benchmark to run'
      userbenchmark_options:
        description: 'Option of the user benchmark to run'
jobs:
  # Single job: installs TorchBench on the A100 runner, runs the optim
  # userbenchmark, checks the produced metrics for regressions, uploads the
  # result JSONs, and finally fails the run if errors.txt was produced.
  run-userbenchmark:
    runs-on: linux.aws.a100
    timeout-minutes: 1440  # 24 hours
    environment: docker-s3-upload
    env:
      CONDA_ENV: "optim"
      # NOTE(review): the runner label above says AWS while this says gcp;
      # confirm the name is kept on purpose before changing it.
      PLATFORM_NAME: "gcp_a100"
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
    steps:
      - name: Checkout TorchBench
        uses: actions/checkout@v3
        with:
          # The repository is checked out into ./benchmark, so every step
          # below enters that directory before using repo-relative paths.
          path: benchmark
      - name: Install Conda
        run: |
          set -x
          pushd benchmark
          bash ./.ci/torchbench/install-conda.sh
      - name: Install TorchBench
        run: |
          set -x
          pushd benchmark
          # NOTE(review): the script runs in a child bash process; confirm it
          # makes the conda environment visible to the commands that follow.
          bash ./.ci/torchbench/activate-conda.sh
          # only install the subset of models currently running.
          python install.py BERT_pytorch DALLE2_pytorch hf_GPT2_large hf_T5_large resnet50 timm_vision_transformer_large yolov3
      - name: Print torch.version.git_version
        run: |
          set -x
          pushd benchmark
          bash ./.ci/torchbench/activate-conda.sh
          python -c "import torch; print(torch.version.git_version)"
      - name: Run optim user benchmark
        env:
          # Handed over through the environment instead of being interpolated
          # into the script text, so the dispatch input cannot inject shell
          # syntax. Empty when the workflow is started by the schedule.
          USERBENCHMARK_OPTIONS: ${{ github.event.inputs.userbenchmark_options }}
        run: |
          set -x
          pushd benchmark
          bash ./.ci/torchbench/activate-conda.sh
          # remove old results
          if [ -d ../benchmark-output ]; then rm -Rf ../benchmark-output; fi
          if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi
          # TODO: scale this to run other benchmarks, but let's start with optim
          # Only run a subset, see if this fixes the workflow
          # Split the option string on whitespace only (no globbing, no
          # quote/variable evaluation); an empty string yields no extra args.
          read -r -a extra_args <<< "${USERBENCHMARK_OPTIONS:-}"
          python -m userbenchmark.optim.run_optim_benchmarks -s -c "${extra_args[@]}"
          cp -r ./.userbenchmark/optim ../benchmark-output
      - name: Detect potential regressions
        continue-on-error: true
        run: |
          set -x
          pushd benchmark
          bash ./.ci/torchbench/activate-conda.sh
          # One array element per file, newest name first; -maxdepth goes
          # before -name because GNU find treats it as a global option.
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # TODO: the following assumes only one metrics-*.json is found. It will keep
          # overwriting gh-issue.md if multiple are found. Scaling this up is a potential next step.
          for r in "${RESULTS[@]}"; do
            python regression_detector.py --platform "${PLATFORM_NAME}" --treatment "${r}" --owner @janeyx99 \
              --gh-issue-path gh-issue.md --errors-path errors.txt
          done
      - name: Create the github issue
        continue-on-error: true
        # Step is currently switched off.
        if: false
        uses: peter-evans/create-issue-from-file@v4
        with:
          # NOTE(review): TORCHBENCH_REGRESSION_DETECTED is not set anywhere in
          # this workflow; presumably exported by regression_detector.py - confirm.
          title: Optim Perf Signal Detected by TorchBench CI on ${{ env.TORCHBENCH_REGRESSION_DETECTED }}
          token: ${{ secrets.TORCHBENCH_ACCESS_TOKEN }}
          content-filepath: ./benchmark/gh-issue.md
          labels: |
            torchbench-perf-report
      - name: Upload result jsons to Scribe and S3
        run: |
          pushd benchmark
          bash ./.ci/torchbench/activate-conda.sh
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # [*] prints every element; a bare ${RESULTS} would show only the first.
          echo "Uploading result jsons: ${RESULTS[*]}"
          for r in "${RESULTS[@]}"; do
            python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
            python ./scripts/userbenchmark/upload_s3.py --upload-file "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
          done
      - name: Upload artifact
        # NOTE(review): the v3 artifact actions are old; check whether this
        # needs to move to v4 for the runner in use.
        uses: actions/upload-artifact@v3
        with:
          name: TorchBench result
          path: benchmark-output/
      - name: Finally, error if errors.txt exists
        if: always()
        run: |
          set -x
          # Do not error earlier as we want all artifacts and regressions to be reported first
          # TODO: potentially move errors.txt to benchmark-output so it gets uploaded to S3
          pushd benchmark
          bash ./.ci/torchbench/activate-conda.sh
          if [ -e errors.txt ]; then cat errors.txt && exit 1; fi
      - name: Remove conda environment
        if: always()
        run: |
          . "${HOME}/miniconda3/etc/profile.d/conda.sh"
          # --yes: there is no TTY to answer conda's confirmation prompt.
          conda remove -n "${CONDA_ENV}" --all --yes