# .github/workflows/android-perf.yml

name: android-perf

# Triggers: a nightly schedule, changes to this workflow or to the Device Farm
# test spec template (on pull requests and on pushes to main), manual dispatch,
# and invocation from other workflows.
on:
  schedule:
    # Once a day at midnight (GitHub evaluates cron schedules in UTC)
    - cron: '0 0 * * *'
  pull_request:
    paths:
      - .github/workflows/android-perf.yml
      - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
  push:
    branches:
      - main
    paths:
      - .github/workflows/android-perf.yml
      - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
  # Note: GitHub has an upper limit of 10 inputs
  workflow_dispatch:
    inputs:
      models:
        type: string
        required: false
        default: stories110M
        description: Models to be benchmarked
      devices:
        type: string
        required: false
        default: samsung_galaxy_s22
        description: Target devices to run benchmark
      benchmark_configs:
        type: string
        required: false
        description: The list of configs used the benchmark
  # Same inputs as workflow_dispatch, exposed to calling workflows
  workflow_call:
    inputs:
      models:
        type: string
        required: false
        default: stories110M
        description: Models to be benchmarked
      devices:
        type: string
        required: false
        default: samsung_galaxy_s22
        description: Target devices to run benchmark
      benchmark_configs:
        type: string
        required: false
        description: The list of configs used the benchmark

# The group key is built from the workflow name, the PR number (or the ref name
# when there is no PR), the commit SHA when the ref is a branch, and two flags
# telling whether the run was manually dispatched or scheduled. A newer run that
# lands in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

jobs:
  # Resolves which models and devices to benchmark (workflow inputs when given,
  # otherwise the defaults below) and hands them to gather_benchmark_configs.py.
  # The `benchmark_configs` output is used as the job matrix by downstream jobs;
  # presumably the script itself writes it to the step outputs - confirm.
  set-parameters:
    runs-on: ubuntu-22.04
    outputs:
      benchmark_configs: ${{ steps.set-parameters.outputs.benchmark_configs }}
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'false'
      - uses: actions/setup-python@v4
        with:
          python-version: '3.10'
      - name: Set parameters
        id: set-parameters
        shell: bash
        env:
          # Separate default values from the workflow dispatch. To ensure defaults are accessible
          # during scheduled runs and to provide flexibility for different defaults between
          # on-demand and periodic benchmarking.
          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
          # Caller-supplied values go through the environment instead of being
          # expanded into the script text, so their content can never be
          # interpreted as shell code. Both are empty for events without inputs.
          MODELS_INPUT: ${{ inputs.models }}
          DEVICES_INPUT: ${{ inputs.devices }}
        run: |
          set -eux

          # Fall back to the defaults when the input is empty or not provided
          MODELS="${MODELS_INPUT:-$CRON_DEFAULT_MODELS}"
          DEVICES="${DEVICES_INPUT:-$CRON_DEFAULT_DEVICES}"

          PYTHONPATH="${PWD}" python .ci/scripts/gather_benchmark_configs.py \
            --os "android" \
            --models "$MODELS" \
            --devices "$DEVICES"

  # Renders one Device Farm test spec per matrix entry (model + config) and
  # uploads it to S3 next to where the exported model will be stored.
  prepare-test-specs:
    needs: set-parameters
    runs-on: linux.2xlarge
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
    steps:
      - uses: actions/checkout@v3

      - name: Prepare the spec
        working-directory: extension/benchmark/android/benchmark
        shell: bash
        run: |
          set -eux

          # S3 location the export-models job uploads this entry's model.zip to
          MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/model.zip"
          # The template has a single placeholder, so a plain sed substitution
          # is enough; no real jinja rendering is needed
          sed -i -e "s,{{ model_path }},${MODEL_PATH},g" android-llm-device-farm-test-spec.yml.j2
          cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml

          # Dump the rendered spec into the log for debugging
          cat android-llm-device-farm-test-spec.yml

      - name: Upload the spec
        uses: seemethere/upload-artifact-s3@v5
        with:
          path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml
          if-no-files-found: error
          retention-days: 1
          s3-bucket: gha-artifacts
          s3-prefix: |
            ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}

  # Exports one model per matrix entry (model + config) and stages the result
  # under artifacts-to-be-uploaded/<model>_<config> for upload to S3.
  export-models:
    name: export-models
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: set-parameters
    secrets: inherit
    strategy:
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
      fail-fast: false
    with:
      runner: linux.2xlarge.memory
      docker-image: executorch-ubuntu-22.04-qnn-sdk
      submodules: 'true'
      # Check out the same commit as build-benchmark-app so the exported models
      # and the benchmark app are built from identical sources on pull requests
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 60
      upload-artifact: android-models
      upload-artifact-to-s3: true
      secrets-env: EXECUTORCH_HF_TOKEN
      script: |
        # The generic Linux job chooses to use base env, not the one setup by the image
        echo "::group::Setting up dev environment"
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        if [[ ${{ matrix.config }} == *"qnn"* ]]; then
            PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
            PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
        fi
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
        # Install requirements for export_llama
        PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh

        pip install -U "huggingface_hub[cli]"
        huggingface-cli login --token "${SECRET_EXECUTORCH_HF_TOKEN}"
        pip install accelerate sentencepiece
        pip list

        ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.config }}
        echo "::endgroup::"

        echo "::group::Exporting ${{ matrix.config }} model: ${{ matrix.model }}"
        BUILD_MODE="cmake"

        if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
            # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
            HF_MODEL_REPO=${{ matrix.model }}
            # Repo name, with underscores turned into dashes and lower-cased, plus the config
            OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

            if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
                # Llama models on Hugging Face
                if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
                    # SpinQuant
                    # Download prequantized checkpoint from Hugging Face
                    DOWNLOADED_PATH=$(
                      bash .ci/scripts/download_hf_hub.sh \
                        --model_id "${HF_MODEL_REPO}" \
                        --files "tokenizer.model" "params.json" "consolidated.00.pth"
                    )
                    # Export using ExecuTorch's model definition
                    python -m examples.models.llama.export_llama \
                      --model "llama3_2" \
                      --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                      --params "${DOWNLOADED_PATH}/params.json" \
                      --use_sdpa_with_kv_cache \
                      -X \
                      --xnnpack-extended-ops \
                      --preq_mode 8da4w_output_8da8w \
                      --preq_group_size 32 \
                      --max_seq_length 2048 \
                      --output_name "${OUT_ET_MODEL_NAME}.pte" \
                      -kv \
                      -d fp32 \
                      --preq_embedding_quantize 8,0 \
                      --use_spin_quant native \
                      --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                elif [[ ${{ matrix.config }} == "llama3_qlora" ]]; then
                    # QAT + LoRA
                    # Download prequantized checkpoint from Hugging Face
                    DOWNLOADED_PATH=$(
                      bash .ci/scripts/download_hf_hub.sh \
                        --model_id "${HF_MODEL_REPO}" \
                        --files "tokenizer.model" "params.json" "consolidated.00.pth"
                    )
                    # Export using ExecuTorch's model definition
                    python -m examples.models.llama.export_llama \
                      --model "llama3_2" \
                      --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                      --params "${DOWNLOADED_PATH}/params.json" \
                      -qat \
                      -lora 16 \
                      --preq_mode 8da4w_output_8da8w \
                      --preq_group_size 32 \
                      --preq_embedding_quantize 8,0 \
                      --use_sdpa_with_kv_cache \
                      -kv \
                      -X \
                      --xnnpack-extended-ops \
                      -d fp32 \
                      --max_seq_length 2048 \
                      --output_name "${OUT_ET_MODEL_NAME}.pte" \
                      --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                elif [[ ${{ matrix.config }} == "llama3_fb16" ]]; then
                    # Original BF16 version, without any quantization
                    DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
                    python -m examples.models.llama.export_llama \
                      --model "llama3_2" \
                      --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                      --params "${DOWNLOADED_PATH}/params.json" \
                      -kv \
                      --use_sdpa_with_kv_cache \
                      -X \
                      -d bf16 \
                      --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
                      --output_name="${OUT_ET_MODEL_NAME}.pte"
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
                    export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
                    export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
                    export PYTHONPATH=$(pwd)/..

                    DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
                    python -m examples.qualcomm.oss_scripts.llama3_2.llama -- \
                      --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
                      --params "${DOWNLOADED_PATH}/params.json" \
                      --tokenizer_model "${DOWNLOADED_PATH}/tokenizer.model" \
                      --compile_only \
                      --ptq 16a4w \
                      -m SM8650 \
                      --model_size 1B \
                      --model_mode kv \
                      --prompt "Once"

                    OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
                    # Move the generated .pte into the current directory unless it is already there
                    find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                else
                    # By default, test with the Hugging Face model and the xnnpack recipe
                    DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
                    python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
                    ls -lh "${OUT_ET_MODEL_NAME}.pte"
                fi
            else
                echo "Unsupported model ${{ matrix.model }}"
                exit 1
            fi

            # Bundle the exported program with its tokenizer for the device run
            zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
            ls -lh model.zip
            mkdir -p "${ARTIFACTS_DIR_NAME}"
            mv model.zip "${ARTIFACTS_DIR_NAME}"
        elif [[ ${{ matrix.model }} == "llama" ]]; then
            # Install requirements for export_llama
            PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
            # Test llama2
            if [[ ${{ matrix.config }} == *"xnnpack"* ]]; then
                DELEGATE_CONFIG="xnnpack+custom+qe"
            elif [[ ${{ matrix.config }} == *"qnn"* ]]; then
                DELEGATE_CONFIG="qnn"
            else
                echo "Unsupported delegate ${{ matrix.config }}"
                exit 1
            fi
            DTYPE="fp32"
            PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
              -model "${{ matrix.model }}" \
              -build_tool "${BUILD_MODE}" \
              -dtype "${DTYPE}" \
              -mode "${DELEGATE_CONFIG}" \
              -upload "${ARTIFACTS_DIR_NAME}"
        else
            # Any other model name is handed to the generic model test script
            PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \
              "${{ matrix.model }}" \
              "${BUILD_MODE}" \
              "${{ matrix.config }}" \
              "${ARTIFACTS_DIR_NAME}"
        fi
        echo "::endgroup::"

  # Builds the Android benchmark app (arm64-v8a, with QNN enabled) and stages
  # the build output under artifacts-to-be-uploaded for upload to S3.
  build-benchmark-app:
    name: build-benchmark-app
    needs: set-parameters
    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      docker-image: executorch-ubuntu-22.04-clang12-android
      runner: linux.2xlarge
      timeout: 90
      submodules: "true"
      # PR head commit on pull requests, the triggering commit otherwise
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      upload-artifact-to-s3: true
      upload-artifact: android-apps
      script: |
        set -eux

        # Activate the last conda env listed: the generic Linux job starts in
        # the base env rather than the one the docker image sets up
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
        export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded

        # QNN dependencies and SDK must be in place before the app build below
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
        PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh

        export ANDROID_ABIS="arm64-v8a"
        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh "${ARTIFACTS_DIR_NAME}"

  # Runs the benchmark app on the device pool of each matrix entry, using the
  # test spec, app and model the earlier jobs uploaded to S3 for this run.
  # The cost of this job is still being evaluated; it may be moved to a
  # periodic schedule if it turns out to be too expensive.
  benchmark-on-device:
    # Not skipped when one of the jobs listed under `needs` fails
    if: always()
    needs:
      - set-parameters
      - prepare-test-specs
      - build-benchmark-app
      - export-models
    uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
    permissions:
      contents: read
      id-token: write
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.set-parameters.outputs.benchmark_configs) }}
    with:
      device-type: android
      runner: linux.2xlarge
      test-infra-ref: ''
      # Scheduling delays can push a job past the default 60m threshold
      timeout: 120
      # ARN of the ExecuTorch project on AWS
      project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
      device-pool-arn: ${{ matrix.device_arn }}
      # Per-run artifacts: the two APKs plus the spec rendered for this entry
      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.config }}/android-llm-device-farm-test-spec.yml

  # Downloads the Device Farm artifacts of this run attempt from S3, converts
  # them into benchmark result JSON files and uploads those in both the v2 and
  # v3 schemas. Runs even when the benchmark job did not succeed.
  upload-benchmark-results:
    needs:
      - benchmark-on-device
    if: always()
    runs-on: linux.2xlarge
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false

      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Setup conda
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
          python-version: '3.10'

      - name: Download the list of artifacts from S3
        env:
          ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/
        shell: bash
        run: |
          set -eux
          ${CONDA_RUN} python -mpip install awscli==1.32.18

          mkdir -p artifacts
          pushd artifacts
          ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" .
          popd

          ls -lah artifacts

      - name: Extract the benchmark results JSON
        shell: bash
        env:
          # Context values are passed through the environment rather than being
          # expanded into the script text. The head branch name in particular
          # is chosen by the pull request author and must never be interpreted
          # as shell code in a job that holds AWS credentials.
          REPO: ${{ github.repository }}
          HEAD_BRANCH: ${{ github.head_ref || github.ref_name }}
          WORKFLOW_NAME: ${{ github.workflow }}
          WORKFLOW_RUN_ID: ${{ github.run_id }}
          WORKFLOW_RUN_ATTEMPT: ${{ github.run_attempt }}
        run: |
          set -eux

          mkdir -p benchmark-results

          for ARTIFACTS_BY_JOB in artifacts/*.json; do
            # An unmatched glob is left as the literal pattern; stop in that case
            [ -f "${ARTIFACTS_BY_JOB}" ] || break
            echo "${ARTIFACTS_BY_JOB}"
            ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
              --artifacts "${ARTIFACTS_BY_JOB}" \
              --output-dir benchmark-results \
              --repo "${REPO}" \
              --head-branch "${HEAD_BRANCH}" \
              --workflow-name "${WORKFLOW_NAME}" \
              --workflow-run-id "${WORKFLOW_RUN_ID}" \
              --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"
          done

          # Print the extracted results of both schemas for debugging.
          # NOTE(review): unlike the loop above there is no -f guard here, so an
          # unmatched glob makes cat fail the step - confirm this is intended.
          for SCHEMA in v2 v3; do
            for BENCHMARK_RESULTS in benchmark-results/"${SCHEMA}"/*.json; do
              cat "${BENCHMARK_RESULTS}"
              echo
            done
          done

      # TODO (huydhn): Remove v2 schema once the benchmark dashboard finishes the migration
      - name: Upload the benchmark results (v2)
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v2
          dry-run: false
          schema-version: v2

      - name: Upload the benchmark results (v3)
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: benchmark-results/v3
          dry-run: false
          schema-version: v3
          github-token: ${{ secrets.GITHUB_TOKEN }}