diff --git a/.ci/scripts/download_hf_hub.sh b/.ci/scripts/download_hf_hub.sh
new file mode 100644
index 0000000000..b47fc5dd21
--- /dev/null
+++ b/.ci/scripts/download_hf_hub.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+# Download one file from the Hugging Face Hub and print its local path
+# Arguments:
+#   1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
+#   2. repo_path: The path of the file inside the repository
+# The values reach Python through sys.argv instead of being pasted into the
+# Python source, so quotes or backslashes in them cannot alter the program.
+function _hf_download_file() {
+  python3 -c '
+import sys
+from huggingface_hub import hf_hub_download
+print(hf_hub_download(repo_id=sys.argv[1], filename=sys.argv[2]))
+' "$1" "$2"
+}
+
+# Function to download files from the Hugging Face Hub
+# Arguments:
+#   1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
+#   2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
+#   3. file_names: One or more filenames to be downloaded, one argument each
+# Returns:
+#   Prints the directory containing the first downloaded file on stdout.
+#   Returns 1, with a message on stderr, on bad arguments or a failed download.
+function download_hf_files() {
+  if [[ $# -lt 3 || -z "$1" ]]; then
+    echo "Error: download_hf_files needs a model_id, a subdir (may be empty) and at least one file name" >&2
+    return 1
+  fi
+
+  local model_id="$1"
+  local subdir="$2"
+  shift 2
+  local file_names=("$@") # Capture all remaining arguments as an array
+  local prefix="${subdir:+${subdir}/}" # "subdir/" when a subdir is given, else empty
+
+  # Declared separately so the exit status tested below is the download's,
+  # not the always-zero status of `local`
+  local first_path
+  local download_dir
+  local file_name
+
+  # Use the first file to determine the download directory
+  if ! first_path=$(_hf_download_file "$model_id" "${prefix}${file_names[0]}"); then
+    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
+    return 1
+  fi
+  download_dir=$(dirname "$first_path")
+
+  # Download the remaining files. Their paths are discarded so that stdout
+  # carries nothing but the directory printed at the end.
+  for file_name in "${file_names[@]:1}"; do
+    if ! _hf_download_file "$model_id" "${prefix}${file_name}" >/dev/null; then
+      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
+      return 1
+    fi
+  done
+
+  # Return the directory containing the downloaded files
+  echo "$download_dir"
+}
+
+# Check if script is called directly
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  # Parse arguments from CLI
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --model_id|--subdir)
+        # `shift 2` fails without shifting when the value is missing, which
+        # would leave this loop spinning forever; reject that case up front.
+        if [[ $# -lt 2 ]]; then
+          echo "Error: $1 requires a value" >&2
+          exit 1
+        fi
+        if [[ "$1" == "--model_id" ]]; then
+          MODEL_ID="$2"
+        else
+          SUBDIR="$2"
+        fi
+        shift 2
+        ;;
+      --files)
+        shift
+        FILES_TO_DOWNLOAD=()
+        while [[ $# -gt 0 && $1 != --* ]]; do
+          FILES_TO_DOWNLOAD+=("$1")
+          shift
+        done
+        ;;
+      *)
+        echo "Unknown option: $1" >&2
+        exit 1
+        ;;
+    esac
+  done
+
+  # Validate required arguments
+  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
+    echo "Usage: $0 --model_id MODEL_ID [--subdir SUBDIR] --files FILE [FILE ...]" >&2
+    exit 1
+  fi
+
+  # Call the function; its stdout (the download directory) is this script's output
+  download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}" || exit 1
+fi
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index fce17e85a9..dfec515937 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -86,7 +86,7 @@ jobs: # Separate default values from the workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking.
- CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }} + CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit,meta-llama/Llama-3.2-1B' || 'stories110M' }} CRON_DEFAULT_DEVICES: samsung_galaxy_s22 CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }} run: | @@ -108,6 +108,7 @@ jobs: declare -A DEVICE_POOL_ARNS DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa" DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db" + DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a" # Resolve device names with their corresponding ARNs if [[ ! 
$(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then @@ -168,18 +169,20 @@ jobs: name: export-models uses: pytorch/test-infra/.github/workflows/linux_job.yml@main needs: set-parameters + secrets: inherit strategy: matrix: model: ${{ fromJson(needs.set-parameters.outputs.models) }} delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }} fail-fast: false with: - runner: linux.4xlarge + runner: linux.2xlarge.memory docker-image: executorch-ubuntu-22.04-qnn-sdk submodules: 'true' timeout: 60 upload-artifact: android-models upload-artifact-to-s3: true + secrets-env: EXECUTORCH_HF_TOKEN script: | # The generic Linux job chooses to use base env, not the one setup by the image echo "::group::Setting up dev environment" @@ -190,14 +193,109 @@ jobs: PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh fi PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake" + # Install requirements for export_llama + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} echo "::endgroup::" echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" BUILD_MODE="cmake" - DTYPE="fp32" - if [[ ${{ matrix.model }} =~ ^stories* ]]; then + if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + pip install accelerate sentencepiece + # HuggingFace model. 
Assume the pattern is always like "/" + HF_MODEL_REPO=${{ matrix.model }} + OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}" + + if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then + # Llama models on Hugging Face + if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then + # SpinQuant + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + --use_sdpa_with_kv_cache \ + -X \ + --xnnpack-extended-ops \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --max_seq_length 2048 \ + --output_name "${OUT_ET_MODEL_NAME}.pte" \ + -kv \ + -d fp32 \ + --preq_embedding_quantize 8,0 \ + --use_spin_quant native \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then + # QAT + LoRA + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -qat \ + -lora 16 \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --preq_embedding_quantize 8,0 \ + --use_sdpa_with_kv_cache \ + -kv \ + -X \ + --xnnpack-extended-ops \ + -d fp32 \ + --max_seq_length 2048 \ + --output_name "${OUT_ET_MODEL_NAME}.pte" \ + --metadata '{"get_bos_id":128000, 
"get_eos_ids":[128009, 128001]}' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + else + if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then + # Original BF16 version, without any quantization + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -d bf16 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + else + # By default, test with the Hugging Face model and the xnnpack recipe + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") + python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + fi + else + echo "Unsupported model ${{ matrix.model }}" + exit 1 + fi + + zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model" + ls -lh model.zip + mkdir -p "${ARTIFACTS_DIR_NAME}" + mv model.zip "${ARTIFACTS_DIR_NAME}" + elif [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 @@ -209,6 +307,7 @@ jobs: echo "Unsupported delegate ${{ matrix.delegate }}" exit 1 fi + DTYPE="fp32" PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \ -model "${{ matrix.model }}" \ -build_tool "${BUILD_MODE}" \ diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 394c148cf1..f7c9bd5cc0 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -76,7 +76,7 @@ jobs: # Separate default values from the 
workflow dispatch. To ensure defaults are accessible # during scheduled runs and to provide flexibility for different defaults between # on-demand and periodic benchmarking. - CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l' || 'stories110M' }} + CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B' || 'stories110M' }} CRON_DEFAULT_DEVICES: apple_iphone_15 CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,coreml,mps' || 'xnnpack' }} run: | @@ -155,6 +155,7 @@ jobs: name: export-models uses: pytorch/test-infra/.github/workflows/macos_job.yml@main needs: set-parameters + secrets: inherit strategy: matrix: model: ${{ fromJson(needs.set-parameters.outputs.models) }} @@ -168,6 +169,7 @@ jobs: timeout: 60 upload-artifact: ios-models upload-artifact-to-s3: true + secrets-env: EXECUTORCH_HF_TOKEN script: | set -eux @@ -189,14 +191,110 @@ jobs: backends/apple/mps/install_requirements.sh fi + # Install requirements for export_llama + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh + ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }} echo "::endgroup::" echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}" BUILD_MODE="cmake" - DTYPE="fp32" - if [[ ${{ matrix.model }} =~ ^stories* ]]; then + if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then + pip install -U "huggingface_hub[cli]" + huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN + ${CONDA_RUN} pip install accelerate sentencepiece + # HuggingFace model. 
Assume the pattern is always like "/" + HF_MODEL_REPO=${{ matrix.model }} + OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}" + + if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then + # Llama models on Hugging Face + if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then + # SpinQuant + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + --use_sdpa_with_kv_cache \ + -X \ + --xnnpack-extended-ops \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --max_seq_length 2048 \ + --output_name "${OUT_ET_MODEL_NAME}.pte" \ + -kv \ + -d fp32 \ + --preq_embedding_quantize 8,0 \ + --use_spin_quant native \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then + # QAT + LoRA + # Download prequantized chceckpoint from Hugging Face + DOWNLOADED_PATH=$( + bash .ci/scripts/download_hf_hub.sh \ + --model_id "${HF_MODEL_REPO}" \ + --files "tokenizer.model" "params.json" "consolidated.00.pth" + ) + # Export using ExecuTorch's model definition + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -qat \ + -lora 16 \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --preq_embedding_quantize 8,0 \ + --use_sdpa_with_kv_cache \ + -kv \ + -X \ + --xnnpack-extended-ops \ + -d fp32 \ + --max_seq_length 2048 \ + --output_name "${OUT_ET_MODEL_NAME}.pte" \ + 
--metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' + ls -lh "${OUT_ET_MODEL_NAME}.pte" + else + if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then + # Original BF16 version, without any quantization + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth") + ${CONDA_RUN} python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \ + --params "${DOWNLOADED_PATH}/params.json" \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -d bf16 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="${OUT_ET_MODEL_NAME}.pte" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + else + # By default, test with the Hugging Face model and the xnnpack recipe + DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model") + ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME" + ls -lh "${OUT_ET_MODEL_NAME}.pte" + fi + fi + else + echo "Unsupported model ${{ matrix.model }}" + exit 1 + fi + + zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model" + ls -lh model.zip + mkdir -p "${ARTIFACTS_DIR_NAME}" + mv model.zip "${ARTIFACTS_DIR_NAME}" + elif [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash examples/models/llama/install_requirements.sh @@ -209,6 +307,7 @@ jobs: elif [[ ${{ matrix.delegate }} == "mps" ]]; then DELEGATE_CONFIG="mps" fi + DTYPE="fp32" PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ bash .ci/scripts/test_llama.sh \ -model "${{ matrix.model }}" \ diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 365c7564fe..7972269e92 100644 --- a/.github/workflows/trunk.yml +++ 
b/.github/workflows/trunk.yml @@ -358,11 +358,11 @@ jobs: secrets: inherit strategy: matrix: - hf_model_repo: [google/gemma-2b] + hf_model_repo: [google/gemma-2-2b] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN - runner: linux.12xlarge + runner: linux.2xlarge.memory docker-image: executorch-ubuntu-22.04-clang12 submodules: 'true' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} @@ -420,19 +420,10 @@ jobs: TOKENIZER_FILE=tokenizer.model TOKENIZER_BIN_FILE=tokenizer.bin ET_MODEL_NAME=et_model - # Fetch the file using a Python one-liner - DOWNLOADED_TOKENIZER_FILE_PATH=$(python -c " - from huggingface_hub import hf_hub_download - # Download the file from the Hugging Face Hub - downloaded_path = hf_hub_download( - repo_id='${{ matrix.hf_model_repo }}', - filename='${TOKENIZER_FILE}' - ) - print(downloaded_path) - ") - if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH" ]; then + DOWNLOADED_TOKENIZER_FILE_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${{ matrix.hf_model_repo }}" --files "${TOKENIZER_FILE}") + if [ -f "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" ]; then echo "${TOKENIZER_FILE} downloaded successfully at: $DOWNLOADED_TOKENIZER_FILE_PATH" - python -m extension.llm.tokenizer.tokenizer -t $DOWNLOADED_TOKENIZER_FILE_PATH -o ./${TOKENIZER_BIN_FILE} + python -m extension.llm.tokenizer.tokenizer -t "$DOWNLOADED_TOKENIZER_FILE_PATH/$TOKENIZER_FILE" -o ./${TOKENIZER_BIN_FILE} ls ./tokenizer.bin else echo "Failed to download ${TOKENIZER_FILE} from ${{ matrix.hf_model_repo }}." 
diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index c03ad14517..8878de4d94 100644 --- a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -63,7 +63,7 @@ @implementation LLaMATests return [filename hasSuffix:@".pte"] && [filename containsString:@"llama"]; }, @"tokenizer" : ^BOOL(NSString *filename) { - return [filename isEqual:@"tokenizer.bin"]; + return [filename isEqual:@"tokenizer.bin"] || [filename isEqual:@"tokenizer.model"]; }, }; } diff --git a/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 b/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 index dc610437fc..68f8399f16 100644 --- a/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 +++ b/extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2 @@ -19,8 +19,9 @@ phases: # Copy the model - mkdir -p /tmp/Payload/Benchmark.app/aatp/data - - cp *.bin /tmp/Payload/Benchmark.app/aatp/data + - cp tokenizer.* /tmp/Payload/Benchmark.app/aatp/data - cp *.pte /tmp/Payload/Benchmark.app/aatp/data + - ls -all /tmp/Payload/Benchmark.app/aatp/data - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Debug-iphoneos - mkdir $DEVICEFARM_TEST_PACKAGE_PATH/Release-iphoneos