diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh index 7d3370ee56..0afe51f0b0 100644 --- a/.ci/scripts/build_llama_android.sh +++ b/.ci/scripts/build_llama_android.sh @@ -48,9 +48,9 @@ build_llama_runner() { -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-android-out/examples/models/llama2 examples/models/llama2 + -Bcmake-android-out/examples/models/llama examples/models/llama - cmake --build cmake-android-out/examples/models/llama2 -j4 --config Release + cmake --build cmake-android-out/examples/models/llama -j4 --config Release } install_flatc_from_source install_executorch_and_backend_lib diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index 94fd5d486b..ed2a9c2558 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -125,7 +125,7 @@ cmake_install_executorch_libraries() { cmake_build_llama_runner() { echo "Building llama runner" - dir="examples/models/llama2" + dir="examples/models/llama" retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Debug \ @@ -206,7 +206,7 @@ if [[ "${QNN}" == "ON" ]]; then EXPORT_ARGS="${EXPORT_ARGS} -kv -v --qnn --disable_dynamic_shape" fi # Add dynamically linked library location -$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS} +$PYTHON_EXECUTABLE -m examples.models.llama.export_llama ${EXPORT_ARGS} # Create tokenizer.bin. echo "Creating tokenizer.bin" @@ -219,7 +219,7 @@ echo "Running ${EXPORTED_MODEL_NAME} in portable mode" if [[ "${BUILD_TOOL}" == "buck2" ]]; then # Run model. # shellcheck source=/dev/null - $BUCK run examples/models/llama2:main -- ${RUNTIME_ARGS} > result.txt + $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt elif [[ "${BUILD_TOOL}" == "cmake" ]]; then cmake_install_executorch_libraries cmake_build_llama_runner @@ -227,7 +227,7 @@ elif [[ "${BUILD_TOOL}" == "cmake" ]]; then NOW=$(date +"%H:%M:%S") echo "Starting to run llama runner at ${NOW}" # shellcheck source=/dev/null - cmake-out/examples/models/llama2/llama_main ${RUNTIME_ARGS} > result.txt + cmake-out/examples/models/llama/llama_main ${RUNTIME_ARGS} > result.txt NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" else diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh index 3f95fbc0b6..4e37d0ebaa 100755 --- a/.ci/scripts/test_model.sh +++ b/.ci/scripts/test_model.sh @@ -75,9 +75,9 @@ run_portable_executor_runner() { test_model() { if [[ "${MODEL_NAME}" == "llama2" ]]; then # Install requirements for export_llama - bash examples/models/llama2/install_requirements.sh - # Test export_llama script: python3 -m examples.models.llama2.export_llama - "${PYTHON_EXECUTABLE}" -m examples.models.llama2.export_llama -c examples/models/llama2/params/demo_rand_params.pth -p examples/models/llama2/params/demo_config.json + bash examples/models/llama/install_requirements.sh + # Test export_llama script: python3 -m examples.models.llama.export_llama + "${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama -c examples/models/llama/params/demo_rand_params.pth -p examples/models/llama/params/demo_config.json run_portable_executor_runner rm "./${MODEL_NAME}.pte" fi diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml index 35d3568c41..c5f244a934 100644 --- a/.github/workflows/android-perf.yml +++ b/.github/workflows/android-perf.yml @@ -160,7 +160,7 @@ jobs: if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for 
export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then DELEGATE_CONFIG="xnnpack+custom+qe" diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml index 7de73a23ff..7de308b1a6 100644 --- a/.github/workflows/apple-perf.yml +++ b/.github/workflows/apple-perf.yml @@ -162,7 +162,7 @@ jobs: if [[ ${{ matrix.model }} =~ ^stories* ]]; then # Install requirements for export_llama PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \ - bash examples/models/llama2/install_requirements.sh + bash examples/models/llama/install_requirements.sh # Test llama2 if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index f7d2b627bc..bb66ba54c3 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -98,6 +98,12 @@ jobs: - dtype: bf16 build-tool: buck2 mode: portable + - dtype: bf16 + build-tool: cmake + mode: custom + - dtype: bf16 + build-tool: buck2 + mode: custom fail-fast: false with: runner: linux.2xlarge @@ -117,7 +123,7 @@ jobs: # Setup executorch PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" @@ -216,7 +222,7 @@ jobs: bash install_requirements.sh --pybind xnnpack # install Llava requirements - bash examples/models/llama2/install_requirements.sh + bash examples/models/llama/install_requirements.sh bash examples/models/llava/install_requirements.sh # run python unittest @@ -411,7 +417,7 @@ jobs: # Setup executorch PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2 # Install requirements for export_llama - PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh stories110M "${BUILD_TOOL}" "${DTYPE}" "${MODE}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index c749fd67b5..2d4bb8184b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -227,6 +227,8 @@ jobs: include: - dtype: bf16 mode: portable + - dtype: bf16 + mode: custom fail-fast: false with: runner: macos-m1-stable @@ -255,7 +257,7 @@ jobs: fi # Install requirements for export_llama - PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama/install_requirements.sh # Test llama2 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M cmake "${DTYPE}" "${MODE}" @@ -279,7 +281,7 @@ jobs: # GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}" # # install Llava requirements - # ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh + # ${CONDA_RUN} bash examples/models/llama/install_requirements.sh # ${CONDA_RUN} bash examples/models/llava/install_requirements.sh # # run python unittest @@ -385,7 +387,7 @@ jobs: cmake --build cmake-out -j9 --target install --config Release echo "Build llama runner" - dir="examples/models/llama2" 
+ dir="examples/models/llama" cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ @@ -437,5 +439,5 @@ jobs: python -m extension.export_util.export_hf_model -hfm=${{ matrix.hf_model_repo }} -o ${ET_MODEL_NAME} - cmake-out/examples/models/llama2/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" + cmake-out/examples/models/llama/llama_main --model_path=${ET_MODEL_NAME}.pte --tokenizer_path=${TOKENIZER_BIN_FILE} --prompt="My name is" echo "::endgroup::" diff --git a/README.md b/README.md index e9ab0773a1..b27845e9f5 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ please visit our documentation website [for the latest release](https://pytorch. Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin. -Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. +Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch. -**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch. +**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch. ## Feedback diff --git a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp index 21bcab925f..ec2f0459a4 100644 --- a/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp +++ b/backends/apple/coreml/runtime/inmemoryfs/inmemory_filesystem.cpp @@ -253,11 +253,11 @@ bool write_directory_node(InMemoryDirectoryNode* node, return false; } - for (const auto& [_, node]: node->get_items()) { - if (node.get()->isDirectory() && !recursive) { + for (const auto& [_, node_2]: node->get_items()) { + if (node_2.get()->isDirectory() && !recursive) { continue; } - if (!write_node(node.get(), dir_path, recursive, error)) { + if (!write_node(node_2.get(), dir_path, recursive, error)) { return false; } } @@ -383,9 +383,9 @@ FlattenedInMemoryNode::unflatten(const std::vector& flatt case InMemoryFileSystem::InMemoryNode::Kind::Directory: { std::unordered_map> items; items.reserve(flattened_node_metadata.child_name_to_indices_map.size()); - for (const auto& [name, index]: flattened_node_metadata.child_name_to_indices_map) { - auto moveIt = std::make_move_iterator(nodes.begin() + index); - items[name] = *moveIt; + for (const auto& [name_2, index_2]: flattened_node_metadata.child_name_to_indices_map) { + auto moveIt = std::make_move_iterator(nodes.begin() + index_2); + items[name_2] = *moveIt; } auto directory_node = std::make_unique(std::move(name), std::move(attributes), std::move(items)); diff --git a/backends/qualcomm/_passes/annotate_quant_attrs.py b/backends/qualcomm/_passes/annotate_quant_attrs.py index 0dc39d2a4d..632e67569f 100644 --- a/backends/qualcomm/_passes/annotate_quant_attrs.py +++ b/backends/qualcomm/_passes/annotate_quant_attrs.py @@ -27,9 +27,12 @@ class AnnotateQuantAttrs(ExportPass): generated after quatization process. 
""" - def __init__(self, edge_program: torch.export.ExportedProgram): + def __init__( + self, edge_program: torch.export.ExportedProgram, skip_advanced_requat: bool + ): super(AnnotateQuantAttrs, self).__init__() self.edge_program = edge_program + self.skip_advanced_requant = skip_advanced_requat def _annotate_source_nodes( self, quant_node: torch.fx.Node, quant_attrs: Dict[str, Any] @@ -68,9 +71,26 @@ def _annotate_requant(self, n): # TODO: Store multiple pairs of requantize attributes when we have an op builder # that has multiple outputs that requires quant attributes. - if q_attrs["dtype"] != dq_attrs["dtype"]: - dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING] - n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs + if self.skip_advanced_requant: + if q_attrs["dtype"] != dq_attrs["dtype"]: + dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING] + n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs + else: + # When dtype is the same but other specs such as scale and offset are different, + # insert requant to improve accuracy. + # Users can turn this feature off if any inference speed drop is observed. + if any( + q_attrs[attr] != dq_attrs[attr] + for attr in [ + "scale", + "zero_point", + "quant_min", + "quant_max", + "dtype", + ] + ): + dq_attrs[QCOM_ENCODING] = q_attrs[QCOM_ENCODING] + n.args[0].meta[QCOM_REQUANTIZE] = dq_attrs # Dequant all the fold_quant parameters back to fp32. # If an operation is not supported by QNN and got fallback, it will expect a fp32 param. diff --git a/backends/qualcomm/aot/ir/targets.bzl b/backends/qualcomm/aot/ir/targets.bzl index a7cc5c03e2..5fdcd14485 100644 --- a/backends/qualcomm/aot/ir/targets.bzl +++ b/backends/qualcomm/aot/ir/targets.bzl @@ -4,6 +4,7 @@ load( ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/backends/qualcomm:targets.bzl", "generate_schema_header") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") QCIR_NAME = "qcir" INPUT_QCIR = QCIR_NAME + ".fbs" @@ -55,7 +56,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn:api", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/backends/qualcomm/aot/wrappers:wrappers", diff --git a/backends/qualcomm/aot/python/targets.bzl b/backends/qualcomm/aot/python/targets.bzl index b16acfc490..e1f5a6a8fc 100644 --- a/backends/qualcomm/aot/python/targets.bzl +++ b/backends/qualcomm/aot/python/targets.bzl @@ -3,6 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") PYTHON_MODULE_NAME = "PyQnnManagerAdaptor" @@ -32,7 +33,7 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", - "fbsource//third-party/qualcomm/qnn:api", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ "pybind11", @@ -65,7 +66,7 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", - "fbsource//third-party/qualcomm/qnn:api", + 
"fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ "pybind11", @@ -92,7 +93,7 @@ def define_common_targets(): "//executorch/backends/qualcomm:schema", "//executorch/backends/qualcomm/aot/ir:qcir_utils", "//executorch/backends/qualcomm/runtime:runtime", - "fbsource//third-party/qualcomm/qnn:api", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), ], external_deps = [ "pybind11", diff --git a/backends/qualcomm/aot/wrappers/targets.bzl b/backends/qualcomm/aot/wrappers/targets.bzl index 08d6920a02..24ceeb723e 100644 --- a/backends/qualcomm/aot/wrappers/targets.bzl +++ b/backends/qualcomm/aot/wrappers/targets.bzl @@ -3,6 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. @@ -22,7 +23,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn:api", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", ], diff --git a/backends/qualcomm/qnn_preprocess.py b/backends/qualcomm/qnn_preprocess.py index 417934acbd..f13d3fb55a 100644 --- a/backends/qualcomm/qnn_preprocess.py +++ b/backends/qualcomm/qnn_preprocess.py @@ -11,7 +11,6 @@ import executorch.backends.qualcomm.python.PyQnnManagerAdaptor as PyQnnManager import torch # noqa: F401 -from executorch.backends.qualcomm._passes.convert_to_linear import ConvertToLinear from executorch.backends.qualcomm._passes.fuse_consecutive_transpose import ( FuseConsecutiveTranspose, ) @@ -49,7 +48,6 @@ def preprocess( # QNN Delegate Specific Passes qnn_compiler_passes = PassManager( passes=[ - ConvertToLinear(), InsertRequantize(edge_program), InsertIOQDQ(edge_program), LayoutTransform(edge_program, insert_permute=True), diff --git a/backends/qualcomm/quantizer/utils.py b/backends/qualcomm/quantizer/utils.py index d1ea35fa19..46a048c36b 100644 --- a/backends/qualcomm/quantizer/utils.py +++ b/backends/qualcomm/quantizer/utils.py @@ -364,7 +364,7 @@ def get_ptq_per_channel_quant_config( quant_min=torch.iinfo(act_dtype).min, quant_max=torch.iinfo(act_dtype).max, qscheme=torch.per_tensor_affine, - observer_or_fake_quant_ctr=MinMaxObserver.with_args(**extra_args), + observer_or_fake_quant_ctr=MovingAverageMinMaxObserver.with_args(**extra_args), ) weight_quantization_spec = QuantizationSpec( diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index 8c921b96ec..f7a3e220de 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -24,7 +24,7 @@ def define_common_targets(): platforms = [ANDROID], visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn:api", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), "//executorch/runtime/backend:interface", ], exported_deps = [ @@ -53,7 +53,7 @@ def define_common_targets(): exclude = ["Logging.h"], ), define_static_target = True, - link_whole = True, # needed for executorch/examples/models/llama2:main to register QnnBackend + link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend platforms = [ANDROID], visibility = 
["@EXECUTORCH_CLIENTS"], resources = { diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index a201274402..929ccd9744 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -3,6 +3,7 @@ load( "ANDROID", ) load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/backends/qualcomm/qnn_version.bzl", "get_qnn_library_verision") # Construct the input and output file names. All input and output files rely on scalar_type file. SCHEMA_NAME = "schema" @@ -83,7 +84,7 @@ def define_common_targets(): define_static_target = True, visibility = ["@EXECUTORCH_CLIENTS"], deps = [ - "fbsource//third-party/qualcomm/qnn:api", + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", "//executorch/backends/qualcomm/runtime:runtime", @@ -92,6 +93,3 @@ def define_common_targets(): ":schema", ], ) - -def get_qnn_library_verision(): - return "2.26" diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 01b1014e4c..1594ad58db 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -37,10 +37,7 @@ skip_annotation, ) -from executorch.examples.models.llama2.llama_transformer import ( - ModelArgs, - MOEFeedForward, -) +from executorch.examples.models.llama.llama_transformer import ModelArgs, MOEFeedForward from executorch.examples.qualcomm.utils import setup_common_args_and_variables @@ -51,7 +48,7 @@ from executorch.examples.models.inception_v3 import InceptionV3Model from executorch.examples.models.inception_v4 import InceptionV4Model -# from executorch.examples.models.llama2 import Llama2Model +# from executorch.examples.models.llama import Llama2Model from executorch.examples.models.mobilebert import MobileBertModelExample from executorch.examples.models.mobilenet_v2 import MV2Model from executorch.examples.models.mobilenet_v3 import MV3Model diff --git a/backends/qualcomm/utils/constants.py b/backends/qualcomm/utils/constants.py index 8a37b2bd8c..c54770e542 100644 --- a/backends/qualcomm/utils/constants.py +++ b/backends/qualcomm/utils/constants.py @@ -26,6 +26,7 @@ QCOM_ZERO_POINT = "zero_point" QCOM_ZERO_POINTS = "zero_points" QCOM_PASS_EXPAND_BROADCAST_SHAPE = "expand_broadcast_shape" +QCOM_PASS_SKIP_ADVANCED_REQUANT = "skip_advanced_requant" # constants in backends/qualcomm/tests QCOM_ANNOTATION = "annotation" diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py index d93f7fcb4b..298664e2c9 100644 --- a/backends/qualcomm/utils/utils.py +++ b/backends/qualcomm/utils/utils.py @@ -69,6 +69,7 @@ ) from executorch.backends.qualcomm.utils.constants import ( QCOM_PASS_EXPAND_BROADCAST_SHAPE, + QCOM_PASS_SKIP_ADVANCED_REQUANT, QCOM_QNN_COMPILE_SPEC, ) @@ -305,7 +306,9 @@ def _transform( ConvertBmmToMatmul()(graph_module) ConvertInterpolateWithUpsample2D()(graph_module) I64toI32(edge_program)(graph_module) - AnnotateQuantAttrs(edge_program)(graph_module) + AnnotateQuantAttrs( + edge_program, QCOM_PASS_SKIP_ADVANCED_REQUANT in custom_pass_config + )(graph_module) AnnotateAndQuantScalar(edge_program)(graph_module) AnnotateDecomposed(edge_program)(graph_module) FoldQDQ()(graph_module) diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS index 9e754b25a6..812c39c2b6 100644 --- a/backends/vulkan/_passes/TARGETS +++ b/backends/vulkan/_passes/TARGETS @@ -41,6 
+41,20 @@ runtime.python_library( ], ) +runtime.python_library( + name = "int4_weight_only_quantizer", + srcs = [ + "int4_weight_only_quantizer.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + ":custom_ops_defs", + "//pytorch/ao:torchao", + ] +) + runtime.python_library( name = "vulkan_passes", srcs = [ @@ -48,8 +62,10 @@ runtime.python_library( ], visibility = [ "//executorch/backends/...", + "//executorch/examples/...", ], deps = [ + ":int4_weight_only_quantizer", ":remove_local_scalar_dense", ] ) diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py index bded91094e..080df83608 100644 --- a/backends/vulkan/_passes/__init__.py +++ b/backends/vulkan/_passes/__init__.py @@ -1,7 +1,11 @@ +from executorch.backends.vulkan._passes.int4_weight_only_quantizer import ( + VkInt4WeightOnlyQuantizer, +) from executorch.backends.vulkan._passes.remove_local_scalar_dense_ops import ( RemoveLocalScalarDenseOpsTransform, ) __all__ = [ + "VkInt4WeightOnlyQuantizer", "RemoveLocalScalarDenseOpsTransform", ] diff --git a/backends/vulkan/_passes/custom_ops_defs.py b/backends/vulkan/_passes/custom_ops_defs.py index fd586b665a..2c16a331c0 100644 --- a/backends/vulkan/_passes/custom_ops_defs.py +++ b/backends/vulkan/_passes/custom_ops_defs.py @@ -9,6 +9,10 @@ namespace = "et_vk" lib = torch.library.Library(namespace, "DEF") +##################### +## conv_with_clamp ## +##################### + def conv_with_clamp_impl( input, @@ -47,6 +51,10 @@ def conv_with_clamp_impl( lib.impl(name, conv_with_clamp_impl, "CompositeExplicitAutograd") conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) +######################### +## conv_with_clamp.out ## +######################### + def conv_with_clamp_out_impl( input, @@ -84,6 +92,10 @@ def conv_with_clamp_out_impl( ) lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") +################# +## grid_priors ## +################# + # The dimension of x should be larger than 1 def grid_priors_impl( @@ -125,3 +137,35 @@ def grid_priors_out_impl( f"{name}(Tensor self, int stride, float offset, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.impl(name, grid_priors_out_impl, "CompositeExplicitAutograd") + +######################## +## linear_weight_int4 ## +######################## + + +def linear_weight_int4_impl( + x: torch.Tensor, + weights_4x8: torch.Tensor, + groupsize: int, + scales_and_zeros: torch.Tensor, + inner_k_tiles: int, +): + original_x_size = x.size() + out_features = weights_4x8.size(0) + x = x.reshape(-1, original_x_size[-1]) + weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( + weights_4x8, inner_k_tiles + ) + out = torch.ops.aten._weight_int4pack_mm( + x, weight_int4pack, groupsize, scales_and_zeros + ) + out_shape = original_x_size[:-1] + (out_features,) + return out.reshape(out_shape) + + +name = "linear_weight_int4" +lib.define( + f"{name}(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros, int inner_k_tiles) -> Tensor" +) +lib.impl(name, linear_weight_int4_impl, "CompositeExplicitAutograd") +linear_weight_int4_op = getattr(getattr(torch.ops, namespace), name) diff --git a/backends/vulkan/_passes/int4_weight_only_quantizer.py b/backends/vulkan/_passes/int4_weight_only_quantizer.py new file mode 100644 index 0000000000..a0d208bb63 --- /dev/null +++ b/backends/vulkan/_passes/int4_weight_only_quantizer.py @@ -0,0 +1,257 @@ +import logging +from typing import Any, Callable, Dict, Optional, Type + +import torch +import torch.nn.functional as F + +from executorch.backends.vulkan._passes.custom_ops_defs import ( # noqa + linear_weight_int4_op, +) + +from torchao.quantization.GPTQ import _check_linear_int4_k +from torchao.quantization.unified import Quantizer +from torchao.quantization.utils import groupwise_affine_quantize_tensor + + +# This module is copied from torchao.quantization.GPTQ.WeightOnlyInt4Linear with +# changes at the annotated lines. +class VkWeightOnlyInt4Linear(torch.nn.Module): + __constants__ = ["in_features", "out_features"] + in_features: int + out_features: int + weight: torch.Tensor + + def __init__( + self, + in_features: int, + out_features: int, + # TODO: remove dtype field, not used + bias=False, + device=None, + dtype=None, + groupsize: int = 128, + inner_k_tiles: int = 8, + precision: torch.dtype = torch.bfloat16, + scales_precision: torch.dtype = torch.bfloat16, + ) -> None: + super().__init__() + self.padding = not _check_linear_int4_k(in_features, groupsize, inner_k_tiles) + if self.padding: + from torchao.quantization.utils import find_multiple + + self.origin_in_features = in_features + in_features = find_multiple(in_features, (1024,)) + + self.in_features = in_features + self.out_features = out_features + assert not bias, "require bias=False" + self.device = device + self.groupsize = groupsize + self.inner_k_tiles = inner_k_tiles + self.precision = precision + self.scales_precision = scales_precision + + if dtype is not None: + raise ValueError("Please specify 'precision' instead of 'dtype'") + + assert out_features % 8 == 0, "require out_features % 8 == 0" + assert ( + in_features % (inner_k_tiles * 16) == 0 + ), "require in_features % (innerKTiles * 16) == 0" + # In the original implementation, the weight buffer is registered with the packed + # sizes, i.e. the result of calling the _convert_weight_to_int4pack operator. + # However, the Vulkan implementation does not expect the weights to be packed + # therefore the weight tensor is registered with the unpacked sizes instead. + # Note that in_features is divided by 2 because each `uint8` tensor element + # contains 2 4-bit packed values. 
+ self.register_buffer( + "weight", + torch.empty( + (out_features, in_features // 2), + dtype=torch.uint8, + device=device, + ), + ) + self.dtype = dtype + self.register_buffer( + "scales_and_zeros", + torch.empty( + (in_features // groupsize, out_features, 2), + dtype=self.scales_precision, + device=device, + ), + ) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.padding: + input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) + # The forward method is replaced. In the original implementation, the forward + # method is torchao.quantization.GPTQ.linear_forward_int4; here a Vulkan custom + # operator is called instead. + return torch.ops.et_vk.linear_weight_int4( + input, + self.weight, + self.groupsize, + self.scales_and_zeros, + self.inner_k_tiles, + ) + + +# This function is coped from torchao.quantization.GPTQ._replace_linear_int4 +# with small changes at the annotated locations. +def _vk_replace_linear_int4( + module: torch.nn.Module, + groupsize: int, + inner_k_tiles: Optional[int], + padding_allowed: bool, + skip_layer_func: Optional[Callable] = None, + precision: torch.dtype = torch.bfloat16, + scales_precision: torch.dtype = torch.bfloat16, + # Use custom vulkan linear layer as default + linear_class: Type[torch.nn.Module] = VkWeightOnlyInt4Linear, + copy_weights: bool = False, + # Serves the same purpose as `tensor_dim_limit` in + # executorch.backends.vulkan.partitioner.VulkanSupportedOperators + feature_limit: int = 16384, +): + for name, child in module.named_children(): + if isinstance(child, torch.nn.Linear) and ( + skip_layer_func is None or not skip_layer_func(child.weight) + ): + # Add an additional condition that the out/in features must not exceed the + # `feature_limit` argument. + if ( + _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) + or padding_allowed + ) and ( + child.out_features < feature_limit and child.in_features < feature_limit + ): + new_linear = linear_class( + child.in_features, + child.out_features, + bias=False, + device=child.weight.device, + groupsize=groupsize, + inner_k_tiles=inner_k_tiles, + precision=precision, + scales_precision=scales_precision, + ) + if copy_weights and child.weight.device != torch.device("meta"): + new_linear.weight = child.weight + setattr(module, name, new_linear) + else: + _vk_replace_linear_int4( + child, + groupsize, + inner_k_tiles, + padding_allowed, + skip_layer_func, + precision, + scales_precision, + linear_class, + copy_weights, + ) + + +# This module is copied from torchao.quantization.GPTQ.Int4WeightOnlyQuantizer +# with some changes at the annotated lines. 
+class VkInt4WeightOnlyQuantizer(Quantizer): + def __init__( + self, + groupsize: int = 256, + padding_allowed: bool = True, + inner_k_tiles: Optional[int] = 8, + device: torch.device = torch.device("cpu"), # noqa + precision: torch.dtype = torch.float32, + feature_limit: int = 16384, + ) -> None: + super().__init__() + assert inner_k_tiles in [2, 4, 8] + assert groupsize in [32, 64, 128, 256] + + self.inner_k_tiles = inner_k_tiles + self.groupsize: int = groupsize + self.padding_allowed: bool = padding_allowed + self.device: torch.device = device + self.precision: torch.dtype = precision + # Serves the same purpose as `tensor_dim_limit` in + # executorch.backends.vulkan.partitioner.VulkanSupportedOperators + self.feature_limit = feature_limit + + @torch.no_grad() + def _create_quantized_state_dict( + self, model: torch.nn.Module + ) -> Dict[str, torch.Tensor]: + cur_state_dict = model.state_dict() + for fqn, mod in model.named_modules(): + # Add additional check to make sure features do not exceed feature limit + if isinstance(mod, torch.nn.Linear) and ( + mod.out_features < self.feature_limit + and mod.in_features < self.feature_limit + ): + assert not mod.bias + out_features = mod.out_features + in_features = mod.in_features + logging.info(f"linear: {fqn}, in={in_features}, out={out_features}") + + assert ( + in_features % self.groupsize == 0 + ), f"require in_features:{in_features} % self.groupsize:{self.groupsize} == 0" + + weight = mod.weight.data + if not _check_linear_int4_k( + in_features, self.groupsize, self.inner_k_tiles + ): + if self.padding_allowed: + import torch.nn.functional as F + + from torchao.quantization.utils import find_multiple + + logging.warn( + f"warning: {fqn} is padded to satisfy in_features % 1024 == 0" + ) + padded_in_features = find_multiple(in_features, (1024,)) + weight = F.pad( + weight, pad=(0, padded_in_features - in_features) + ) + else: + logging.warn( + f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, " + + "and that groupsize and inner_k_tiles*16 evenly divide into it" + ) + continue + (w_int4x8, scales_and_zeros) = groupwise_affine_quantize_tensor( + weight, + 4, # n_bit + self.groupsize, + self.precision, # dtype for scales_and_zeros + ) + # In the original implementation, w_int4x8 is packed via calling the + # _convert_weight_to_int4pack operator before storing the weight. However + # the Vulkan implementation does not expect the weights to be packed, so + # the w_int4x8 tensor is stored as the weight instead. + cur_state_dict[f"{fqn}.weight"] = w_int4x8.to(self.device) + cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to( + self.device + ) + return cur_state_dict + + def _convert_for_runtime(self, model: torch.nn.Module) -> torch.nn.Module: + _vk_replace_linear_int4( + model, + self.groupsize, + self.inner_k_tiles, + self.padding_allowed, + skip_layer_func=None, + precision=self.precision, + scales_precision=self.precision, + ) + return model + + def quantize( + self, model: torch.nn.Module, *args: Any, **kwargs: Any + ) -> torch.nn.Module: + state_dict = self._create_quantized_state_dict(model) + model = self._convert_for_runtime(model) + model.load_state_dict(state_dict, strict=False) + return model diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md index de6c400f11..2a4faacc0c 100644 --- a/backends/vulkan/docs/android_demo.md +++ b/backends/vulkan/docs/android_demo.md @@ -57,7 +57,7 @@ partially lower the Llama model to Vulkan. 
```shell # The files will usually be downloaded to ~/.llama -python -m examples.models.llama2.export_llama \ +python -m examples.models.llama.export_llama \ --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ @@ -95,23 +95,23 @@ binary using the Android NDK toolchain. cmake --build cmake-android-out -j16 --target install) # Build LLaMA Runner library -(rm -rf cmake-android-out/examples/models/llama2 && \ - cmake examples/models/llama2 \ +(rm -rf cmake-android-out/examples/models/llama && \ + cmake examples/models/llama \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ -DANDROID_ABI=$ANDROID_ABI \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DCMAKE_INSTALL_PREFIX=cmake-android-out \ -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out/examples/models/llama2 && \ - cmake --build cmake-android-out/examples/models/llama2 -j16) + -Bcmake-android-out/examples/models/llama && \ + cmake --build cmake-android-out/examples/models/llama -j16) ``` Finally, push and run the llama runner binary on your Android device. Note that your device must have sufficient GPU memory to execute the model. ```shell -adb push cmake-android-out/examples/models/llama2/llama_main /data/local/tmp/llama_main +adb push cmake-android-out/examples/models/llama/llama_main /data/local/tmp/llama_main adb shell /data/local/tmp/llama_main \ --model_path=/data/local/tmp/vulkan_llama2.pte \ diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py index 6bc568bfdd..da50719ba3 100644 --- a/backends/vulkan/partitioner/supported_ops.py +++ b/backends/vulkan/partitioner/supported_ops.py @@ -83,6 +83,7 @@ def __contains__(self, op): exir_ops.edge.aten.mm.default, exir_ops.edge.aten.addmm.default, exir_ops.edge.aten.linear.default, + exir_ops.edge.et_vk.linear_weight_int4.default, # Reduction exir_ops.edge.aten._log_softmax.default, exir_ops.edge.aten._softmax.default, diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index dcc982add1..d3d32266d8 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -474,7 +474,7 @@ vTensor::vTensor( if (dtype == vkapi::kHalf) { VK_CHECK_COND( - api::context()->adapter_ptr()->has_16bit_storage(), + api::context()->adapter_ptr()->supports_16bit_storage_buffers(), "Half dtype is only available if the physical device supports float16 " "storage buffers!"); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 7a113c939f..bd83e60038 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -430,6 +430,16 @@ class vTensor final { return axis_map_; } + /* + * Return true if the tensor's axis map is {0, 1, 2, concat_dim}. This means + * that the width dim is mapped to the width axis of the texture, the height + * dim is mapped to the height axis of the texture, the channels dim is mapped + * to the depth axis of the texture. 
+ */ + inline bool has_standard_axis_map() const { + return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; + } + inline const std::vector& strides() const { return strides_; } diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index 6ee29d45f1..c133094dbf 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -319,21 +319,26 @@ def define_active_storage_type(storage_type: str): raise AssertionError(f"Invalid storage type: {storage_type}") -def define_required_extensions(dtype: str): +def define_required_extensions(dtypes: Union[str, List[str]]): out_str = "\n" - nbit = None - glsl_type = None - - if dtype == "half": - nbit = "16bit" - glsl_type = "float16" - if dtype == "int8": - nbit = "8bit" - glsl_type = "int8" - - if nbit is not None and glsl_type is not None: - out_str += f"#extension GL_EXT_shader_{nbit}_storage : require\n" - out_str += f"#extension GL_EXT_shader_explicit_arithmetic_types_{glsl_type} : require\n" + dtype_list = dtypes if isinstance(dtypes, list) else [dtypes] + + for dtype in dtype_list: + nbit = None + glsl_type = None + if dtype == "half": + nbit = "16bit" + glsl_type = "float16" + elif dtype == "int16" or dtype == "uint16": + nbit = "16bit" + glsl_type = "int16" + elif dtype == "int8" or dtype == "uint8": + nbit = "8bit" + glsl_type = "int8" + + if nbit is not None and glsl_type is not None: + out_str += f"#extension GL_EXT_shader_{nbit}_storage : require\n" + out_str += f"#extension GL_EXT_shader_explicit_arithmetic_types_{glsl_type} : require\n" return out_str diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index a4bb714e38..f2d971a56b 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -342,6 +342,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().axis_map_ubo(); } + inline bool has_standard_axis_map(const ValueRef idx) { + return values_.at(idx).toTensor().has_standard_axis_map(); + } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().logical_limits_ubo(); } @@ -690,6 +694,10 @@ class ComputeGraph final { // Miscellaneous Utilities // + inline bool int16_shader_types_enabled() const { + return context_->adapter_ptr()->supports_int16_shader_types(); + } + /* * Check whether the GPU supports 8 bit buffers. 
*/ diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml index 8ea4cbe561..9abd9c1dea 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml @@ -14,5 +14,6 @@ buffer_to_buffer: - VALUE: float - VALUE: int - VALUE: int8 + - VALUE: uint8 shader_variants: - NAME: buffer_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml index 825da11b24..e64e1bd260 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml @@ -14,6 +14,7 @@ no_op: - VALUE: float - VALUE: int - VALUE: int8 + - VALUE: uint8 STORAGE: - VALUE: texture3d - VALUE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl index de42f9ed99..b702a110a6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl @@ -19,117 +19,94 @@ ${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} -${define_required_extensions("int8")} +${define_required_extensions([DTYPE, "uint8", "uint16"])} +#extension GL_EXT_control_flow_attributes : require layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_mat2", "int8", "buffer")} -${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)} - -$if STORAGE == "texture3d": - ${layout_declare_ubo(4, "ivec4", "out_sizes")} - ${layout_declare_ubo(5, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(6, "ivec4", "mat2_strides")} - ${layout_declare_ubo(7, "ivec4", "scales_strides")} -$else: - ${layout_declare_ubo(4, "ivec4", "out_sizes")} - ${layout_declare_ubo(5, "ivec4", "out_strides")} - ${layout_declare_ubo(6, "ivec4", "mat1_sizes")} - ${layout_declare_ubo(7, "ivec4", "mat1_strides")} - ${layout_declare_ubo(8, "ivec4", "mat2_strides")} - ${layout_declare_ubo(9, "ivec4", "scales_strides")} +${layout_declare_tensor(B, "w", "ret", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "x", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "weights", "uint8", "buffer")} +${layout_declare_tensor(B, "r", "qparams", DTYPE, STORAGE)} +${layout_declare_ubo(B, "ivec3", "ret_limits")} +${layout_declare_ubo(B, "ivec4", "x_sizes")} +${layout_declare_ubo(B, "ivec4", "weights_strides")} +${layout_declare_ubo(B, "ivec4", "qparams_strides")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; layout(constant_id = 3) const int group_size = 1; +/* + * This shader computes a linear operator between a floating point input matrix + * x and a weights matrix that is quantized to 4 bits. + * + * The (W, H, C) shape of each tensor is: + * - x: (K, M) + * - weights: (K / 2, N) + * - The weights tensor has a data type of `uint8`. Each element in the tensor + * contains 2 4-bit values packed into a uint8. + * - qparams: (2, N, number_of_groups) + * - This tensor contains the scales and zeros quantization parameters for the + * weights tensor. The weight tensor is quantized group-wise, which means + * that every `group_size` elements along the K dimension of the weights + * tensor has independent quantization parameters. 
Along the width dim, the + * first value contains the scale for the group and the second value + * contains the zero point for the group. + * + * Note that this shader assumes that all tensors are width packed. + */ void main() { - - const ivec4 out_pos = ivec4( - gl_GlobalInvocationID.x, // n = 0..N-1 - gl_GlobalInvocationID.y, // m = 0..M-1 - gl_GlobalInvocationID.z % out_sizes.z, - gl_GlobalInvocationID.z / out_sizes.z); - - if (any(greaterThanEqual(out_pos, out_sizes))) { - return; + // output positions being calculated are (n, m), (n + 1, m), ... + // This means multiplying the m-th row of x with the n-th, (n+1)-th, ... rows + // of the weights tensor. + const u16vec3 ret_pos = u16vec3(gl_GlobalInvocationID); + if (any(greaterThanEqual(ret_pos, ret_limits))) { + return; + } + + // Since ret is width packed, need to multiply by 4 + const uint16_t n = uint16_t(ret_pos.x * 4); + + // K is guaranteed to be a multiple of group size + const uint16_t num_blocks = uint16_t(x_sizes.x / group_size); + + uint16_t k_texel_i = uint16_t(0); + vec4 sums = vec4(0.0); + for (uint16_t block_idx = uint16_t(0); block_idx < num_blocks; block_idx++) { + vec4 scales; + vec4 zeros; + + [[unroll]] for (int comp = 0; comp < 4; comp++) { + const vec4 scale_and_zero = load_texel( + qparams, u16vec3(0, n + comp, block_idx)); + scales[comp] = scale_and_zero.x; + zeros[comp] = scale_and_zero.y; } - const uint K = mat1_sizes.x; - const uint n = out_pos.x; - const uint m = out_pos.y; - const uint mask = uint(0x0f); - - float rc = 0.0; - int k = 0; - const uint k_block = (K + group_size - 1) / group_size; - - #ifdef USING_BUFFER - ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w); - ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); - ivec4 scale_pos = ivec4(0, n, 0, out_pos.w); - ivec4 zero_pos = ivec4(0, n, 1, out_pos.w); - - for (int kb = 0; kb < k_block; kb++) { - scale_pos.x = kb; - const int scale_bufi = tidx_to_bufi(scale_pos, scales_strides); - const float scale = float(t_scales_and_zeros[scale_bufi]); - - zero_pos.x = kb; - const int zero_bufi = tidx_to_bufi(zero_pos, scales_strides); - const float zero = float(t_scales_and_zeros[zero_bufi]) - scale * 8.0; - - for(uint idx = 0; idx < group_size && k < K; idx++, k++) { - mat1_pos.x = k; - const int mat1_bufi = tidx_to_bufi(mat1_pos, mat1_strides); - const float mat1_val = float(t_mat1[mat1_bufi]); - - mat2_pos.x = k / 2; - const int mat2_bufi = tidx_to_bufi(mat2_pos, mat2_strides); - // Bitwise op treats sign bit from int8 as a value bit instead, - // since there is no uint8_t datatype - uint mat2_val = (t_mat2[mat2_bufi] & 0xFF); - mat2_val = (k & 1) == 0 ? 
mat2_val & mask : (mat2_val >> 4); - - rc += mat1_val * (scale * float(mat2_val) + zero); - } - } - - const int out_bufi = tidx_to_bufi(out_pos, out_strides); - t_out[out_bufi] = FLOAT_T(rc); - - #else // Using texture - ivec3 mat1_pos = ivec3(0, m, out_pos.z); - ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w); - ivec3 scale_zero_pos = ivec3(0, n, 0); - uint K_texel = K / FOUR; - - for (int kb = 0; kb < k_block; kb++) { - scale_zero_pos.x = kb; - const vec4 scale_zero = load_texel(t_scales_and_zeros, scale_zero_pos); - const float scale = scale_zero.x; - const float zero = scale_zero.y - scale * 8.0; - - for(uint idx = 0; idx < group_size && k < K_texel; idx += FOUR, k++) { - mat1_pos.x = k; - const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos); - - mat2_pos.x = k * 2; // k * FOUR / 2 - const int mat2_id = tidx_to_bufi(mat2_pos, mat2_strides); - - for (int texel_pos = 0; texel_pos < FOUR; texel_pos++) { - // Bitwise op treats sign bit from int8 as a value bit instead, - // since there is no uint8_t datatype - uint mat2_val = (t_mat2[mat2_id + texel_pos / 2] & 0xFF); - mat2_val = (texel_pos & 1) == 0 ? mat2_val & mask : (mat2_val >> 4); - rc += mat1_tex[texel_pos] * (scale * float(mat2_val) + zero); - } - } + for (uint16_t i = uint16_t(0); i < group_size; i += uint16_t(4), k_texel_i++) { + const VEC4_T x_texel = load_texel( + x, u16vec3(k_texel_i, ret_pos.y, ret_pos.z)); + + [[unroll]] for (int comp = 0; comp < 4; comp++) { + const int weights_bufi = (n + comp) * weights_strides.y + (k_texel_i * 2); + // Need to read 4 unpacked values, which corresponds to 2 packed values + const uint8_t weights_val_1 = weights[weights_bufi]; + const uint8_t weights_val_2 = weights[weights_bufi + 1]; + + const u8vec4 weights_texel = u8vec4( + (weights_val_1 & 0xF0) >> 4, + weights_val_1 & 0x0F, + (weights_val_2 & 0xF0) >> 4, + weights_val_2 & 0x0F); + + // Note that the unpacked 4-bit values are unsigned, therefore they must + // first be "centered" around 0 by subtracting 8 before applying the + // scale and zero point. 
+ sums[comp] += dot( + x_texel, (vec4(weights_texel) - 8.0) * scales[comp] + zeros[comp]); } - write_texel(t_out, out_pos.xyz, vec4(rc, 0, 0, 0)); - - #endif + } + } + write_texel(ret, ret_pos, sums); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml index fd65068080..40d95d4a05 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml @@ -7,13 +7,10 @@ q_4w_linear: parameter_names_with_default_values: DTYPE: float - STORAGE: buffer + STORAGE: texture3d generate_variant_forall: DTYPE: - VALUE: float - VALUE: half - STORAGE: - - VALUE: buffer - - VALUE: texture3d shader_variants: - - NAME: q_4w_linear + - NAME: q_4w_linear_texture3d diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp index 838605f05f..4dd55be469 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp @@ -16,7 +16,7 @@ namespace vkcompute { -void check_qlinear_args( +void check_q_8w_linear_args( const ComputeGraph& graph, const ValueRef mat1, const ValueRef qmat2_data, @@ -38,7 +38,7 @@ void check_qlinear_args( utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); } -void resize_qlinear_node( +void resize_q_8w_linear_node( ComputeGraph* graph, const std::vector& args, const std::vector& extra_args) { @@ -123,7 +123,7 @@ void add_q_8w_linear_node( // Specialization Constants {}, // Resizing Logic - resize_qlinear_node)); + resize_q_8w_linear_node)); if (!graph.is_buffer_storage(out) && graph.packed_dim_of(out) != WHCN::kWidthDim) { viewFn(graph, {out_W_packed, graph.add_none(), out}); @@ -133,12 +133,138 @@ void add_q_8w_linear_node( void weight_int8pack_mm( ComputeGraph& graph, const std::vector& args) { - check_qlinear_args(graph, args[0], args[1], args[2], args[3]); + check_q_8w_linear_args(graph, args[0], args[1], args[2], args[3]); return add_q_8w_linear_node(graph, args[0], args[1], args[2], args[3]); } +void check_q_4w_linear_args( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros, + const ValueRef out) { + VK_CHECK_COND(graph.int16_shader_types_enabled()); + VK_CHECK_COND(graph.int8_buffers_enabled()); + + VK_CHECK_COND(graph.val_is_tensor(mat1)); + VK_CHECK_COND(graph.val_is_tref(mat2_data)); + VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); + + VK_CHECK_COND(graph.dim_of(mat1) <= 3); + VK_CHECK_COND(graph.dim_of(mat2_data) == 2); + VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); + + VK_CHECK_COND(graph.size_at(-3, mat1) == 1); + const int K = graph.size_at(-1, mat1); + VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); + + const int group_size_val = graph.extract_scalar(group_size); + VK_CHECK_COND(K % group_size_val == 0); + + VK_CHECK_COND(graph.packed_dim_of(mat1) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kWidthDim); + + VK_CHECK_COND(graph.has_standard_axis_map(mat1)); + VK_CHECK_COND(graph.has_standard_axis_map(out)); +} + +void resize_q_4w_linear_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + (void)extra_args; + + vTensorPtr out = graph->get_tensor(args[0].refs[0]); + vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); + vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); + + const int out_cols = utils::val_at(-2, 
mat1->sizes()); + const int out_rows = utils::val_at(-2, mat2->sizes()); + + std::vector new_out_sizes(3); + if (mat1->sizes().size() == 2) { + new_out_sizes.resize(2); + new_out_sizes.at(0) = out_cols; + new_out_sizes.at(1) = out_rows; + } else { + new_out_sizes.at(0) = mat1->sizes().at(0); + new_out_sizes.at(1) = out_cols; + new_out_sizes.at(2) = out_rows; + } + + out->virtual_resize(new_out_sizes); +} + +void add_q_4w_linear_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2_data, + const ValueRef group_size, + const ValueRef scales_and_zeros_data, + const ValueRef out) { + check_q_4w_linear_args( + graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); + + utils::StorageType storage_type = graph.storage_type_of(out); + + ValueRef mat2 = + prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); + + ValueRef scales_and_zeros = + prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); + + std::string kernel_name = "q_4w_linear"; + add_storage_type_suffix(kernel_name, storage_type); + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + const uint32_t group_size_val = graph.extract_scalar(group_size); + + vkapi::ParamsBindList ubos({}); + ubos.append(graph.logical_limits_ubo(out)); + ubos.append(graph.sizes_ubo(mat1)); + ubos.append(graph.strides_ubo(mat2)); + ubos.append(graph.strides_ubo(scales_and_zeros)); + + utils::uvec3 global_wg_size = graph.logical_limits_of(out); + utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size); + + graph.execute_nodes().emplace_back(new DispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + global_wg_size, + local_wg_size, + // Inputs and Outputs + {{out, vkapi::MemoryAccessType::WRITE}, + {{mat1, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}}, + // Shader params buffers + ubos, + // Specialization Constants + {SV(group_size_val)}, + // Resizing Logic + resize_q_4w_linear_node, + {})); +} + +void linear_weight_int4( + ComputeGraph& graph, + const std::vector& args) { + return add_q_4w_linear_node( + graph, + args[0], // mat1 + args[1], // mat2 + args[2], // group_size + args[3], // scales_and_zeros + // There is an unused variable inner_k_tiles which is used to call + // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th + // argument is skipped. + args[5] // out + ); +} + REGISTER_OPERATORS { VK_REGISTER_OP(aten._weight_int8pack_mm.default, weight_int8pack_mm); + VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); } } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp deleted file mode 100644 index 17291d292a..0000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include - -namespace vkcompute { - -void check_q_matmul_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size_data, - const ValueRef scales_and_zeros, - const ValueRef out) { - const std::vector mat1_sizes = graph.sizes_of(mat1); - const std::vector mat2_sizes = graph.sizes_of(mat2_data); - const std::vector scales_and_zeros_sizes = - graph.sizes_of(scales_and_zeros); - - const uint32_t group_size = graph.extract_scalar(group_size_data); - - VK_CHECK_COND(mat1_sizes.size() == 2); - VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - - using namespace WHCN; - VK_CHECK_COND(graph.packed_dim_of(mat1) == kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(mat2_data) == kWidthDim); - // VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); - - if (graph.storage_type_of(scales_and_zeros) == utils::kBuffer) { - VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim); - } else { - VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kChannelsDim); - } - - if (graph.storage_type_of(out) == utils::kBuffer) { - VK_CHECK_COND(graph.packed_dim_of(out) == kWidthDim); - } else { - VK_CHECK_COND(graph.packed_dim_of(out) == kChannelsDim); - } - - const int mat1_K = utils::val_at(-1, mat1_sizes); - const int mat2_K = utils::val_at(-1, mat2_sizes) * 2; - const int N = utils::val_at(-2, mat2_sizes); - - VK_CHECK_COND(mat1_K == mat2_K); - - VK_CHECK_COND(mat2_K % group_size == 0); - - const uint32_t k_groups = mat2_K / group_size; - - VK_CHECK_COND(scales_and_zeros_sizes.size() == 3); - VK_CHECK_COND(utils::val_at(-1, scales_and_zeros_sizes) == k_groups); - VK_CHECK_COND(utils::val_at(-2, scales_and_zeros_sizes) == N); - VK_CHECK_COND(utils::val_at(-3, scales_and_zeros_sizes) == 2); - - // Match https://fburl.com/code/6ostkknm - std::vector valid_group_sizes = {32, 64, 128, 256}; - - bool is_valid_group_size = false; - for (auto valid_group_size : valid_group_sizes) { - if (group_size == valid_group_size) { - is_valid_group_size = true; - break; - } - } - - VK_CHECK_COND(is_valid_group_size); -} - -void resize_q_matmul_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - vTensorPtr out = graph->get_tensor(args[0].refs[0]); - vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]); - vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]); - - const int out_cols = utils::val_at(-2, mat1->sizes()); - const int out_rows = utils::val_at(-2, mat2->sizes()); - - std::vector new_out_sizes(3); - if (mat1->sizes().size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1->sizes().at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - out->virtual_resize(new_out_sizes); -} - -void add_q_matmul_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros_data, - const ValueRef out) { - auto storage_type = graph.storage_type_of(out); - - ValueRef mat2 = - prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked); - - ValueRef scales_and_zeros = - prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked); - - std::string kernel_name = "q_4w_linear"; - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, storage_type); - - const uint32_t group_size_val = graph.extract_scalar(group_size); - 
- vkapi::ParamsBindList ubos({}); - if (storage_type == utils::kBuffer) { - ubos.append(graph.sizes_ubo(out)); - ubos.append(graph.strides_ubo(out)); - ubos.append(graph.sizes_ubo(mat1)); - ubos.append(graph.strides_ubo(mat1)); - ubos.append(graph.strides_ubo(mat2)); - ubos.append(graph.strides_ubo(scales_and_zeros)); - } else { - ubos.append(graph.sizes_ubo(out)); - ubos.append(graph.sizes_ubo(mat1)); - ubos.append(graph.strides_ubo(mat2)); - ubos.append(graph.strides_ubo(scales_and_zeros)); - } - - auto out_sizes = graph.sizes_of(out); - uint32_t N = utils::val_at(-1, out_sizes); - uint32_t M = utils::val_at(-2, out_sizes); - - utils::uvec3 global_wg_size = {N, M, 1}; - - utils::uvec3 local_wg_size = adaptive_work_group_size(global_wg_size); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - local_wg_size, - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {{mat1, mat2, scales_and_zeros}, vkapi::MemoryAccessType::READ}}, - // Shader params buffers - ubos, - // Specialization Constants - {SV(group_size_val)}, - // Resizing Logic - resize_q_matmul_node, - {})); -} - -void int4pack_mm(ComputeGraph& graph, const std::vector& args) { - check_q_matmul_args(graph, args[0], args[1], args[2], args[3], args[4]); - return add_q_matmul_node( - graph, - args[0], // mat1 - args[1], // mat2 - args[2], // group_size - args[3], // scales_and_zeros - args[4] // out - ); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._weight_int4pack_mm.default, int4pack_mm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index d634947a51..4a709fce99 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -147,7 +148,9 @@ ValueRef prepack_buffer( const utils::GPUMemoryLayout layout) { ValueRef v = graph.add_tensor_like(vref, utils::kBuffer, layout); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR("buffer_to_buffer"); + std::string kernel_name = "buffer_to_buffer"; + add_dtype_suffix(kernel_name, graph.dtype_of(vref)); + vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); vkapi::ParamsBindList ubos; ubos.append({graph.numel_ubo(v)}); diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index f03f06e1f4..545f59502e 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -155,30 +155,34 @@ class Adapter final { // Physical Device Features - inline bool has_16bit_storage() { + inline bool supports_16bit_storage_buffers() { return physical_device_.shader_16bit_storage.storageBuffer16BitAccess == VK_TRUE; } - inline bool has_8bit_storage() { + inline bool supports_8bit_storage_buffers() { return physical_device_.shader_8bit_storage.storageBuffer8BitAccess == VK_TRUE; } - inline bool has_16bit_compute() { + inline bool supports_float16_shader_types() { return physical_device_.shader_float16_int8_types.shaderFloat16 == VK_TRUE; } - inline bool has_8bit_compute() { + inline bool supports_int8_shader_types() { return physical_device_.shader_float16_int8_types.shaderInt8 == VK_TRUE; } + inline bool supports_int16_shader_types() { + return physical_device_.supports_int16_shader_types; + } + inline bool has_full_float16_buffers_support() { - return has_16bit_storage() && has_16bit_compute(); + return 
supports_16bit_storage_buffers() && supports_float16_shader_types(); } inline bool has_full_int8_buffers_support() { - return has_8bit_storage() && has_8bit_compute(); + return supports_8bit_storage_buffers() && supports_int8_shader_types(); } // Command Buffer Submission diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index 46e534f09f..08d4565dba 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -30,6 +30,7 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR}, queue_families{}, num_compute_queues(0), + supports_int16_shader_types(false), has_unified_memory(false), has_timestamps(properties.limits.timestampComputeAndGraphics), timestamp_period(properties.limits.timestampPeriod), @@ -49,6 +50,10 @@ PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) vkGetPhysicalDeviceFeatures2(handle, &features2); + if (features2.features.shaderInt16 == VK_TRUE) { + supports_int16_shader_types = true; + } + // Check if there are any memory types have both the HOST_VISIBLE and the // DEVICE_LOCAL property flags const VkMemoryPropertyFlags unified_memory_flags = diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h index 9f4b83540e..6d6e28857a 100644 --- a/backends/vulkan/runtime/vk_api/Device.h +++ b/backends/vulkan/runtime/vk_api/Device.h @@ -35,6 +35,7 @@ struct PhysicalDevice final { // Metadata uint32_t num_compute_queues; + bool supports_int16_shader_types; bool has_unified_memory; bool has_timestamps; float timestamp_period; diff --git a/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp new file mode 100644 index 0000000000..63ebb96cfa --- /dev/null +++ b/backends/vulkan/test/op_tests/linear_weight_int4_test.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +// +// Reference Implementations +// + +at::Tensor linear_weight_int4_reference_impl( + const at::Tensor& x, + const at::Tensor& weights_4x2, + const int64_t groupsize, + const at::Tensor& scales_and_zeros, + const int64_t inner_k_tiles) { + const std::vector original_x_size(x.sizes().vec()); + const size_t ndim = original_x_size.size(); + const int64_t out_features = weights_4x2.size(0); + const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); + const at::Tensor packed_weights = + at::_convert_weight_to_int4pack(weights_4x2, inner_k_tiles); + at::Tensor out = at::_weight_int4pack_mm( + x_flattened, packed_weights, groupsize, scales_and_zeros); + std::vector out_shape( + original_x_size.begin(), original_x_size.end()); + out_shape.at(ndim - 1) = out_features; + return out.reshape(out_shape); +} + +at::Tensor dequantize_and_linear( + const at::Tensor& x, + const at::Tensor& weights_4x2, + const int64_t groupsize, + const at::Tensor& scales_and_zeros, + const int64_t inner_k_tiles) { + std::vector weights_shape(weights_4x2.sizes().vec()); + weights_shape[1] *= 2; + + at::Tensor weights_dequantized = + at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); + + const int64_t N = weights_dequantized.size(0); + const int64_t K = weights_dequantized.size(1); + + const int k_groups = K / groupsize; + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k += 2) { + const int group_idx = k / groupsize; + // const int scale_idx = k_groups * n + group_idx; + const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); + const uint8_t second_val = packed_val & 0x0F; + const uint8_t first_val = (packed_val & 0xF0) >> 4; + + const float scale = scales_and_zeros[group_idx][n][0].item().to(); + const float zero = scales_and_zeros[group_idx][n][1].item().to(); + + weights_dequantized[n][k] = (float(first_val) - 8.0) * scale + zero; + weights_dequantized[n][k + 1] = (float(second_val) - 8.0) * scale + zero; + } + } + + return at::linear(x, weights_dequantized); +} + +// +// Test functions +// + +void test_reference_linear_int4( + const int B, + const int M, + const int K, + const int N, + const int group_size = 32, + const int inner_k_tiles = 8) { + assert(K % group_size == 0); + + at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); + at::Tensor weights_4x2 = + at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); + + const int k_groups = K / group_size; + at::Tensor scales_and_zeros = + at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); + + at::Tensor out = linear_weight_int4_reference_impl( + x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); + + at::Tensor out_ref = dequantize_and_linear( + x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); + + ASSERT_TRUE(at::allclose(out, out_ref)); +} + +vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { + using namespace vkcompute; + switch (at_scalartype) { + case c10::kFloat: + return vkapi::kFloat; + case c10::kHalf: + return vkapi::kHalf; + case c10::kInt: + return vkapi::kInt; + case c10::kLong: + return vkapi::kInt; + case c10::kChar: + return vkapi::kChar; + case c10::kByte: + return vkapi::kByte; + default: + VK_THROW("Unsupported at::ScalarType!"); + } +} + +void test_vulkan_linear_int4( + const int B, + const int M, + const int K, + const int N, + const int group_size = 32, + const int inner_k_tiles = 8) { + assert(K % group_size == 0); + + 
at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); + at::Tensor weights_4x2 = + at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); + + const int k_groups = K / group_size; + at::Tensor scales_and_zeros = + at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); + + at::Tensor out_ref = dequantize_and_linear( + x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); + + // Build Vulkan graph + using namespace vkcompute; + + GraphConfig config; + config.set_storage_type_override(utils::kTexture3D); + ComputeGraph graph(config); + +#define MAKE_TENSORREF_FOR(x) \ + ValueRef r_##x = graph.add_tensorref( \ + x.sizes().vec(), \ + from_at_scalartype(x.scalar_type()), \ + x.const_data_ptr()); + + MAKE_TENSORREF_FOR(weights_4x2); + MAKE_TENSORREF_FOR(scales_and_zeros); + +#define MAKE_INPUT_FOR(x) \ + IOValueRef r_##x = graph.add_input_tensor( \ + x.sizes().vec(), from_at_scalartype(x.scalar_type())); + + MAKE_INPUT_FOR(x); + + const ValueRef r_out = graph.add_tensor( + out_ref.sizes().vec(), from_at_scalartype(out_ref.scalar_type())); + + VK_GET_OP_FN("et_vk.linear_weight_int4.default") + (graph, + {r_x.value, + r_weights_4x2, + graph.add_scalar(group_size), + r_scales_and_zeros, + kDummyValueRef, + r_out}); + + ValueRef staging_out = graph.set_output_tensor(r_out); + + graph.prepare(); + graph.encode_prepack(); + graph.prepack(); + graph.encode_execute(); + + // + // Run model + // + + graph.propagate_resize(); + graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); + + graph.execute(); + + at::Tensor vk_out = at::empty_like(out_ref); + graph.copy_from_staging( + staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); + + ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); +} + +TEST(VulkanInt4LinearTest, test_reference_impl) { + test_reference_linear_int4( + /*B = */ 1, + /*M = */ 4, + /*K = */ 128, + /*N = */ 32); +} + +TEST(VulkanInt4LinearTest, test_vulkan_impl) { + if (!vkcompute::api::context() + ->adapter_ptr() + ->has_full_int8_buffers_support()) { + GTEST_SKIP(); + } + test_vulkan_linear_int4( + /*B = */ 1, + /*M = */ 4, + /*K = */ 128, + /*N = */ 32); +} diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl index 3acf1debe5..270e1b768a 100644 --- a/backends/vulkan/test/op_tests/targets.bzl +++ b/backends/vulkan/test/op_tests/targets.bzl @@ -186,3 +186,41 @@ def define_common_targets(is_fbcode = False): runtime.external_dep_location("libtorch"), ], ) + + runtime.cxx_binary( + name = "linear_weight_int4_test_bin", + srcs = [ + "linear_weight_int4_test.cpp", + ], + compiler_flags = [ + "-Wno-unused-variable", + ], + define_static_target = False, + deps = [ + "//third-party/googletest:gtest_main", + "//executorch/backends/vulkan:vulkan_graph_runtime", + runtime.external_dep_location("libtorch"), + ], + ) + + runtime.cxx_test( + name = "linear_weight_int4_test", + srcs = [ + "linear_weight_int4_test.cpp", + ], + contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], + fbandroid_additional_loaded_sonames = [ + "torch-code-gen", + "vulkan_graph_runtime", + "vulkan_graph_runtime_shaderlib", + ], + platforms = [ANDROID], + use_instrumentation_test = True, + deps = [ + "//third-party/googletest:gtest_main", + "//executorch/backends/vulkan:vulkan_graph_runtime", + "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", + "//executorch/extension/tensor:tensor", + runtime.external_dep_location("libtorch"), + ], + ) diff --git 
a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index ca8558fe0e..694eeebece 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -2379,7 +2379,8 @@ void run_from_gpu_test( utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, vkapi::ScalarType dtype = vkapi::kFloat, utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && !context()->adapter_ptr()->has_16bit_storage()) { + if (dtype == vkapi::kHalf && + !context()->adapter_ptr()->supports_16bit_storage_buffers()) { return; } vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); @@ -2433,7 +2434,8 @@ void round_trip_test( utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, vkapi::ScalarType dtype = vkapi::kFloat, utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && !context()->adapter_ptr()->has_16bit_storage()) { + if (dtype == vkapi::kHalf && + !context()->adapter_ptr()->supports_16bit_storage_buffers()) { return; } @@ -2484,7 +2486,8 @@ void compute_graph_round_trip_test( utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, vkapi::ScalarType dtype = vkapi::kFloat, utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && !context()->adapter_ptr()->has_16bit_storage()) { + if (dtype == vkapi::kHalf && + !context()->adapter_ptr()->supports_16bit_storage_buffers()) { return; } @@ -3026,142 +3029,6 @@ TEST(VulkanComputeGraphOpsTest, grid_priors_test) { /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); } -void test_int4pack_mm( - std::vector MKN, - uint32_t group_size, - utils::StorageType storage_type) { - GraphConfig config; - ComputeGraph graph(config); - - const uint32_t M = MKN[0]; - const uint32_t K = MKN[1]; - const uint32_t N = MKN[2]; - - const std::vector mat1_size = {M, K}; - const std::vector mat2_size = {K, N}; - const std::vector mat2_q_size = {N, K / 2}; // Transposed and packed - const std::vector out_size = {M, N}; - - std::vector A_data = create_random_float_buffer(M * K); - IOValueRef A = graph.add_input_tensor(mat1_size, vkapi::kFloat, storage_type); - graph.copy_into_staging(A.staging, A_data.data(), A_data.size()); - - // Quantized but un-packed weights - std::vector B_quant_data = create_random_uint8_buffer(K * N, 0, 16); - - // Pack and transpose weights to correspond to int4 weight format - std::vector B_int4_data = - int4mm_pack_weights(mat2_size, B_quant_data.data()); - - IOValueRef B_int4 = - graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer); - graph.copy_into_staging( - B_int4.staging, B_int4_data.data(), B_int4_data.size()); - - const int k_groups = K / group_size; - - // Random scales and zeroes. 
Keep scales small to avoid overflow and zeroes in - // int4 range - IOValueRef scales_and_zeros; - - if (storage_type == utils::kBuffer) { - scales_and_zeros.value = graph.add_tensor( - {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kWidthPacked); - } else { - scales_and_zeros.value = graph.add_tensor( - {2, N, k_groups}, vkapi::kFloat, storage_type, utils::kChannelsPacked); - } - - scales_and_zeros.staging = graph.set_input_tensor(scales_and_zeros.value); - - std::vector s_data(graph.numel_of(scales_and_zeros.value)); - const int zeros_stride = s_data.size() / 2; - for (size_t i = 0; i < zeros_stride; i++) { - s_data[i] = rand() % 100; - s_data[i + zeros_stride] = rand() % 16; - } - - graph.copy_into_staging( - scales_and_zeros.staging, s_data.data(), s_data.size()); - - IOValueRef out_int4; - - if (storage_type == utils::kBuffer) { - out_int4.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kBuffer); - } else { - out_int4.value = - graph.add_tensor(out_size, vkapi::kFloat, utils::kChannelsPacked); - } - - VK_GET_OP_FN("aten._weight_int4pack_mm.default") - (graph, - {A.value, - B_int4.value, - graph.add_scalar(group_size), - scales_and_zeros.value, - out_int4.value}); - - out_int4.staging = graph.set_output_tensor(out_int4.value); - - // Dequantized matmul for comparison - IOValueRef B_deq = - graph.add_input_tensor(mat2_size, vkapi::kFloat, storage_type); - std::vector B_deq_data = int4mm_dequantize_weights( - mat2_size, B_quant_data.data(), group_size, s_data.data()); - graph.copy_into_staging(B_deq.staging, B_deq_data.data(), B_deq_data.size()); - - IOValueRef out_deq; - out_deq.value = graph.add_tensor(out_size, vkapi::kFloat, storage_type); - - VK_GET_OP_FN("aten.mm.default") - (graph, {A.value, B_deq.value, out_deq.value}); - - out_deq.staging = graph.set_output_tensor(out_deq.value); - - graph.prepare(); - graph.encode_prepack(); - graph.prepack(); - graph.encode_execute(); - graph.propagate_resize(); - graph.execute(); - - // Compare outputs - std::vector out_int4_data(graph.numel_of(out_int4.value)); - graph.copy_from_staging( - out_int4.staging, out_int4_data.data(), out_int4_data.size()); - - std::vector out_deq_data(graph.numel_of(out_deq.value)); - graph.copy_from_staging( - out_deq.staging, out_deq_data.data(), out_deq_data.size()); - - for (int i = 0; i < out_int4_data.size(); i++) { - EXPECT_TRUE(check_close(out_int4_data[i], out_deq_data[i])); - } -} - -TEST(VulkanComputeGraphOpsTest, int4pack_mm_test) { - if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - - for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { - // Vector multiplication, single group per row - test_int4pack_mm({1, 32, 1}, 32, storage_type); - - // Vector multiplication, multiple groups per row - test_int4pack_mm({1, 256, 1}, 64, storage_type); - - // Square matrices, single group per row - test_int4pack_mm({32, 32, 32}, 32, storage_type); - - // Irregular matrices, single group per row - test_int4pack_mm({37, 32, 19}, 32, storage_type); - - // Irregular matrices, multiple groups per row - test_int4pack_mm({37, 256, 19}, 64, storage_type); - } -} - void test_transpose_view_mm( const int B, const int M, @@ -3355,7 +3222,7 @@ void test_to_copy() { } TEST(VulkanComputeGraphOpsTest, test_to_copy) { - if (context()->adapter_ptr()->has_16bit_storage()) { + if (context()->adapter_ptr()->supports_16bit_storage_buffers()) { test_to_copy(); } } diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 5d4eb2c7bb..002d351155 
100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -39,7 +39,7 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE # Keeping this OFF by default due to regressions in decode # and model load with kleidi kernels option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI - "Enable workspace sharing across different delegate instances" OFF) + "Enable Arm Kleidi kernels" OFF) if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE) add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE) endif() diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp index 1080da0bea..b948aa8623 100644 --- a/backends/xnnpack/runtime/XNNCompiler.cpp +++ b/backends/xnnpack/runtime/XNNCompiler.cpp @@ -56,7 +56,8 @@ using DataType = fb_xnnpack::XNNDatatype; using DefineNodeFunc = Error (*)( xnn_subgraph_t, const std::unordered_map&, - NodePtr) noexcept; + NodePtr, + const fb_xnnpack::XNNGraph*) noexcept; /* Convert a tensor from fp32 to bf16. @@ -512,6 +513,8 @@ Error defineTensor( return Error::Ok; }; +#define MAYBE_UNUSED(x) (void)(x) + /* Define serialized add node into the subgraph, using the remapped ids to map the serialized ids, to the new ids generated when defining @@ -520,7 +523,10 @@ the tensor value Error defineAddNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + std::pair min_max = getOutputMinMax(node); auto graph_node = node->xnode_union_as_XNNAdd(); xnn_status status = xnn_define_add2( @@ -547,7 +553,10 @@ Define Minimum operator Node into the subgraph Error defineMinimumNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNMinimum(); xnn_status status = xnn_define_minimum2( subgraph_ptr, @@ -572,7 +581,10 @@ Define subtract operator Node into the subgraph Error defineSubtractNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNSubtract(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_subtract( @@ -600,7 +612,10 @@ Define Multiply operator Node into the subgraph Error defineMultiplyNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNMultiply(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_multiply2( @@ -622,26 +637,83 @@ Error defineMultiplyNode( return Error::Ok; }; +#ifdef ENABLE_XNNPACK_KLEIDI +bool isQP8(const fb_xnnpack::XNNGraph* graph, const NodePtr node) { + assert(node->xnode_union_type() == fb_xnnpack::XNodeUnion::XNNConvert); + auto graph_node = node->xnode_union_as_XNNConvert(); + auto cvt_output_id = graph_node->output_id(); + + auto check_dtype = [graph](uint32_t id, DataType dtype) -> bool { + assert( + dtype == DataType::xnn_datatype_qdint8 || + dtype == DataType::xnn_datatype_qbint4); + for (auto value : *graph->xvalues()) { + if (value->xvalue_union_type() != + fb_xnnpack::XValueUnion::XNNQuantizedTensorValue) { + continue; + } + auto tensor = + 
value->xvalue_union_as_XNNQuantizedTensorValue()->tensor_value(); + if (tensor->id_out() == id) { + return tensor->datatype() == dtype; + } + } + return false; + }; + + // Check if the output tensor is qint8 else bail early. + if (!check_dtype(cvt_output_id, DataType::xnn_datatype_qdint8)) { + return false; + } + + // Find if the convert output is going to the right linear node. + // Assuming if we can find one valid linear node, then we can use QP8 + // for all the linear nodes consuming this convert output. + for (auto node : *graph->xnodes()) { + if (node->xnode_union_type() == fb_xnnpack::XNodeUnion::XNNFullyConnected) { + auto linear_node = node->xnode_union_as_XNNFullyConnected(); + if (linear_node->input1_id() == cvt_output_id) { + if (check_dtype( + linear_node->filter_id(), DataType::xnn_datatype_qbint4)) { + return true; + } + } + } + } + return false; +} +#endif // ENABLE_XNNPACK_KLEIDI + /* Define Convert operator Node into the subgraph */ Error defineConvertNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* flatbuffer_graph) noexcept { + MAYBE_UNUSED(flatbuffer_graph); auto graph_node = node->xnode_union_as_XNNConvert(); + + int32_t flags = graph_node->flags(); +#ifdef ENABLE_XNNPACK_KLEIDI +// This is not currently exposed at include/xnnpack.h yet once it is +// we can remove this runtime logic and do this ahead-of-time +#define XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM 0x00000100; + if (isQP8(flatbuffer_graph, node)) { + flags |= XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM; + ET_LOG( + Debug, + "Setting XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM flag for convert node %i", + node->debug_handle()); + } +#endif + xnn_status status = xnn_define_convert( subgraph_ptr, remapped_ids.at(graph_node->input_id()), remapped_ids.at(graph_node->output_id()), -#ifdef ENABLE_XNNPACK_KLEIDI - // This maps to XNNPACK's XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM - // however this is not currently exposed at top level - // xnnpack.h Header - 0x00000100); -#else - graph_node->flags()); -#endif + flags); ET_CHECK_OR_RETURN_ERROR( status == xnn_status_success, @@ -660,7 +732,10 @@ when defining the tensor values Error defineFullyConnectedNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNFullyConnected(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_fully_connected( @@ -690,7 +765,10 @@ the tensor value Error defineClampNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + std::pair min_max = getOutputMinMax(node); auto graph_node = node->xnode_union_as_XNNClamp(); xnn_status status = xnn_define_clamp( @@ -719,7 +797,10 @@ the tensor value Error defineSoftmaxNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNSoftmax(); xnn_status status = xnn_define_softmax( subgraph_ptr, @@ -744,7 +825,10 @@ the tensor value Error defineSigmoidNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const 
fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNSigmoid(); xnn_status status = xnn_define_sigmoid( subgraph_ptr, @@ -769,7 +853,10 @@ the tensor value Error defineFloorNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNFloor(); xnn_status status = xnn_define_floor( subgraph_ptr, @@ -789,7 +876,10 @@ Error defineFloorNode( Error defineGlobalAvgPooling2dNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNGlobalAvgPooling2d(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_global_average_pooling_2d( @@ -812,7 +902,10 @@ Error defineGlobalAvgPooling2dNode( Error defineAvgPooling2dNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNAvgPooling2d(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_average_pooling_2d( @@ -848,7 +941,10 @@ tensor value Error defineConv2dNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNConv2d(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_convolution_2d( @@ -891,7 +987,10 @@ tensor value Error defineMaxPooling2dNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNMaxPooling2d(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_max_pooling_2d( @@ -928,7 +1027,10 @@ Define serialized div node into the subgraph Error defineDivNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNDiv(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_divide( @@ -957,7 +1059,10 @@ tensor value Error defineStaticTransposeNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNStaticTranspose(); // Get tensor dims, we need to convert the uint32_t* to size_t* @@ -987,7 +1092,10 @@ the tensor value Error defineStaticResizeBilinear2DNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + const fb_xnnpack::XNNStaticResizeBilinear2D* graph_node = node->xnode_union_as_XNNStaticResizeBilinear2D(); @@ -1016,7 +1124,10 @@ the tensor value Error defineStaticConstantPadNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - 
const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + const fb_xnnpack::XNNStaticConstantPad* graph_node = node->xnode_union_as_XNNStaticConstantPad(); @@ -1051,7 +1162,10 @@ tensor value Error defineDepthwiseConv2dNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNDepthwiseConv2d(); std::pair min_max = getOutputMinMax(node); xnn_status status = xnn_define_depthwise_convolution_2d( @@ -1090,7 +1204,10 @@ Error defineDepthwiseConv2dNode( Error defineStaticReshapeNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNStaticReshape(); // Get tensor dims, we need to convert the uint32_t* to size_t* @@ -1121,7 +1238,10 @@ tensor value Error defineArgMaxPooling2dNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNArgMaxPooling2d(); xnn_status status = xnn_define_argmax_pooling_2d( @@ -1155,7 +1275,10 @@ tensor value Error defineSquareRootNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNSquareRoot(); xnn_status status = xnn_define_square_root( @@ -1182,7 +1305,10 @@ tensor value Error defineCeilingNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNCeiling(); xnn_status status = xnn_define_ceiling( @@ -1209,7 +1335,10 @@ tensor value Error defineHardswishNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNHardswish(); xnn_status status = xnn_define_hardswish( @@ -1236,7 +1365,10 @@ tensor value Error defineLeakyReLUNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNLeakyReLU(); xnn_status status = xnn_define_leaky_relu( @@ -1264,7 +1396,10 @@ tensor value Error defineMaximumNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNMaximum(); xnn_status status = xnn_define_maximum2( @@ -1291,7 +1426,10 @@ serialized ids, to the new ids generated when defining the tensor value Error defineNegateNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = 
node->xnode_union_as_XNNNegate(); xnn_status status = xnn_define_negate( @@ -1317,7 +1455,10 @@ serialized ids to the new ids generated when defining the tensor value Error defineSquareNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNSquare(); xnn_status status = xnn_define_square( @@ -1343,7 +1484,10 @@ serialized ids to the new ids generated when defining the tensor value Error defineELUNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNELU(); xnn_status status = xnn_define_elu( @@ -1370,7 +1514,10 @@ serialized ids to the new ids generated when defining the tensor value Error defineAbsNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNAbs(); xnn_status status = xnn_define_abs( @@ -1397,7 +1544,10 @@ to the new ids generated when defining the tensor value Error definePReLUNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNPReLU(); xnn_status status = xnn_define_prelu( @@ -1425,7 +1575,10 @@ to the new ids generated when defining the tensor value Error defineConcatenate2Node( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNConcatenate2(); xnn_status status = xnn_define_concatenate2( @@ -1454,7 +1607,10 @@ to the new ids generated when defining the tensor value Error defineConcatenate3Node( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNConcatenate3(); xnn_status status = xnn_define_concatenate3( @@ -1484,7 +1640,10 @@ to the new ids generated when defining the tensor value Error defineConcatenate4Node( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNConcatenate4(); xnn_status status = xnn_define_concatenate4( @@ -1515,7 +1674,10 @@ to the new ids generated when defining the tensor value Error defineStaticSliceNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNStaticSlice(); std::vector offsets = flatbufferDimsToVector(graph_node->offsets()); @@ -1548,7 +1710,10 @@ to the new ids generated when defining the tensor value Error defineScaledDotProductAttentionNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const 
NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNScaledDotProductAttention(); xnn_status status = xnn_define_scaled_dot_product_attention( @@ -1581,7 +1746,10 @@ to the new ids generated when defining the tensor value Error defineBatchMatrixMultiplyNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + auto graph_node = node->xnode_union_as_XNNBatchMatrixMultiply(); xnn_status status = xnn_define_batch_matrix_multiply( @@ -1609,7 +1777,10 @@ that has not yet been implemented Error defineNotImplementedNode( xnn_subgraph_t subgraph_ptr, const std::unordered_map& remapped_ids, - const NodePtr node) noexcept { + const NodePtr node, + const fb_xnnpack::XNNGraph* graph) noexcept { + MAYBE_UNUSED(graph); + ET_CHECK_OR_RETURN_ERROR( false, NotImplemented, @@ -1767,7 +1938,7 @@ ET_NODISCARD Error XNNCompiler::compileModel( for (auto node : *flatbuffer_graph->xnodes()) { err = getDefineNodeFunc(node->xnode_union_type())( - subgraph.get(), remapped_ids, node); + subgraph.get(), remapped_ids, node, flatbuffer_graph); if (err != Error::Ok) { return err; } diff --git a/backends/xnnpack/test/TARGETS b/backends/xnnpack/test/TARGETS index 11209e41ba..b2db8060e1 100644 --- a/backends/xnnpack/test/TARGETS +++ b/backends/xnnpack/test/TARGETS @@ -58,7 +58,7 @@ runtime.python_test( "fbsource//third-party/pypi/torchsr:torchsr", # @manual "fbsource//third-party/pypi/transformers:transformers", # @manual "//executorch/backends/xnnpack/test/tester:tester", - "//executorch/examples/models/llama2:llama2_model", + "//executorch/examples/models/llama:llama2_model", "//pytorch/audio/src:torchaudio_core", "//pytorch/vision:torchvision", # @manual ], diff --git a/backends/xnnpack/test/models/llama2_et_example.py b/backends/xnnpack/test/models/llama2_et_example.py index 6948321d53..f1dce43c3c 100644 --- a/backends/xnnpack/test/models/llama2_et_example.py +++ b/backends/xnnpack/test/models/llama2_et_example.py @@ -9,7 +9,7 @@ import torch from executorch.backends.xnnpack.test.tester import Tester -from executorch.examples.models.llama2.model import Llama2Model +from executorch.examples.models.llama.model import Llama2Model class TestLlama2ETExample(unittest.TestCase): diff --git a/build/build_android_llm_demo.sh b/build/build_android_llm_demo.sh index f015b08e61..4ad7c70c39 100644 --- a/build/build_android_llm_demo.sh +++ b/build/build_android_llm_demo.sh @@ -26,6 +26,14 @@ build_android_native_library() { EXECUTORCH_BUILD_QNN=OFF fi + NEURON_BUFFER_ALLOCATOR_LIB="${NEURON_BUFFER_ALLOCATOR_LIB:-}" + NEURON_USDK_ADAPTER_LIB="${NEURON_USDK_ADAPTER_LIB:-}" + if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ]; then + EXECUTORCH_BUILD_NEURON=ON + else + EXECUTORCH_BUILD_NEURON=OFF + fi + cmake . 
-DCMAKE_INSTALL_PREFIX="${CMAKE_OUT}" \ -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \ -DANDROID_ABI="${ANDROID_ABI}" \ @@ -89,10 +97,10 @@ build_android_native_library() { fi # Copy MTK related so library - if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then + if [ -n "$NEURON_BUFFER_ALLOCATOR_LIB" ] && [ -n "$NEURON_USDK_ADAPTER_LIB" ] && [ "$ANDROID_ABI" == "arm64-v8a" ]; then cp "${CMAKE_OUT}"/backends/mediatek/libneuron_backend.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - cp "$NEURON_BUFFER_ALLOCATOR_LIB"/libneuron_buffer_allocator.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ - cp "$NEURON_BUFFER_ALLOCATOR_LIB"/libneuronusdk_adapter.mtk.so ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp "${NEURON_BUFFER_ALLOCATOR_LIB}" ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ + cp "${NEURON_USDK_ADAPTER_LIB}" ${BUILD_AAR_DIR}/jni/${ANDROID_ABI}/ fi } @@ -107,13 +115,12 @@ build_aar() { find jni -type f -name "libexecutorch_jni.so" -exec bash -c 'mv "$1" "${1/_jni/}"' bash {} \; # Zip all necessary files into the AAR file zip -r executorch.aar libs jni/*/libexecutorch.so jni/*/libqnn*.so jni/*/libQnn*.so jni/*/libneuron_backend.so jni/*/libneuron_buffer_allocator.so jni/*/libneuronusdk_adapter.mtk.so AndroidManifest.xml - cp executorch.aar executorch-llama.aar popd } build_android_demo_apps() { mkdir -p examples/demo-apps/android/LlamaDemo/app/libs - cp ${BUILD_AAR_DIR}/executorch-llama.aar examples/demo-apps/android/LlamaDemo/app/libs + cp ${BUILD_AAR_DIR}/executorch.aar examples/demo-apps/android/LlamaDemo/app/libs pushd examples/demo-apps/android/LlamaDemo ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew build assembleAndroidTest popd diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml index 47bcf0ce4b..9d4e595da3 100644 --- a/build/cmake_deps.toml +++ b/build/cmake_deps.toml @@ -383,7 +383,7 @@ deps = [ [targets.llama_runner] buck_targets = [ - "//examples/models/llama2/runner:runner", + "//examples/models/llama/runner:runner", ] filters = [ ".cpp$", diff --git a/docs/source/android-prebuilt-library.md b/docs/source/android-prebuilt-library.md index 5a2d319893..324c63376c 100644 --- a/docs/source/android-prebuilt-library.md +++ b/docs/source/android-prebuilt-library.md @@ -23,14 +23,14 @@ To add the Java library to your app, simply download the AAR, and add it to your In your app working directory, such as example executorch/examples/demo-apps/android/LlamaDemo, ``` mkdir -p app/libs -curl https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar -o app/libs/executorch-llama.aar +curl https://ossci-android.s3.amazonaws.com/executorch/release/executorch-241002/executorch.aar -o app/libs/executorch.aar ``` And include it in gradle: ``` # app/build.grardle.kts dependencies { - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch.aar")) } ``` diff --git a/docs/source/apple-runtime.md b/docs/source/apple-runtime.md index e17ee3ea2f..dd4e1c9391 100644 --- a/docs/source/apple-runtime.md +++ b/docs/source/apple-runtime.md @@ -38,7 +38,7 @@ The prebuilt ExecuTorch runtime, backend, and kernels are available as a [Swift #### Xcode -In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the [ExecuTorch repo](https://github.com/pytorch/executorch) into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., "0.3.0", or just use the "latest" branch name for the latest stable build. 
+In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the [ExecuTorch repo](https://github.com/pytorch/executorch) into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., "0.4.0", or just use the "latest" branch name for the latest stable build. ![](_static/img/swiftpm_xcode1.png) @@ -67,7 +67,7 @@ let package = Package( ], dependencies: [ // Use "latest" branch name for the latest stable build. - .package(url: "https://github.com/pytorch/executorch.git", .branch("0.3.0")) + .package(url: "https://github.com/pytorch/executorch.git", .branch("0.4.0")) ], targets: [ .target( diff --git a/docs/source/index.rst b/docs/source/index.rst index 095489de35..cf54fa2477 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,12 +4,16 @@ Welcome to the ExecuTorch Documentation ======================================= .. important:: - This is a beta release. As of this ExecuTorch beta release, the API - will follow the `lifecycle and deprecation policy `__ - and ``.pte`` binary format will comply with the *runtime compatibility policy* (TODO: add link). - This ensures that application developers can update to the latest version of ExecuTorch - without breaking existing integration code, in accordance with these policies. - If any issues arise or compatibility breaks occur, please `report them in GitHub `__. + v0.4.0 is a beta release of ExecuTorch. As of this release, the API will + follow the `API Lifecycle and Deprecation Policy `__, + and the ``.pte`` binary format will comply with the `Runtime Compatibility + Policy + `__. + This helps ensure that application developers can update to the latest + version of ExecuTorch without breaking existing integration code, in + accordance with these policies. If any issues arise or compatibility breaks + occur, please `report them in GitHub + `__. We welcome any feedback, suggestions, and bug reports from the community to help us improve the technology. Please use the `PyTorch Forums diff --git a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md index ac95fb21bd..0157668d7f 100644 --- a/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md +++ b/docs/source/llm/build-run-llama3-qualcomm-ai-engine-direct-backend.md @@ -6,7 +6,7 @@ This tutorial demonstrates how to export Llama 3 8B Instruct for Qualcomm AI Eng - Set up your ExecuTorch repo and environment if you haven’t done so by following [the Setting up ExecuTorch](../getting-started-setup.md) to set up the repo and dev environment. - Read [the Building and Running ExecuTorch with Qualcomm AI Engine Direct Backend page](../build-run-qualcomm-ai-engine-direct-backend.md) to understand how to export and run a model with Qualcomm AI Engine Direct Backend on Qualcomm device. -- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama2) to know how to run a llama model on mobile via ExecuTorch. +- Follow [the README for executorch llama](https://github.com/pytorch/executorch/tree/main/examples/models/llama) to know how to run a llama model on mobile via ExecuTorch. - A Qualcomm device with 16GB RAM - We are continuing to optimize our memory usage to ensure compatibility with lower memory devices. - The version of [Qualcomm AI Engine Direct SDK](https://developer.qualcomm.com/software/qualcomm-ai-engine-direct-sdk) is 2.26.0 or above. 
@@ -39,7 +39,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure ```bash # Please note that calibration_data must include the prompt template for special tokens. -python -m examples.models.llama2.export_llama -t +python -m examples.models.llama.export_llama -t llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` @@ -76,9 +76,9 @@ llama3/Meta-Llama-3-8B-Instruct/tokenizer.model -p -c ${DEVICE_DIR} adb push ${DEVICE_DIR} adb push cmake-android-out/lib/libqnn_executorch_backend.so ${DEVICE_DIR} -adb push cmake-out-android/examples/models/llama2/llama_main ${DEVICE_DIR} +adb push cmake-out-android/examples/models/llama/llama_main ${DEVICE_DIR} ``` **3.4 Run model** diff --git a/docs/source/llm/getting-started.md b/docs/source/llm/getting-started.md index 4cfebbf9e6..44e7a7a33b 100644 --- a/docs/source/llm/getting-started.md +++ b/docs/source/llm/getting-started.md @@ -361,6 +361,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. +option(EXECUTORCH_ENABLE_LOGGING "" ON) option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) @@ -518,6 +519,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) # Set options for executorch build. +option(EXECUTORCH_ENABLE_LOGGING "" ON) option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON) option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON) option(EXECUTORCH_BUILD_EXTENSION_TENSOR "" ON) @@ -763,6 +765,8 @@ An ETRecord is an artifact generated at the time of export that contains model g In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_backend()` mutates the graph in-place. ``` +# export_nanogpt.py + import copy from executorch.devtools import generate_etrecord @@ -813,23 +817,24 @@ if (result.buf != nullptr && result.size > 0) { Additionally, update CMakeLists.txt to build with Developer Tools and enable events to be traced and logged into ETDump: ``` +option(EXECUTORCH_ENABLE_EVENT_TRACER "" ON) option(EXECUTORCH_BUILD_DEVTOOLS "" ON) # ... target_link_libraries( - nanogpt_runner - PRIVATE - executorch - extension_module_static # Provides the Module class - optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels - xnnpack_backend # Provides the XNNPACK CPU acceleration backend + # ... omit existing ones etdump) # Provides event tracing and logging target_compile_options(executorch PUBLIC -DET_EVENT_TRACER_ENABLED) target_compile_options(portable_ops_lib PUBLIC -DET_EVENT_TRACER_ENABLED) ``` -Run the runner, you will see “etdump.etdp” generated. +Build and run the runner, you will see a file named “etdump.etdp” is generated. (Note that this time we build in release mode to get around a flatccrt build limitation.) +```bash +(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DCMAKE_BUILD_TYPE=Release ..) 
+cmake --build cmake-out -j10 +./cmake-out/nanogpt_runner +``` #### Analyze with Inspector APIs diff --git a/docs/source/llm/llama.md b/docs/source/llm/llama.md index 2d266ba7ae..fd0e436b94 100644 --- a/docs/source/llm/llama.md +++ b/docs/source/llm/llama.md @@ -1,5 +1,5 @@ # Llama on ExecuTorch See -[Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md) +[Llama readme](https://github.com/pytorch/executorch/blob/main/examples/models/llama/README.md) for detailed information about running Llama on ExecuTorch. diff --git a/docs/source/pte-file-format.md b/docs/source/pte-file-format.md index 9917db9f82..9f5757ed25 100644 --- a/docs/source/pte-file-format.md +++ b/docs/source/pte-file-format.md @@ -31,6 +31,13 @@ Optional ─┤ ├──────────────────── └─ └───────────────────────────────────┘ ``` +## Compatibility + +See the [Runtime Compatibility Policy]( +https://github.com/pytorch/executorch/tree/main/runtime/COMPATIBILITY.md) for +details about the compatibility guarantees between the `.pte` format and the +ExecuTorch runtime. + ## Headers Program files can be recognized by the magic string at byte offset 4, beginning diff --git a/examples/README.md b/examples/README.md index 2c1093296c..17999b1542 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,7 +39,7 @@ For specific details related to models and backend, you can explore the various ### Llama Models -[This page](./models/llama2/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. +[This page](./models/llama/README.md) demonstrates how to run Llama 3.2 (1B, 3B), Llama 3.1 (8B), Llama 3 (8B), and Llama 2 7B models on mobile via ExecuTorch. We use XNNPACK, QNNPACK, MediaTek, and MPS to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on Android and iOS mobile phones. 
### Llava1.5 7B diff --git a/examples/apple/mps/executor_runner/mps_executor_runner.mm b/examples/apple/mps/executor_runner/mps_executor_runner.mm index e3d0e2978b..237645ec3f 100644 --- a/examples/apple/mps/executor_runner/mps_executor_runner.mm +++ b/examples/apple/mps/executor_runner/mps_executor_runner.mm @@ -372,7 +372,7 @@ HierarchicalAllocator planned_memory( strstr(model_path, "emformer_transcribe") || strstr(model_path, "emformer_join") || strstr(model_path, "edsr") || - strstr(model_path, "llama2") || + strstr(model_path, "llama") || strstr(model_path, "ic3") || strstr(model_path, "ic4")) { atol = 1e-04; diff --git a/examples/cadence/models/babyllama.py b/examples/cadence/models/babyllama.py index 603eb5f3d9..58a3035723 100644 --- a/examples/cadence/models/babyllama.py +++ b/examples/cadence/models/babyllama.py @@ -14,7 +14,7 @@ from executorch.backends.cadence.aot.export_example import export_model -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s" diff --git a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts index 37c8cbf0ba..ea9d4e6c17 100644 --- a/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts +++ b/examples/demo-apps/android/LlamaDemo/app/build.gradle.kts @@ -57,7 +57,7 @@ dependencies { implementation("androidx.constraintlayout:constraintlayout:2.2.0-alpha12") implementation("com.facebook.fbjni:fbjni:0.5.1") implementation("com.google.code.gson:gson:2.8.6") - implementation(files("libs/executorch-llama.aar")) + implementation(files("libs/executorch.aar")) implementation("com.google.android.material:material:1.12.0") implementation("androidx.activity:activity:1.9.0") testImplementation("junit:junit:4.13.2") diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md index cc7dee455c..573a1d199d 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md @@ -12,7 +12,7 @@ Phone verified: MediaTek Dimensity 9300 (D9300) chip. * Download and link the Buck2 build, Android NDK, and MediaTek ExecuTorch Libraries from the MediaTek Backend Readme ([link](https://github.com/pytorch/executorch/tree/main/backends/mediatek/scripts#prerequisites)). * MediaTek Dimensity 9300 (D9300) chip device * Desired Llama 3 model weights. You can download them on HuggingFace [Example](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)). -* Download NeuroPilot Express SDK from the [MediaTek NeuroPilot Portal](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress) (coming soon): +* Download NeuroPilot Express SDK from the [MediaTek NeuroPilot Portal](https://neuropilot.mediatek.com/resources/public/npexpress/en/docs/npexpress): - `libneuronusdk_adapter.mtk.so`: This universal SDK contains the implementation required for executing target-dependent code on the MediaTek chip. - `libneuron_buffer_allocator.so`: This utility library is designed for allocating DMA buffers necessary for model inference. - `mtk_converter-8.8.0.dev20240723+public.d1467db9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl`: This library preprocess the model into a MediaTek representation. 
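Since the build script above now takes the two MediaTek libraries as explicit paths via `NEURON_BUFFER_ALLOCATOR_LIB` and `NEURON_USDK_ADAPTER_LIB`, a minimal usage sketch might look like the following; the SDK install location and the exact script invocation are illustrative assumptions, not part of this patch:

```bash
# Illustrative paths to the NeuroPilot Express SDK libraries listed above.
export NEURON_BUFFER_ALLOCATOR_LIB=/opt/neuropilot/libneuron_buffer_allocator.so
export NEURON_USDK_ADAPTER_LIB=/opt/neuropilot/libneuronusdk_adapter.mtk.so

# Per the change to build_android_llm_demo.sh, EXECUTORCH_BUILD_NEURON is turned
# on when NEURON_BUFFER_ALLOCATOR_LIB is set, and both .so files are copied into
# the AAR's jni/arm64-v8a/ directory only when both variables are non-empty.
sh build/build_android_llm_demo.sh
```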
diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md index 54bf956176..8308da6d84 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/qualcomm_README.md @@ -74,7 +74,7 @@ cmake --build cmake-out -j16 --target install --config Release ### Setup Llama Runner Next we need to build and compile the Llama runner. This is similar to the requirements for running Llama with XNNPACK. ``` -sh examples/models/llama2/install_requirements.sh +sh examples/models/llama/install_requirements.sh cmake -DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=cmake-out \ @@ -84,9 +84,9 @@ cmake -DPYTHON_EXECUTABLE=python \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ -DEXECUTORCH_BUILD_QNN=ON \ - -Bcmake-out/examples/models/llama2 \ - examples/models/llama2 -cmake --build cmake-out/examples/models/llama2 -j16 --config Release + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release ``` ## Export Llama Model @@ -101,12 +101,12 @@ We support PTQ by default. The entire export may take ~20 minutes (Llama 3.1 8B) Examples: ``` # 4 bits weight only quantize -python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_16a4w -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` If the model is really big, it may require model sharding because the Qualcomm DSP is a 32bit system and has a 4GB size limit . For example for Llama 3 8B models, we need to shard the model into 4, but ExecuTorch still packages it into one PTE file. Here is an example: ``` # 8 bits quantization with 4 shards -python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" -p "${MODEL_DIR}/params.json" -kv --disable_dynamic_shape --qnn --pt2e_quantize qnn_8a8w -d fp32 --num_sharding 4 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="test.pte” ``` Note: if you encountered issues below ``` @@ -158,7 +158,7 @@ To export Llama 3 8B instruct with the Qualcomm AI Engine Direct Backend, ensure * 8B models might need 16GB RAM on the device to run. ``` # Please note that calibration_data must include the prompt template for special tokens. 
-python -m examples.models.llama2.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +python -m examples.models.llama.export_llama -t -p -c --use_kv_cache --qnn --pt2e_quantize qnn_16a4w --disable_dynamic_shape --num_sharding 8 --calibration_tasks wikitext --calibration_limit 1 --calibration_seq_length 128 --optimized_rotation_path --calibration_data "<|start_header_id|>system<|end_header_id|>\n\nYou are a funny chatbot.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nCould you tell me about Facebook?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" ``` ## Pushing Model and Tokenizer diff --git a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md index 9a8b86b8a5..3570c9acd3 100644 --- a/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/android/LlamaDemo/docs/delegates/xnnpack_README.md @@ -66,12 +66,12 @@ In this demo app, we support text-only inference with up-to-date Llama models an We have supported BFloat16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B models. * You can request and download model weights for Llama through Meta official [website](https://llama.meta.com/). * For chat use-cases, download the instruct models instead of pretrained. -* Run `examples/models/llama2/install_requirements.sh` to install dependencies. +* Run `examples/models/llama/install_requirements.sh` to install dependencies. * The 1B model in BFloat16 format can run on mobile devices with 8GB RAM. The 3B model will require 12GB+ RAM. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" ``` * Rename tokenizer for Llama 3.2 with command: `mv tokenizer.model tokenizer.bin`. We are updating the demo app to support tokenizer in original format directly. 
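Before wiring the exported `llama3_2.pte` into the demo app, it can be worth a quick host-side smoke test that the file loads and runs one decode step. Below is a minimal sketch using the ExecuTorch pybindings (the same `_load_for_executorch` entry point used by `examples/models/llama/runner/native.py`); the `(tokens, input_pos)` calling convention assumed here corresponds to a `-kv` export, and the path is a placeholder.

```python
import torch
from executorch.extension.pybindings.portable_lib import _load_for_executorch

# Placeholder path: whatever --output_name you passed to export_llama.
module = _load_for_executorch("llama3_2.pte")

# Assumption: a -kv export takes (tokens, input_pos); 128000 is the BOS id from --metadata above.
tokens = torch.tensor([[128000]], dtype=torch.long)
input_pos = torch.tensor([0], dtype=torch.long)
outputs = module.forward((tokens, input_pos))

print(outputs[0].shape)  # logits for the single position, if the export loaded correctly
```

If the export uses custom ops (e.g. `--use_sdpa_with_kv_cache`), the extra op-library imports shown in `examples/models/llama/runner/native.py` are needed before loading the module.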
@@ -88,19 +88,19 @@ To safeguard your application, you can use our Llama Guard models for prompt cla * We prepared this model using the following command ``` -python -m examples.models.llama2.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" +python -m examples.models.llama.export_llama --checkpoint --params -d fp32 -kv --use_sdpa_with_kv_cache --quantization_mode 8da4w --group_size 256 --xnnpack --max_seq_length 8193 --embedding-quantize 4,32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_prune_map --output_name="llama_guard_3_1b_pruned_xnnpack.pte" ``` ### For Llama 3.1 and Llama 2 models * You can download original model weights for Llama through Meta official [website](https://llama.meta.com/). * For Llama 2 models, Edit params.json file. Replace "vocab_size": -1 with "vocab_size": 32000. This is a short-term workaround -* Run `examples/models/llama2/install_requirements.sh` to install dependencies. +* Run `examples/models/llama/install_requirements.sh` to install dependencies. * The Llama 3.1 and Llama 2 models (8B and 7B) can run on devices with 12GB+ RAM. * Export Llama model and generate .pte file ``` -python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama.pte" ``` You may wonder what the ‘--metadata’ flag is doing. This flag helps export the model with proper special tokens added that the runner can detect EOS tokens easily. @@ -159,7 +159,7 @@ sh examples/demo-apps/android/LlamaDemo/setup.sh This is running the shell script which configures the required core ExecuTorch, Llama2/3, and Android libraries, builds them, and copies them to jniLibs. -**Output**: The executorch-llama.aar file will be generated in a newly created folder in the example/demo-apps/android/LlamaDemo/app/libs directory. This is the path that the Android app expects it to be in. +**Output**: The executorch.aar file will be generated in a newly created folder in the example/demo-apps/android/LlamaDemo/app/libs directory. This is the path that the Android app expects it to be in. **Note**: If you are building the Android app mentioned in the next section on a separate machine (i.e. MacOS but building and exporting on Linux), make sure you copy the aar file generated from setup script to “examples/demo-apps/android/LlamaDemo/app/libs” before building the Android app. 
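The `--metadata` argument used throughout these export commands is plain JSON, and quoting it by hand in a shell is easy to get wrong. One option (purely illustrative, not part of the repo) is to assemble the command in Python so the JSON is serialized for you:

```python
import json
import shlex
import subprocess

# BOS/EOS ids for the Llama 3 family, as used in the export commands above.
metadata = {"get_bos_id": 128000, "get_eos_ids": [128009, 128001]}

cmd = [
    "python", "-m", "examples.models.llama.export_llama",
    "--checkpoint", "consolidated.00.pth",   # placeholder paths
    "--params", "params.json",
    "-kv", "--use_sdpa_with_kv_cache", "-X", "-d", "fp32",
    "-qmode", "8da4w", "--group_size", "128",
    "--metadata", json.dumps(metadata),      # serialized once, no manual escaping
    "--output_name", "llama.pte",
]
print(shlex.join(cmd))     # inspect the exact command before running it
subprocess.run(cmd, check=True)
```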
diff --git a/examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh b/examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh index 34cf910746..215bccea8f 100644 --- a/examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh +++ b/examples/demo-apps/android/LlamaDemo/download_prebuilt_lib.sh @@ -15,6 +15,5 @@ mkdir -p "$LIBS_PATH" pushd "$LIBS_PATH" curl -O "${AAR_SHASUM_URL}" -sed -i -e 's/executorch.aar/executorch-llama.aar/g' executorch.aar.sha256sums -shasum --check --status executorch.aar.sha256sums || curl "${AAR_URL}" -o executorch-llama.aar +shasum --check --status executorch.aar.sha256sums || curl "${AAR_URL}" -o executorch.aar popd diff --git a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh index 53d248bc75..044d80832d 100644 --- a/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh +++ b/examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh @@ -22,4 +22,4 @@ build_jar build_android_native_library "arm64-v8a" build_aar mkdir -p "$BASEDIR"/app/libs -cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/app/libs/executorch-llama.aar +cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/app/libs/executorch.aar diff --git a/examples/demo-apps/android/LlamaDemo/setup.sh b/examples/demo-apps/android/LlamaDemo/setup.sh index 140434a86d..4d70c67ede 100644 --- a/examples/demo-apps/android/LlamaDemo/setup.sh +++ b/examples/demo-apps/android/LlamaDemo/setup.sh @@ -18,4 +18,4 @@ build_android_native_library "arm64-v8a" build_android_native_library "x86_64" build_aar mkdir -p "$BASEDIR"/app/libs -cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/app/libs/executorch-llama.aar +cp "$BUILD_AAR_DIR/executorch.aar" "$BASEDIR"/app/libs/executorch.aar diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj index c947e20af9..7b93af46e4 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj @@ -99,8 +99,8 @@ 035A5E942BB4B523001E0553 /* LLaMA.entitlements */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.entitlements; path = LLaMA.entitlements; sourceTree = ""; }; 036CAF9D2BB1444500D6C2D5 /* LLaMA.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = LLaMA.app; sourceTree = BUILT_PRODUCTS_DIR; }; 03729ED52BB1F8DE00152F2E /* LLaMARunner.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = LLaMARunner.framework; sourceTree = BUILT_PRODUCTS_DIR; }; - 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama2/runner/runner.cpp; sourceTree = ""; }; - 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama2/runner/runner.h; sourceTree = ""; }; + 03729F072BB203B300152F2E /* runner.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../examples/models/llama/runner/runner.cpp; sourceTree = ""; }; + 03729F082BB203B300152F2E /* runner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../examples/models/llama/runner/runner.h; sourceTree = ""; }; 
03729F092BB203B300152F2E /* util.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = util.h; sourceTree = ""; }; 03729F102BB2042B00152F2E /* sampler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sampler.h; sourceTree = ""; }; 03729F112BB2042B00152F2E /* sampler.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sampler.cpp; sourceTree = ""; }; @@ -275,7 +275,7 @@ 03729F152BB2043600152F2E /* tokenizer.h */, ); name = tokenizer; - path = ../../../../../models/llama2/tokenizer; + path = ../../../../../models/llama/tokenizer; sourceTree = ""; }; 03729F0F2BB203E100152F2E /* sampler */ = { diff --git a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm index e6a4722ddb..e03bc7aabc 100644 --- a/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm +++ b/examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm @@ -9,7 +9,7 @@ #import "LLaMARunner.h" #import -#import +#import #import using executorch::extension::llm::Image; diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md index 20ee73b821..eb3c244dee 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/mps_README.md @@ -42,12 +42,12 @@ In this demo app, we support text-only inference with Llama 3.1, Llama 3, and Ll Install the required packages to export the model ``` -sh examples/models/llama2/install_requirements.sh +sh examples/models/llama/install_requirements.sh ``` Export the model ``` -python -m examples.models.llama2.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 +python -m examples.models.llama.export_llama --checkpoint "${MODEL_DIR}/consolidated.00.pth" --params "${MODEL_DIR}/params.json" -kv --use_sdpa_with_kv_cache --mps -d fp32 --disable_dynamic_shape -qmode 8da4w -G 32 ``` ## Pushing Model and Tokenizer @@ -76,7 +76,7 @@ sudo /Applications/CMake.app/Contents/bin/cmake-gui --install The prebuilt ExecuTorch runtime, backend, and kernels are available as a Swift PM package. ### Xcode -Open the project in Xcode.In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “0.3.0”, or just use the “latest” branch name for the latest stable build. +Open the project in Xcode.In Xcode, go to `File > Add Package Dependencies`. Paste the URL of the ExecuTorch repo into the search bar and select it. Make sure to change the branch name to the desired ExecuTorch version, e.g., “0.4.0”, or just use the “latest” branch name for the latest stable build. Note: If you're running into any issues related to package dependencies, quit Xcode entirely, delete the whole executorch repo, clean the caches by running the command below in terminal and clone the repo again. 
diff --git a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md index c3e254d77a..faec4e3a3a 100644 --- a/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md +++ b/examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md @@ -43,19 +43,19 @@ In this demo app, we support text-only inference with up-to-date Llama models. Install the required packages to export the model ``` -sh examples/models/llama2/install_requirements.sh +sh examples/models/llama/install_requirements.sh ``` ### For Llama 3.2 1B and 3B models We have supported BFloat16 as a data type on the XNNPACK backend for Llama 3.2 1B/3B models. * You can download original model weights for Llama through Meta official [website](https://llama.meta.com/). * For chat use-cases, download the instruct models instead of pretrained. -* Run “examples/models/llama2/install_requirements.sh” to install dependencies. +* Run “examples/models/llama/install_requirements.sh” to install dependencies. * The 1B model in BFloat16 format can run on mobile devices with 8GB RAM (iPhone 15 Pro and later). The 3B model will require 12GB+ RAM and hence will not fit on 8GB RAM phones. * Export Llama model and generate .pte file as below: ``` -python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" +python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -d bf16 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --output_name="llama3_2.pte" ``` For more detail using Llama 3.2 lightweight models including prompt template, please go to our official [website](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-lightweight-models-(1b/3b)-). 
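For the instruct variants, the string passed to the runner should already be wrapped in the Llama 3 chat template, i.e. the same special tokens that appear in the `--calibration_data` example for the QNN flow earlier in this document. A small illustrative helper follows; the authoritative template is Meta's model card, and the BOS token is left out here on the assumption that the runner prepends it based on the `get_bos_id` metadata.

```python
def llama3_instruct_prompt(user_message: str,
                           system_message: str = "You are a helpful assistant.") -> str:
    # Mirrors the --calibration_data example shown for the QNN export flow.
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_message}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

print(llama3_instruct_prompt("Could you tell me about Facebook?"))
```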
@@ -64,7 +64,7 @@ For more detail using Llama 3.2 lightweight models including prompt template, pl Export the model ``` -python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" +python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" ``` ### For LLaVA model diff --git a/examples/mediatek/CMakeLists.txt b/examples/mediatek/CMakeLists.txt index 1d411f07ca..61906870e1 100644 --- a/examples/mediatek/CMakeLists.txt +++ b/examples/mediatek/CMakeLists.txt @@ -149,7 +149,7 @@ if(${ANDROID}) PRIVATE ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama2/tokenizer/llama_tiktoken.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp ) # Include directory for neuron headers diff --git a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp index 2ebacec2c5..4fba0e20a8 100644 --- a/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp +++ b/examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp @@ -67,7 +67,7 @@ #include "llama_runner/Utils.h" #include "llama_runner/llm_helper/include/llm_types.h" -#include +#include #include #include diff --git a/examples/models/__init__.py b/examples/models/__init__.py index 81c9fc2b6d..80f95af89e 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -16,7 +16,8 @@ "emformer_transcribe": ("emformer_rnnt", "EmformerRnntTranscriberModel"), "emformer_predict": ("emformer_rnnt", "EmformerRnntPredictorModel"), "emformer_join": ("emformer_rnnt", "EmformerRnntJoinerModel"), - "llama2": ("llama2", "Llama2Model"), + "llama2": ("llama", "Llama2Model"), + "llama": ("llama", "Llama2Model"), "lstm": ("lstm", "LSTMModel"), "mobilebert": ("mobilebert", "MobileBertModelExample"), "mv2": ("mobilenet_v2", "MV2Model"), diff --git a/examples/models/checkpoint.py b/examples/models/checkpoint.py index 592ebab145..ee3fb56042 100644 --- a/examples/models/checkpoint.py +++ b/examples/models/checkpoint.py @@ -15,15 +15,15 @@ def get_default_model_resource_dir(model_file_path: str) -> Path: Get the default path to resouce files (which contain files such as the checkpoint and param files), either: 1. Uses the path from pkg_resources, only works with buck2 - 2. Uses default path located in examples/models/llama2/params + 2. Uses default path located in examples/models/llama/params Expected to be called from with a `model.py` file located in a `executorch/examples/models/` directory. Args: model_file_path: The file path to the eager model definition. - For example, `executorch/examples/models/llama2/model.py`, - where `executorch/examples/models/llama2` contains all + For example, `executorch/examples/models/llama/model.py`, + where `executorch/examples/models/llama` contains all the llama2-related files. Returns: @@ -35,7 +35,7 @@ def get_default_model_resource_dir(model_file_path: str) -> Path: # 1st way: If we can import this path, we are running with buck2 and all resources can be accessed with pkg_resources. 
# pyre-ignore - from executorch.examples.models.llama2 import params # noqa + from executorch.examples.models.llama import params # noqa # Get the model name from the cwd, assuming that this module is called from a path such as # examples/models//model.py. diff --git a/examples/models/llama2/Android3_2_1B_bf16.gif b/examples/models/llama/Android3_2_1B_bf16.gif similarity index 100% rename from examples/models/llama2/Android3_2_1B_bf16.gif rename to examples/models/llama/Android3_2_1B_bf16.gif diff --git a/examples/models/llama2/Android3_2_3B_SpinQuant.gif b/examples/models/llama/Android3_2_3B_SpinQuant.gif similarity index 100% rename from examples/models/llama2/Android3_2_3B_SpinQuant.gif rename to examples/models/llama/Android3_2_3B_SpinQuant.gif diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama/CMakeLists.txt similarity index 100% rename from examples/models/llama2/CMakeLists.txt rename to examples/models/llama/CMakeLists.txt diff --git a/examples/models/llama2/LICENSE b/examples/models/llama/LICENSE similarity index 100% rename from examples/models/llama2/LICENSE rename to examples/models/llama/LICENSE diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md new file mode 100644 index 0000000000..29d468543a --- /dev/null +++ b/examples/models/llama/README.md @@ -0,0 +1,497 @@ +# Summary +This example demonstrates how to run a [llama models](https://www.llama.com/) on mobile via ExecuTorch. We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on a phone. + +Here are supported models: + +- Llama 3.2 1B and 3B +- Llama 3.1 8B +- Llama 3 8B +- Llama 2 7B + +Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). + +# What is Llama? +Llama is a collection of large language models that use publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. + +Llama models have shown to perform well on a variety of natural language processing tasks, including language translation, question answering, and text summarization and are also capable of generating human-like text, making Llama models a useful tool for creative writing and other applications where natural language generation is important. + +Overall, Llama models are powerful and versatile language models that can be used for a wide range of natural language processing tasks. The model’s ability to generate coherent and contextually relevant text makes it particularly useful for applications such as chatbots, virtual assistants, and language translation. 
+ +Please note that the models are subject to the [Llama 2 Acceptable Use Policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md), [Llama 3 Acceptable Use Policy](https://github.com/meta-llama/llama3/blob/main/USE_POLICY.md) and [Responsible Use Guide](https://ai.meta.com/static-resource/responsible-use-guide/). + + +# Results + +Since Llama 2 7B or Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. + +For Llama 3.2 1B/3B, we validated the models by running them in their original bf16 datatype and unquantized on both Android and iOS phones. The 3B version required high-end phones with larger RAMs to fit the model. + +Additionally, 1B/3B models are sensitive to accuracy loss when regular PTQ quantization is applied, so we employed 4bit quantization using [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main) to achieve a good balance between accuracy, performance and memory. + + + + + + +
+ [demo GIF] Llama3.1 8B, 4bit quantized on Android phone
+ [demo GIF] Llama3.2 1B, unquantized, bf16 on Android phone.
+ +## Quantization: +We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch/ao). + +We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Please note that LM Eval reports perplexity normalized by word count instead of token count. You may see different perplexity for WikiText from other sources if they implement it differntly. More details could be found [here](https://github.com/EleutherAI/lm-evaluation-harness/issues/2301). + +Below are the results for two different groupsizes, with max_seq_length 2048, and limit 1000. + +|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------|-----------------| ---------------------- | --------------- +|Llama 2 7B | 9.2 | 10.2 | 10.7 +|Llama 3 8B | 7.9 | 9.4 | 9.7 + +Note that groupsize less than 128 was not enabled, since such models were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. + +### SpinQuant for Llama 3.2 1B/3B models (Optional) + +To improve accuracy, we can use [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main), a post-training quantization (PTQ) technique that generates new quantized weights. In the standard PTQ process, quantization may lead to a decrease in accuracy when there are outliers. The SpinQuant method takes the original weights and produces optimized quantized weights with minimal outliers, resulting in higher accuracy. This can be achieved without any finetuning of the weights and only requires 100 iterations on a single A100 node. + +SpinQuant can generate quantized weights that are [compatible with ExecuTorch](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch), specifically, it can be integrated with the existing optimized XNNPACK kernels (e.g., group-wise 4bit weight and 8bit dynamic activation). This allows developers to benefit from the higher accuracy of SpinQuant while also taking advantage of the strong performance of ExecuTorch acceleration. We enabled SpinQuant for Llama3.2 1B/3B models on ExecuTorch. + +
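The scheme described above (8-bit dynamic activations, 4-bit groupwise weights) is what `export_llama` applies when given `-qmode 8da4w --group_size <G>`. For readers who want to see the transformation in isolation, here is a rough eager-mode sketch using torchao; it assumes a torchao version that exposes `Int8DynActInt4WeightQuantizer` and is not the exact code path `export_llama` takes.

```python
import torch
from torch import nn
from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer

# Toy stand-in for a transformer's linear layers; the real flow goes through export_llama.
toy = nn.Sequential(nn.Linear(4096, 4096), nn.SiLU(), nn.Linear(4096, 4096))

# 8-bit dynamic activations, 4-bit groupwise weights with groupsize 128, as described above.
quantizer = Int8DynActInt4WeightQuantizer(precision=torch.float32, groupsize=128)
toy = quantizer.quantize(toy)

x = torch.randn(1, 4096)
print(toy(x).shape)   # quantized linears still consume and produce float tensors
```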
+ [demo GIF] Running Llama3.2 3B on Android phone: 4bit quantization using SpinQuant
+ +## Enablement + +For Llama 3 8B and Llama3.1 8B, we have verified so far on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM). + +We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. + +## Performance + +### Llama 3.2 1B and 3B +Llama 3.2 1B and 3B performance was measured on the OnePlus 12 device. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on) for generating 128 tokens. + +|Model | bf16 | 4bit(*) via SpinQuant +|--------| ---------------------- | --------------- +|1B | 19.4 tokens/second | 53.41 tokens/second | +|3B | 7.76 tokens/second | 22.98 tokens/second | + +(*) With SpinQuant, we currently quantize 4-bit groupwise (with groupsize 32) weight, 8bit dynamic activation of all the linear layers of the model, except embedding and output layers. The embedding and output layers are quantized as 8-bit per-channel weight and 8-bit dynamic activation. + +### Llama3 8B and Llama3.1 8B +Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). + +Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantize embedding lookup table as well. For these results embedding lookup table was groupwise quantized with 4-bits and group size of 32. + +|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------| ---------------------- | --------------- +|Galaxy S22 | 7.85 tokens/second | 8.4 tokens/second | +|Galaxy S24 | 10.91 tokens/second | 11.21 tokens/second | +|OnePlus 12 | 10.85 tokens/second | 11.02 tokens/second | + +### Llama2 7B +Llama 2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). + +|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) +|--------| ---------------------- | --------------- +|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | +|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | +|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | + +# Instructions + +## Tested on + +- MacOS M1/M2, Linux. +- For Llama 2 7B, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model. + +## Step 1: Setup +> :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. + +1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` +2. Run `examples/models/llama/install_requirements.sh` to install a few dependencies. + + +## Step 2: Prepare model + +### Option A: Download and export Llama3.2 1B/3B model. + +1. Download `consolidated.00.pth`, `params.json` and `tokenizer.model` from [Llama website](https://www.llama.com/llama-downloads/) or [Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B). For chat use-cases, download the instruct models. + +2. Export model and generate `.pte` file. Use original bfloat16 version, without any quantization. 
+ +``` +# Set these paths to point to the downloaded files +LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +python -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + -X \ + -d bf16 \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="llama3_2.pte" +``` + +Optionally, we can apply SpinQuant to quantize the model without sacrifacing too much accuracy loss. + +To use SpinQuant, follow its [instruction](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch) for exporting checkpoint to ExecuTorch and then export the SpinQuant checkpoint. + +``` +# Set these paths to point to the exported files +LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +python -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + --use_sdpa_with_kv_cache \ + -X \ + --preq_mode 8da4w_output_8da8w \ + --preq_group_size 32 \ + --max_seq_length 2048 \ + --output_name "llama3_2.pte" \ + -kv \ + -d fp32 \ + --preq_embedding_quantize 8,0 \ + --use_spin_quant native \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' +``` + +### Option B: Download and export Llama 3 8B instruct model + +You can export and run the original Llama 3 8B instruct model. + +1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). + +2. Export model and generate `.pte` file + ``` + python -m examples.models.llama.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" + ``` + + Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. + +### Option C: Download and export stories110M model + +If you want to deploy and run a smaller model for educational purposes. From `executorch` root: + +1. Download `stories110M.pt` and `tokenizer.model` from Github. + ``` + wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" + wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" + ``` +2. Create params file. + ``` + echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json + ``` +3. Export model and generate `.pte` file. + ``` + python -m examples.models.llama.export_llama -c stories110M.pt -p params.json -X -kv + ``` + +### Option D: Download and export Llama 2 7B model + +You can export and run the original Llama 2 7B model. + +1. Llama 2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). + +2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. + +3. Export model and generate `.pte` file: + ``` + python -m examples.models.llama.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 + ``` +4. Create tokenizer.bin. 
+ ``` + python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin + ``` + +### Option E: Download models from Hugging Face and convert from safetensor format to state dict + + +You can also download above models from [Hugging Face](https://huggingface.co/). Since ExecuTorch starts from a PyTorch model, a script like below can be used to convert the Hugging Face safetensors format to PyTorch's state dict. It leverages the utils provided by [TorchTune](https://github.com/pytorch/torchtune). + + +```Python +from torchtune.utils import FullModelHFCheckpointer +from torchtune.models import convert_weights +import torch + +# Convert from safetensors to TorchTune. Suppose the model has been downloaded from Hugging Face +checkpointer = FullModelHFCheckpointer( + checkpoint_dir='/home/.cache/huggingface/hub/models/snapshots/hash-number', + checkpoint_files=['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors'], + output_dir='/the/destination/dir' , + model_type='LLAMA3' # or other types that TorchTune supports +) + +print("loading checkpoint") +sd = checkpointer.load_checkpoint() + +# Convert from TorchTune to Meta (PyTorch native) +sd = convert_weights.tune_to_meta(sd['model']) + +print("saving checkpoint") +torch.save(sd, "/the/destination/dir/checkpoint.pth") +``` + +## (Optional) Finetuning + +If you want to finetune your model based on a specific dataset, PyTorch provides [TorchTune](https://github.com/pytorch/torchtune) - a native-Pytorch library for easily authoring, fine-tuning and experimenting with LLMs. + +Once you have [TorchTune installed](https://github.com/pytorch/torchtune?tab=readme-ov-file#get-started) you can finetune Llama2 7B model using LoRA on a single GPU, using the following command. This will produce a checkpoint where the LoRA weights are merged with the base model and so the output checkpoint will be in the same format as the original Llama2 model. + +``` +tune run lora_finetune_single_device \ +--config llama2/7B_lora_single_device \ +checkpointer.checkpoint_dir= \ +tokenizer.path=/tokenizer.model +``` + +To run full finetuning with Llama2 7B on a single device, you can use the following command. + +``` +tune run full_finetune_single_device \ +--config llama2/7B_full_single_device \ +checkpointer.checkpoint_dir= \ +tokenizer.path=/tokenizer.model +``` + +## Step 3: Evaluate model accuracy + +> Forewarning: Model evaluation without a GPU may take a long time, especially on larger models. + +We use [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate model accuracy. + +For base models, use the following example command to calculate its perplexity based on WikiText. +``` +python -m examples.models.llama.eval_llama \ + -c \ + -p \ + -t \ + -kv \ + -d \ + --max_seq_len \ + --limit +``` + +For instruct models, use the following example command to calculate its MMLU score. +``` +python -m examples.models.llama.eval_llama \ + -c \ + -p \ + -t \ + -kv \ + -d \ + --tasks mmlu \ + --num_fewshot 5 \ + --max_seq_len +``` + +## Step 4: Run on your computer to validate + +1. Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59). 
+ ``` + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . + + cmake --build cmake-out -j16 --target install --config Release + ``` +Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the session of Common Issues and Mitigations below for solutions. + +2. Build llama runner. + ``` + cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama + + cmake --build cmake-out/examples/models/llama -j16 --config Release + ``` + +3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama/main.cpp#L18-L40). + ``` + cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= + ``` + +For Llama2 models, pass the converted `tokenizer.bin` file instead of `tokenizer.model`. + +To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON` + +## Step 5: Run benchmark on Android phone + +**1. Build llama runner binary for Android** + +*Pre-requisite*: Android NDK (tested with r27b) which can be downloaded from [here](https://developer.android.com/ndk/downloads). Note that the mac binary can be unpackaged and you can locate NDK folder from it. + +**1.1 Set Android NDK** +``` +export ANDROID_NDK= +``` +**1.2 Build executorch and associated libraries for android.** +``` +cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-out-android \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out-android . + +cmake --build cmake-out-android -j16 --target install --config Release +``` + +**1.2 Build llama runner for android** +``` +cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_PLATFORM=android-23 \ + -DCMAKE_INSTALL_PREFIX=cmake-out-android \ + -DCMAKE_BUILD_TYPE=Release \ + -DPYTHON_EXECUTABLE=python \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out-android/examples/models/llama \ + examples/models/llama + +cmake --build cmake-out-android/examples/models/llama -j16 --config Release +``` + +**2. 
Run on Android via adb shell** + +*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone + +**2.1 Connect your android phone** + +**2.2 Upload model, tokenizer and llama runner binary to phone** +``` +adb shell mkdir -p /data/local/tmp/llama +adb push /data/local/tmp/llama/ +adb push /data/local/tmp/llama/ +adb push cmake-out-android/examples/models/llama/llama_main /data/local/tmp/llama/ +``` + +**2.3 Run model** +``` +adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" +``` +## Step 6: Build Mobile apps + +### iOS + +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. + +### Android +Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. + +## Optional: Smaller models delegated to other backends +Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction +for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is + +- Lower to CoreML: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` +- MPS: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` +- QNN: `python -m examples.models.llama.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` + +The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. + +For CoreML, there are 2 additional optional arguments: +* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) +* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML + +# What is coming next? +## Quantization +- Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. +- Enabling GPTQ for 4-bit groupwise quantization +- Enabling custom quantization +- Lower bit quantization +## Models +- Enabling more generative AI models and architectures. +- Enable support for mult-modal models like LlaVa. 
+## Performance +- Performance improvement via techniques such as speculative decoding +- Enabling LLama2 7b and other architectures via Vulkan +- Enabling performant execution of widely used quantization schemes. + + +# Notes +This example tries to reuse the Python code, with minimal modifications to make it compatible with current ExecuTorch: +1. Since ExecuTorch does not support complex Tensor data type, use the customized functions to have rotary embedding with real numbers. Please see [GitHub issue: Support complex data type in ExecuTorch](https://github.com/pytorch/executorch/issues/886). +2. No CUDA. ExecuTorch is focused on Edge use cases where CUDA is not available on most of the edge devices. +3. No dependencies on fairscale. The ColumnParallelLinear, ParallelEmbedding and training are not needed and supported in ExecuTorch. + + +# Common Issues and Mitigations: +- To clean your build: +``` +git clean -xfd +pip uninstall executorch +./install_requirements.sh --pybind xnnpack + +rm -rf cmake-out +``` +- If you encounter `pthread` related issues during link time, add `pthread` in `target_link_libraries` in `CMakeLists.txt` +- On Mac, if there is linking error in Step 4 with error message like +``` +0 0x100823648 __assert_rtn + 72 +1 0x10074bc5c ld::Fixup::applyFixup(ld::Atom const*, ld::LayoutLinkedImage const&, unsigned char*) const + 8268 +2 0x1007de7d8 ___ZN2ld16LayoutExecutable27writeContentWithoutLinkEditENSt3__14spanIhLm18446744073709551615EEEy_block_invoke + 332 +3 0x188cca428 _dispatch_client_callout2 + 20 +4 0x188cde850 _dispatch_apply_invoke3 + 336 +5 0x188cca3e8 _dispatch_client_callout + 20 +6 0x188ccbc68 _dispatch_once_callout + 32 +7 0x188cdeeec _dispatch_apply_invoke_and_wait + 372 +8 0x188cdde9c _dispatch_apply_with_attr_f + 1212 +9 0x188cde08c dispatch_apply + 96 +10 0x1007de9e4 void mapReduce(std::__1::span, unsigned long, void (unsigned long, mach_o::Error&, std::__1::span) block_pointer, void (std::__1::span) block_pointer) + 336 +11 0x1007de594 ld::LayoutExecutable::writeContentWithoutLinkEdit(std::__1::span, unsigned long long) + 1180 +12 0x1007e4020 ld::LayoutExecutable::writeToFile(char const*) + 15248 +13 0x1007962e8 main + 9424 +ld: Assertion failed: (extras.otherInstrOffset != 0 && "Kind::arm64_adrp_ldr missing extra info"), function applyFixup, file Fixup.cpp, line 793. +clang: error: linker command failed with exit code 1 (use -v to see invocation) +``` +It's a known issue for Xcode version 15.1. +Mitigation: update to most recent Xcode version, clean and rebuild. 
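Expanding on point 1 under Notes: since ExecuTorch has no complex tensor dtype, rotary position embeddings are expressed with real-valued cos/sin tables. The snippet below is a simplified, self-contained illustration of that idea only; the actual implementation lives in `examples/models/llama/rope.py` and differs in shapes and signatures.

```python
import torch

def precompute_rope(head_dim: int, seq_len: int, theta: float = 10000.0):
    # Real-valued replacement for the complex exponential exp(i * m * theta_k).
    freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    angles = torch.outer(torch.arange(seq_len).float(), freqs)   # (seq_len, head_dim // 2)
    return torch.cos(angles), torch.sin(angles)

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: (seq_len, head_dim) for a single head; each (even, odd) pair is rotated by its angle.
    x_even, x_odd = x[..., 0::2], x[..., 1::2]
    rotated = torch.stack(
        (x_even * cos - x_odd * sin, x_even * sin + x_odd * cos), dim=-1
    )
    return rotated.flatten(-2)

cos, sin = precompute_rope(head_dim=64, seq_len=8)
q = torch.randn(8, 64)
print(apply_rope(q, cos, sin).shape)   # torch.Size([8, 64])
```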
diff --git a/examples/models/llama2/TARGETS b/examples/models/llama/TARGETS similarity index 88% rename from examples/models/llama2/TARGETS rename to examples/models/llama/TARGETS index 17597f3d50..751c61da97 100644 --- a/examples/models/llama2/TARGETS +++ b/examples/models/llama/TARGETS @@ -16,7 +16,7 @@ runtime.python_library( "rope.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", visibility = [ "//executorch/...", "@EXECUTORCH_CLIENTS", @@ -34,9 +34,9 @@ runtime.python_library( "model.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", resources = { - "//executorch/examples/models/llama2/params:params": "params", + "//executorch/examples/models/llama/params:params": "params", }, visibility = [ "//bento/...", @@ -46,14 +46,14 @@ runtime.python_library( deps = [ "//caffe2:torch", "//executorch/examples/models:model_base", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:llama_transformer", "//executorch/examples/models:checkpoint", ], ) runtime.python_binary( name = "export_llama", - main_function = "executorch.examples.models.llama2.export_llama.main", + main_function = "executorch.examples.models.llama.export_llama.main", # visibility = ["//executorch/examples/..."], preload_deps = [ "//executorch/extension/llm/custom_ops:model_sharding_py", @@ -93,7 +93,7 @@ runtime.python_library( "source_transformation/spin_quant.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", visibility = [ "//bento/...", "//bento_kernels/...", @@ -103,6 +103,7 @@ runtime.python_library( deps = [ "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform", "//caffe2:torch", + "//executorch/backends/vulkan/_passes:vulkan_passes", "//executorch/examples/models:model_base", "//executorch/examples/models:models", "//executorch/extension/llm/custom_ops:custom_ops_aot_py", @@ -123,7 +124,7 @@ runtime.python_library( runtime.python_binary( name = "eval_llama", - main_function = "executorch.examples.models.llama2.eval_llama.main", + main_function = "executorch.examples.models.llama.eval_llama.main", preload_deps = [ "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", "//executorch/kernels/quantized:aot_lib", @@ -142,7 +143,7 @@ runtime.python_library( "evaluate/eager_eval.py", ], _is_external_target = True, - base_module = "executorch.examples.models.llama2", + base_module = "executorch.examples.models.llama", visibility = [ "//bento/...", "//bento_kernels/...", @@ -153,7 +154,7 @@ runtime.python_library( "fbsource//third-party/pypi/lm-eval:lm-eval", "fbsource//third-party/pypi/tiktoken:tiktoken", ":export_library", - "//executorch/examples/models/llama2/tokenizer:tiktoken_py", + "//executorch/examples/models/llama/tokenizer:tiktoken_py", "//executorch/extension/llm/export:export_lib", "//executorch/extension/llm/tokenizer:tokenizer_py_lib", "//executorch/extension/pybindings:portable_lib", @@ -195,7 +196,7 @@ runtime.python_test( deps = [ ":quantized_kv_cache", "//caffe2:torch", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:llama_transformer", ], ) @@ -211,6 +212,6 @@ runtime.python_test( ":quantized_kv_cache", ":sdpa", "//caffe2:torch", - "//executorch/examples/models/llama2:llama_transformer", + 
"//executorch/examples/models/llama:llama_transformer", ], ) diff --git a/examples/models/llama2/__init__.py b/examples/models/llama/__init__.py similarity index 100% rename from examples/models/llama2/__init__.py rename to examples/models/llama/__init__.py diff --git a/examples/models/llama2/eval_llama.py b/examples/models/llama/eval_llama.py similarity index 100% rename from examples/models/llama2/eval_llama.py rename to examples/models/llama/eval_llama.py diff --git a/examples/models/llama2/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py similarity index 98% rename from examples/models/llama2/eval_llama_lib.py rename to examples/models/llama/eval_llama_lib.py index 95b3ff0fb7..e95e6998d9 100644 --- a/examples/models/llama2/eval_llama_lib.py +++ b/examples/models/llama/eval_llama_lib.py @@ -10,10 +10,10 @@ from typing import Optional, Union import torch -from executorch.examples.models.llama2.export_llama_lib import ( +from executorch.examples.models.llama.export_llama_lib import ( get_quantizer_and_quant_params, ) -from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken from executorch.extension.llm.export.builder import LLMEdgeManager from executorch.extension.llm.tokenizer.tokenizer import ( diff --git a/examples/models/llama2/evaluate/__init__.py b/examples/models/llama/evaluate/__init__.py similarity index 100% rename from examples/models/llama2/evaluate/__init__.py rename to examples/models/llama/evaluate/__init__.py diff --git a/examples/models/llama2/evaluate/eager_eval.py b/examples/models/llama/evaluate/eager_eval.py similarity index 96% rename from examples/models/llama2/evaluate/eager_eval.py rename to examples/models/llama/evaluate/eager_eval.py index 784112e052..3d0a9a0d70 100644 --- a/examples/models/llama2/evaluate/eager_eval.py +++ b/examples/models/llama/evaluate/eager_eval.py @@ -8,7 +8,7 @@ from typing import Optional, Union import torch -from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken from executorch.extension.llm.tokenizer.tokenizer import ( Tokenizer as SentencePieceTokenizer, ) diff --git a/examples/models/llama2/experimental/README.md b/examples/models/llama/experimental/README.md similarity index 100% rename from examples/models/llama2/experimental/README.md rename to examples/models/llama/experimental/README.md diff --git a/examples/models/llama2/experimental/TARGETS b/examples/models/llama/experimental/TARGETS similarity index 100% rename from examples/models/llama2/experimental/TARGETS rename to examples/models/llama/experimental/TARGETS diff --git a/examples/models/llama2/experimental/generate.py b/examples/models/llama/experimental/generate.py similarity index 98% rename from examples/models/llama2/experimental/generate.py rename to examples/models/llama/experimental/generate.py index bc974d7351..d09772c309 100644 --- a/examples/models/llama2/experimental/generate.py +++ b/examples/models/llama/experimental/generate.py @@ -11,7 +11,7 @@ import torch -from executorch.examples.models.llama2.experimental.load_gguf_q4_0 import load_gguf_q4_0 +from executorch.examples.models.llama.experimental.load_gguf_q4_0 import load_gguf_q4_0 from sentencepiece import SentencePieceProcessor diff --git a/examples/models/llama2/experimental/load_gguf_q4_0.py b/examples/models/llama/experimental/load_gguf_q4_0.py similarity 
index 98% rename from examples/models/llama2/experimental/load_gguf_q4_0.py rename to examples/models/llama/experimental/load_gguf_q4_0.py index 4583978394..39b81ea64a 100644 --- a/examples/models/llama2/experimental/load_gguf_q4_0.py +++ b/examples/models/llama/experimental/load_gguf_q4_0.py @@ -14,7 +14,7 @@ from typing import Callable, Dict, Mapping import torch -from executorch.examples.models.llama2.experimental.subclass import ( +from executorch.examples.models.llama.experimental.subclass import ( _unpack_two_uint8, GGMLInt4LinearWeight, to_float, diff --git a/examples/models/llama2/experimental/subclass.py b/examples/models/llama/experimental/subclass.py similarity index 100% rename from examples/models/llama2/experimental/subclass.py rename to examples/models/llama/experimental/subclass.py diff --git a/examples/models/llama2/experimental/targets.bzl b/examples/models/llama/experimental/targets.bzl similarity index 100% rename from examples/models/llama2/experimental/targets.bzl rename to examples/models/llama/experimental/targets.bzl diff --git a/examples/models/llama2/experimental/test_subclass.py b/examples/models/llama/experimental/test_subclass.py similarity index 100% rename from examples/models/llama2/experimental/test_subclass.py rename to examples/models/llama/experimental/test_subclass.py diff --git a/examples/models/llama2/export_llama.py b/examples/models/llama/export_llama.py similarity index 100% rename from examples/models/llama2/export_llama.py rename to examples/models/llama/export_llama.py diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama/export_llama_lib.py similarity index 99% rename from examples/models/llama2/export_llama_lib.py rename to examples/models/llama/export_llama_lib.py index 8cff6e8e11..940bcaecbc 100644 --- a/examples/models/llama2/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -24,7 +24,7 @@ from executorch.devtools.etrecord import generate_etrecord -from executorch.examples.models.llama2.llama_transformer import ModelArgs +from executorch.examples.models.llama.llama_transformer import ModelArgs from executorch.extension.llm.export.builder import DType, LLMEdgeManager @@ -157,7 +157,7 @@ def build_args_parser() -> argparse.ArgumentParser: "--quantization_mode", type=str, default=None, - choices=["int8", "8da4w", "8da4w-gptq"], + choices=["int8", "8da4w", "8da4w-gptq", "vulkan_4w"], help="type of quantization", ) @@ -783,8 +783,8 @@ def _load_llama_model( f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}" ) model, example_inputs, example_kwarg_inputs, _ = EagerModelFactory.create_model( - "llama2", - "Llama2Model", + module_name="llama", + model_class_name="Llama2Model", checkpoint=checkpoint, checkpoint_dir=checkpoint_dir, params=params_path, diff --git a/examples/models/llama2/fairseq2.py b/examples/models/llama/fairseq2.py similarity index 100% rename from examples/models/llama2/fairseq2.py rename to examples/models/llama/fairseq2.py diff --git a/examples/models/llama2/install_requirement_helper.py b/examples/models/llama/install_requirement_helper.py similarity index 100% rename from examples/models/llama2/install_requirement_helper.py rename to examples/models/llama/install_requirement_helper.py diff --git a/examples/models/llama2/install_requirements.sh b/examples/models/llama/install_requirements.sh similarity index 92% rename from examples/models/llama2/install_requirements.sh rename to 
examples/models/llama/install_requirements.sh index 99783ff29c..470e804c2d 100755 --- a/examples/models/llama2/install_requirements.sh +++ b/examples/models/llama/install_requirements.sh @@ -19,4 +19,4 @@ pip install lm_eval==0.4.2 pip install tiktoken blobfile # Call the install helper for further setup -python examples/models/llama2/install_requirement_helper.py +python examples/models/llama/install_requirement_helper.py diff --git a/examples/models/llama2/llama_test.py b/examples/models/llama/llama_test.py similarity index 100% rename from examples/models/llama2/llama_test.py rename to examples/models/llama/llama_test.py diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama/llama_transformer.py similarity index 99% rename from examples/models/llama2/llama_transformer.py rename to examples/models/llama/llama_transformer.py index 8e17013ae3..4d39d131d1 100644 --- a/examples/models/llama2/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -14,7 +14,7 @@ import torch import torch.nn.functional as F -from executorch.examples.models.llama2.rope import ( +from executorch.examples.models.llama.rope import ( apply_rotary_emb, hf_apply_rotary_emb, hf_precompute_freqs_cis, diff --git a/examples/models/llama2/llama_via_xnnpack.gif b/examples/models/llama/llama_via_xnnpack.gif similarity index 100% rename from examples/models/llama2/llama_via_xnnpack.gif rename to examples/models/llama/llama_via_xnnpack.gif diff --git a/examples/models/llama2/main.cpp b/examples/models/llama/main.cpp similarity index 97% rename from examples/models/llama2/main.cpp rename to examples/models/llama/main.cpp index 339b2abfdb..5fe0ce93cf 100644 --- a/examples/models/llama2/main.cpp +++ b/examples/models/llama/main.cpp @@ -8,7 +8,7 @@ #include -#include +#include #if defined(ET_USE_THREADPOOL) #include diff --git a/examples/models/llama2/model.py b/examples/models/llama/model.py similarity index 99% rename from examples/models/llama2/model.py rename to examples/models/llama/model.py index 23f1c1b489..ad997de64c 100644 --- a/examples/models/llama2/model.py +++ b/examples/models/llama/model.py @@ -16,7 +16,7 @@ get_default_model_resource_dir, ) -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer try: from .fairseq2 import convert_to_llama_checkpoint diff --git a/examples/models/llama2/params/TARGETS b/examples/models/llama/params/TARGETS similarity index 100% rename from examples/models/llama2/params/TARGETS rename to examples/models/llama/params/TARGETS diff --git a/examples/models/llama2/params/demo_config.json b/examples/models/llama/params/demo_config.json similarity index 100% rename from examples/models/llama2/params/demo_config.json rename to examples/models/llama/params/demo_config.json diff --git a/examples/models/llama2/params/demo_rand_params.pth b/examples/models/llama/params/demo_rand_params.pth similarity index 100% rename from examples/models/llama2/params/demo_rand_params.pth rename to examples/models/llama/params/demo_rand_params.pth diff --git a/examples/models/llama2/rope.py b/examples/models/llama/rope.py similarity index 100% rename from examples/models/llama2/rope.py rename to examples/models/llama/rope.py diff --git a/examples/models/llama2/runner/CMakeLists.txt b/examples/models/llama/runner/CMakeLists.txt similarity index 100% rename from examples/models/llama2/runner/CMakeLists.txt rename to 
examples/models/llama/runner/CMakeLists.txt diff --git a/examples/models/llama2/runner/TARGETS b/examples/models/llama/runner/TARGETS similarity index 100% rename from examples/models/llama2/runner/TARGETS rename to examples/models/llama/runner/TARGETS diff --git a/examples/models/llama2/runner/eager.py b/examples/models/llama/runner/eager.py similarity index 56% rename from examples/models/llama2/runner/eager.py rename to examples/models/llama/runner/eager.py index 42357d6e55..42d11bdedf 100644 --- a/examples/models/llama2/runner/eager.py +++ b/examples/models/llama/runner/eager.py @@ -10,10 +10,13 @@ import torch -from examples.models.llama2.llama_transformer import ModelArgs -from executorch.examples.models.model_factory import EagerModelFactory - -from .generation import LlamaRunner +from examples.models.llama.llama_transformer import ModelArgs +from executorch.examples.models.llama2.export_llama_lib import ( + _prepare_for_llama_export, + build_args_parser as _build_args_parser, +) +from executorch.examples.models.llama2.runner.generation import LlamaRunner +from executorch.extension.llm.export import LLMEdgeManager class EagerLlamaRunner(LlamaRunner): @@ -25,21 +28,17 @@ def __init__(self, args): with open(args.params, "r") as f: params = json.loads(f.read()) model_args: ModelArgs = ModelArgs( - max_seq_len=args.max_len, + max_seq_len=args.max_seq_length, max_batch_size=1, - use_kv_cache=True, + use_kv_cache=args.use_kv_cache, **params, ) - super().__init__(tokenizer_path=args.tokenizer, model_args=model_args) - self.model, _, _, _ = EagerModelFactory.create_model( - "llama2", - "Llama2Model", - checkpoint=args.checkpoint, - params=args.params, - use_kv_cache=True, - fairseq2=False, - max_seq_len=args.max_len, - enable_dynamic_shape=True, + super().__init__(tokenizer_path=args.tokenizer_path, model_args=model_args) + manager: LLMEdgeManager = _prepare_for_llama_export("llama", args) + self.model = ( + manager.model.eval().to(device="cuda") + if torch.cuda.is_available() + else manager.model.eval().to(device="cpu") ) def forward( @@ -51,34 +50,7 @@ def forward( def build_args_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser() - - parser.add_argument( - "--checkpoint", - type=str, - default=None, - help="path to model checkpoint file", - ) - - parser.add_argument( - "--params", - type=str, - default=None, - help="model params file", - ) - - parser.add_argument( - "--max_len", - type=int, - default=128, - help="Maximum length of the generated response sequence.", - ) - - parser.add_argument( - "--tokenizer", - type=str, - default=None, - ) + parser = _build_args_parser() parser.add_argument( "--prompt", diff --git a/examples/models/llama2/runner/generation.py b/examples/models/llama/runner/generation.py similarity index 94% rename from examples/models/llama2/runner/generation.py rename to examples/models/llama/runner/generation.py index 6d43c84932..885249f9b9 100644 --- a/examples/models/llama2/runner/generation.py +++ b/examples/models/llama/runner/generation.py @@ -9,8 +9,8 @@ import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs -from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer +from executorch.examples.models.llama.llama_transformer import ModelArgs +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer class CompletionPrediction(TypedDict, total=False): @@ -45,9 +45,9 @@ def sample_top_p(probs, p): def next_token(logits: torch.Tensor, temperature: float, top_p: float) -> int: if 
temperature > 0: - probs = torch.softmax(logits[:, -1] / temperature, dim=-1) + probs = torch.softmax(logits / temperature, dim=-1) return sample_top_p(probs, top_p).item() - return torch.argmax(logits[:, -1], dim=-1).item() + return torch.argmax(logits, dim=-1).item() class LlamaRunner(ABC): diff --git a/examples/models/llama2/runner/native.py b/examples/models/llama/runner/native.py similarity index 97% rename from examples/models/llama2/runner/native.py rename to examples/models/llama/runner/native.py index b0d6c20e96..90e7fc46dd 100644 --- a/examples/models/llama2/runner/native.py +++ b/examples/models/llama/runner/native.py @@ -10,7 +10,7 @@ import torch -from examples.models.llama2.llama_transformer import ModelArgs +from examples.models.llama.llama_transformer import ModelArgs from executorch.extension.pybindings.portable_lib import _load_for_executorch # Load custom ops and quantized ops. diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama/runner/runner.cpp similarity index 98% rename from examples/models/llama2/runner/runner.cpp rename to examples/models/llama/runner/runner.cpp index a2ae053dd1..42a1a632dc 100644 --- a/examples/models/llama2/runner/runner.cpp +++ b/examples/models/llama/runner/runner.cpp @@ -9,13 +9,13 @@ // A simple llama2 runner that includes preprocessing and post processing logic. // The module takes in a string as input and emits a string as output. -#include +#include #include #include -#include +#include #include namespace example { diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama/runner/runner.h similarity index 100% rename from examples/models/llama2/runner/runner.h rename to examples/models/llama/runner/runner.h diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama/runner/targets.bzl similarity index 97% rename from examples/models/llama2/runner/targets.bzl rename to examples/models/llama/runner/targets.bzl index 96d47ffce2..de12dc4d10 100644 --- a/examples/models/llama2/runner/targets.bzl +++ b/examples/models/llama/runner/targets.bzl @@ -40,7 +40,7 @@ def define_common_targets(): "//executorch/kernels/quantized:generated_lib" + aten_suffix, "//executorch/runtime/core/exec_aten:lib" + aten_suffix, "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix, - "//executorch/examples/models/llama2/tokenizer:tiktoken", + "//executorch/examples/models/llama/tokenizer:tiktoken", "//executorch/extension/llm/tokenizer:bpe_tokenizer", ] + (_get_operator_lib(aten)) + ([ # Vulkan API currently cannot build on some platforms (e.g. 
Apple, FBCODE) diff --git a/examples/models/llama2/source_transformation/__init__.py b/examples/models/llama/source_transformation/__init__.py similarity index 100% rename from examples/models/llama2/source_transformation/__init__.py rename to examples/models/llama/source_transformation/__init__.py diff --git a/examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py b/examples/models/llama/source_transformation/apply_spin_quant_r1_r2.py similarity index 100% rename from examples/models/llama2/source_transformation/apply_spin_quant_r1_r2.py rename to examples/models/llama/source_transformation/apply_spin_quant_r1_r2.py diff --git a/examples/models/llama2/source_transformation/lora.py b/examples/models/llama/source_transformation/lora.py similarity index 100% rename from examples/models/llama2/source_transformation/lora.py rename to examples/models/llama/source_transformation/lora.py diff --git a/examples/models/llama2/source_transformation/pre_quantization.py b/examples/models/llama/source_transformation/pre_quantization.py similarity index 100% rename from examples/models/llama2/source_transformation/pre_quantization.py rename to examples/models/llama/source_transformation/pre_quantization.py diff --git a/examples/models/llama2/source_transformation/prune_output.py b/examples/models/llama/source_transformation/prune_output.py similarity index 100% rename from examples/models/llama2/source_transformation/prune_output.py rename to examples/models/llama/source_transformation/prune_output.py diff --git a/examples/models/llama2/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py similarity index 99% rename from examples/models/llama2/source_transformation/quantize.py rename to examples/models/llama/source_transformation/quantize.py index 3879c00fd9..274fc447b3 100644 --- a/examples/models/llama2/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -12,6 +12,8 @@ import torch.nn as nn import torch.nn.functional as F +from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer + from executorch.extension.llm.export.builder import DType from sentencepiece import SentencePieceProcessor @@ -31,7 +33,7 @@ fsLinear = nn.Linear -def quantize( +def quantize( # noqa C901 model: torch.nn.Module, qmode: str, activation_dtype: Optional[DType], @@ -131,6 +133,9 @@ def quantize( ) model = gptq_quantizer.quantize(model, inputs) return model + elif qmode == "vulkan_4w": + model = VkInt4WeightOnlyQuantizer().quantize(model) + return model else: raise Exception(f"Unrecognized quantize mode: {qmode}") diff --git a/examples/models/llama2/source_transformation/quantized_kv_cache.py b/examples/models/llama/source_transformation/quantized_kv_cache.py similarity index 99% rename from examples/models/llama2/source_transformation/quantized_kv_cache.py rename to examples/models/llama/source_transformation/quantized_kv_cache.py index 8eec7846d3..9977256975 100644 --- a/examples/models/llama2/source_transformation/quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/quantized_kv_cache.py @@ -9,7 +9,7 @@ import torch import torch.nn as nn -from executorch.examples.models.llama2.llama_transformer import KVCache +from executorch.examples.models.llama.llama_transformer import KVCache from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib # noqa: F401 diff --git a/examples/models/llama2/source_transformation/rms_norm.py b/examples/models/llama/source_transformation/rms_norm.py 
similarity index 90% rename from examples/models/llama2/source_transformation/rms_norm.py rename to examples/models/llama/source_transformation/rms_norm.py index ff7e8b6745..3d94f73b63 100644 --- a/examples/models/llama2/source_transformation/rms_norm.py +++ b/examples/models/llama/source_transformation/rms_norm.py @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. import torch -from executorch.examples.models.llama2.llama_transformer import RMSNorm +from executorch.examples.models.llama.llama_transformer import RMSNorm def replace_rms_norm_with_native_rms_norm(module: torch.nn.Module): diff --git a/examples/models/llama2/source_transformation/rope.py b/examples/models/llama/source_transformation/rope.py similarity index 100% rename from examples/models/llama2/source_transformation/rope.py rename to examples/models/llama/source_transformation/rope.py diff --git a/examples/models/llama2/source_transformation/sdpa.py b/examples/models/llama/source_transformation/sdpa.py similarity index 98% rename from examples/models/llama2/source_transformation/sdpa.py rename to examples/models/llama/source_transformation/sdpa.py index bda6966fa1..f8362648f3 100644 --- a/examples/models/llama2/source_transformation/sdpa.py +++ b/examples/models/llama/source_transformation/sdpa.py @@ -13,8 +13,8 @@ import torch -from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA -from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.llama_transformer import KVCache, SDPA +from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( QuantizedKVCache, ) diff --git a/examples/models/llama2/source_transformation/spin_quant.py b/examples/models/llama/source_transformation/spin_quant.py similarity index 97% rename from examples/models/llama2/source_transformation/spin_quant.py rename to examples/models/llama/source_transformation/spin_quant.py index f544e9e1f6..e07b78dc6e 100644 --- a/examples/models/llama2/source_transformation/spin_quant.py +++ b/examples/models/llama/source_transformation/spin_quant.py @@ -14,7 +14,7 @@ import torch.nn.functional as F -from executorch.examples.models.llama2.llama_transformer import FeedForward +from executorch.examples.models.llama.llama_transformer import FeedForward from torch import nn diff --git a/examples/models/llama2/source_transformation/test_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_quantized_kv_cache.py similarity index 96% rename from examples/models/llama2/source_transformation/test_quantized_kv_cache.py rename to examples/models/llama/source_transformation/test_quantized_kv_cache.py index 5fa5d1958d..2f38f96552 100644 --- a/examples/models/llama2/source_transformation/test_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_quantized_kv_cache.py @@ -8,9 +8,9 @@ import torch -from executorch.examples.models.llama2.llama_transformer import KVCache +from executorch.examples.models.llama.llama_transformer import KVCache -from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( QuantizedCacheType, QuantizedKVCache, ) diff --git a/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py similarity index 92% rename from 
examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py rename to examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py index 4755d45499..65c6678ab2 100644 --- a/examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py +++ b/examples/models/llama/source_transformation/test_sdpa_with_quantized_kv_cache.py @@ -8,14 +8,14 @@ import torch -from executorch.examples.models.llama2.llama_transformer import KVCache +from executorch.examples.models.llama.llama_transformer import KVCache -from executorch.examples.models.llama2.source_transformation.quantized_kv_cache import ( +from executorch.examples.models.llama.source_transformation.quantized_kv_cache import ( QuantizedCacheType, QuantizedKVCache, ) -from executorch.examples.models.llama2.source_transformation.sdpa import SDPACustom +from executorch.examples.models.llama.source_transformation.sdpa import SDPACustom class SDPAWithQuantizedKVCacheTest(unittest.TestCase): diff --git a/examples/models/llama2/targets.bzl b/examples/models/llama/targets.bzl similarity index 91% rename from examples/models/llama2/targets.bzl rename to examples/models/llama/targets.bzl index 57e84256a4..40b26d6980 100644 --- a/examples/models/llama2/targets.bzl +++ b/examples/models/llama/targets.bzl @@ -15,7 +15,7 @@ def define_common_targets(): "-DUSE_ATEN_LIB", ] if aten else [], deps = [ - "//executorch/examples/models/llama2/runner:runner" + aten_suffix, + "//executorch/examples/models/llama/runner:runner" + aten_suffix, "//executorch/extension/evalue_util:print_evalue", "//executorch/extension/threadpool:threadpool", "//executorch/extension/threadpool:cpuinfo_utils", diff --git a/examples/models/llama2/test_llama_runner.sh b/examples/models/llama/test_llama_runner.sh similarity index 88% rename from examples/models/llama2/test_llama_runner.sh rename to examples/models/llama/test_llama_runner.sh index d0c44518ab..c55719f382 100644 --- a/examples/models/llama2/test_llama_runner.sh +++ b/examples/models/llama/test_llama_runner.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# Test Llama runner in examples/models/llama2/main.cpp +# Test Llama runner in examples/models/llama/main.cpp # 1. Export a llama-like model # 2. Build llama runner binary # 3. 
Run model with the llama runner binary with prompt diff --git a/examples/models/llama2/tests/TARGETS b/examples/models/llama/tests/TARGETS similarity index 64% rename from examples/models/llama2/tests/TARGETS rename to examples/models/llama/tests/TARGETS index 2e4dcf7d1f..8cea9aeff4 100644 --- a/examples/models/llama2/tests/TARGETS +++ b/examples/models/llama/tests/TARGETS @@ -9,8 +9,8 @@ python_unittest( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2:export_library", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:export_library", + "//executorch/examples/models/llama:llama_transformer", ], ) @@ -21,8 +21,8 @@ python_unittest( ], deps = [ "//caffe2:torch", - "//executorch/examples/models/llama2:export_library", - "//executorch/examples/models/llama2:llama_transformer", + "//executorch/examples/models/llama:export_library", + "//executorch/examples/models/llama:llama_transformer", "//pytorch/ao:torchao", ], ) diff --git a/examples/models/llama2/tests/test_pre_quantization_transforms.py b/examples/models/llama/tests/test_pre_quantization_transforms.py similarity index 95% rename from examples/models/llama2/tests/test_pre_quantization_transforms.py rename to examples/models/llama/tests/test_pre_quantization_transforms.py index 59cec2e72a..dc7c640dba 100644 --- a/examples/models/llama2/tests/test_pre_quantization_transforms.py +++ b/examples/models/llama/tests/test_pre_quantization_transforms.py @@ -7,14 +7,14 @@ import unittest import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer -from executorch.examples.models.llama2.source_transformation.pre_quantization import ( +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.source_transformation.pre_quantization import ( sanitize_checkpoint_from_pre_quantization, transform_embedding_for_pre_quantization, transform_linear_for_pre_quantization, transform_output_linear_for_pre_quantization, ) -from executorch.examples.models.llama2.source_transformation.quantize import ( +from executorch.examples.models.llama.source_transformation.quantize import ( dynamically_quantize_per_channel, ) from torchao.quantization.utils import group_quantize_tensor_symmetric diff --git a/examples/models/llama2/tests/test_simple_sdpa.py b/examples/models/llama/tests/test_simple_sdpa.py similarity index 92% rename from examples/models/llama2/tests/test_simple_sdpa.py rename to examples/models/llama/tests/test_simple_sdpa.py index 264ed3dde3..6e0c391960 100644 --- a/examples/models/llama2/tests/test_simple_sdpa.py +++ b/examples/models/llama/tests/test_simple_sdpa.py @@ -8,8 +8,8 @@ import unittest import torch -from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA -from executorch.examples.models.llama2.source_transformation.sdpa import SDPASimple +from executorch.examples.models.llama.llama_transformer import KVCache, SDPA +from executorch.examples.models.llama.source_transformation.sdpa import SDPASimple class SDPATest(unittest.TestCase): diff --git a/examples/models/llama2/tokenizer/TARGETS b/examples/models/llama/tokenizer/TARGETS similarity index 100% rename from examples/models/llama2/tokenizer/TARGETS rename to examples/models/llama/tokenizer/TARGETS diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.cpp b/examples/models/llama/tokenizer/llama_tiktoken.cpp similarity index 97% rename from examples/models/llama2/tokenizer/llama_tiktoken.cpp 
rename to examples/models/llama/tokenizer/llama_tiktoken.cpp index 5ce9d7f14c..74eacc1b5f 100644 --- a/examples/models/llama2/tokenizer/llama_tiktoken.cpp +++ b/examples/models/llama/tokenizer/llama_tiktoken.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include namespace example { diff --git a/examples/models/llama2/tokenizer/llama_tiktoken.h b/examples/models/llama/tokenizer/llama_tiktoken.h similarity index 100% rename from examples/models/llama2/tokenizer/llama_tiktoken.h rename to examples/models/llama/tokenizer/llama_tiktoken.h diff --git a/examples/models/llama2/tokenizer/targets.bzl b/examples/models/llama/tokenizer/targets.bzl similarity index 100% rename from examples/models/llama2/tokenizer/targets.bzl rename to examples/models/llama/tokenizer/targets.bzl diff --git a/examples/models/llama2/tokenizer/test/CMakeLists.txt b/examples/models/llama/tokenizer/test/CMakeLists.txt similarity index 100% rename from examples/models/llama2/tokenizer/test/CMakeLists.txt rename to examples/models/llama/tokenizer/test/CMakeLists.txt diff --git a/examples/models/llama2/tokenizer/test/TARGETS b/examples/models/llama/tokenizer/test/TARGETS similarity index 100% rename from examples/models/llama2/tokenizer/test/TARGETS rename to examples/models/llama/tokenizer/test/TARGETS diff --git a/examples/models/llama2/tokenizer/test/resources/test_tiktoken_tokenizer.model b/examples/models/llama/tokenizer/test/resources/test_tiktoken_tokenizer.model similarity index 100% rename from examples/models/llama2/tokenizer/test/resources/test_tiktoken_tokenizer.model rename to examples/models/llama/tokenizer/test/resources/test_tiktoken_tokenizer.model diff --git a/examples/models/llama2/tokenizer/test/targets.bzl b/examples/models/llama/tokenizer/test/targets.bzl similarity index 90% rename from examples/models/llama2/tokenizer/test/targets.bzl rename to examples/models/llama/tokenizer/test/targets.bzl index 842a5fc396..bd07e9e88c 100644 --- a/examples/models/llama2/tokenizer/test/targets.bzl +++ b/examples/models/llama/tokenizer/test/targets.bzl @@ -12,7 +12,7 @@ def define_common_targets(): "test_tiktoken.cpp", ], deps = [ - "//executorch/examples/models/llama2/tokenizer:tiktoken", + "//executorch/examples/models/llama/tokenizer:tiktoken", ], env = { "RESOURCES_PATH": "$(location :resources)/resources", diff --git a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp b/examples/models/llama/tokenizer/test/test_tiktoken.cpp similarity index 97% rename from examples/models/llama2/tokenizer/test/test_tiktoken.cpp rename to examples/models/llama/tokenizer/test/test_tiktoken.cpp index 5bd6515b67..b9309f9921 100644 --- a/examples/models/llama2/tokenizer/test/test_tiktoken.cpp +++ b/examples/models/llama/tokenizer/test/test_tiktoken.cpp @@ -6,7 +6,7 @@ * LICENSE file in the root directory of this source tree. */ -#include +#include #include diff --git a/examples/models/llama2/tokenizer/tiktoken.py b/examples/models/llama/tokenizer/tiktoken.py similarity index 100% rename from examples/models/llama2/tokenizer/tiktoken.py rename to examples/models/llama/tokenizer/tiktoken.py diff --git a/examples/models/llama2/README.md b/examples/models/llama2/README.md index 2260c8f825..8876c5c4e4 100644 --- a/examples/models/llama2/README.md +++ b/examples/models/llama2/README.md @@ -1,497 +1,2 @@ # Summary -This example demonstrates how to run a [llama models](https://www.llama.com/) on mobile via ExecuTorch. 
We use XNNPACK to accelerate the performance and 4-bit groupwise PTQ quantization to fit the model on a phone. - -Here are supported models: - -- Llama 3.2 1B and 3B -- Llama 3.1 8B -- Llama 3 8B -- Llama 2 7B - -Pretrained models are not included in this repo. Users are suggested to download them [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/). - -# What is Llama? -Llama is a collection of large language models that use publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence. - -Llama models have shown to perform well on a variety of natural language processing tasks, including language translation, question answering, and text summarization and are also capable of generating human-like text, making Llama models a useful tool for creative writing and other applications where natural language generation is important. - -Overall, Llama models are powerful and versatile language models that can be used for a wide range of natural language processing tasks. The model’s ability to generate coherent and contextually relevant text makes it particularly useful for applications such as chatbots, virtual assistants, and language translation. - -Please note that the models are subject to the [Llama 2 Acceptable Use Policy](https://github.com/facebookresearch/llama/blob/main/USE_POLICY.md), [Llama 3 Acceptable Use Policy](https://github.com/meta-llama/llama3/blob/main/USE_POLICY.md) and [Responsible Use Guide](https://ai.meta.com/static-resource/responsible-use-guide/). - - -# Results - -Since Llama 2 7B or Llama 3 8B model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model. - -For Llama 3.2 1B/3B, we validated the models by running them in their original bf16 datatype and unquantized on both Android and iOS phones. The 3B version required high-end phones with larger RAMs to fit the model. - -Additionally, 1B/3B models are sensitive to accuracy loss when regular PTQ quantization is applied, so we employed 4bit quantization using [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main) to achieve a good balance between accuracy, performance and memory. - - - - - - -
- Llama3.1 8B, 4bit quantized on Android phone
- Llama3.2 1B, unquantized, bf16 on Android phone.
- -## Quantization: -We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch/ao). - -We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Please note that LM Eval reports perplexity normalized by word count instead of token count. You may see different perplexity for WikiText from other sources if they implement it differntly. More details could be found [here](https://github.com/EleutherAI/lm-evaluation-harness/issues/2301). - -Below are the results for two different groupsizes, with max_seq_length 2048, and limit 1000. - -|Model | Baseline (FP32) | Groupwise 4-bit (128) | Groupwise 4-bit (256) -|--------|-----------------| ---------------------- | --------------- -|Llama 2 7B | 9.2 | 10.2 | 10.7 -|Llama 3 8B | 7.9 | 9.4 | 9.7 - -Note that groupsize less than 128 was not enabled, since such models were still too large. This is because our current efforts have focused on enabling FP32 and support for FP16 is under way. What this implies for model size is that 1) embedding table is in FP32 and 2) quantized weights scales are FP32. - -### SpinQuant for Llama 3.2 1B/3B models (Optional) - -To improve accuracy, we can use [SpinQuant](https://github.com/facebookresearch/SpinQuant/tree/main), a post-training quantization (PTQ) technique that generates new quantized weights. In the standard PTQ process, quantization may lead to a decrease in accuracy when there are outliers. The SpinQuant method takes the original weights and produces optimized quantized weights with minimal outliers, resulting in higher accuracy. This can be achieved without any finetuning of the weights and only requires 100 iterations on a single A100 node. - -SpinQuant can generate quantized weights that are [compatible with ExecuTorch](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch), specifically, it can be integrated with the existing optimized XNNPACK kernels (e.g., group-wise 4bit weight and 8bit dynamic activation). This allows developers to benefit from the higher accuracy of SpinQuant while also taking advantage of the strong performance of ExecuTorch acceleration. We enabled SpinQuant for Llama3.2 1B/3B models on ExecuTorch. - -

- Running Llama3.2 3B on Android phone.
- 4bit quantization using SpinQuant

- -## Enablement - -For Llama 3 8B and Llama3.1 8B, we have verified so far on iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S24+ and OnePlus 12 (with 16GB RAM). - -We have verified running Llama 2 7B [mobile applications](#step-6-build-mobile-apps) efficiently on select devices including the iPhone 15 Pro, iPhone 15 Pro Max, Samsung Galaxy S22 and S24, and OnePlus 12. - -## Performance - -### Llama 3.2 1B and 3B -Llama 3.2 1B and 3B performance was measured on the OnePlus 12 device. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on) for generating 128 tokens. - -|Model | bf16 | 4bit(*) via SpinQuant -|--------| ---------------------- | --------------- -|1B | 19.4 tokens/second | 53.41 tokens/second | -|3B | 7.76 tokens/second | 22.98 tokens/second | - -(*) With SpinQuant, we currently quantize 4-bit groupwise (with groupsize 32) weight, 8bit dynamic activation of all the linear layers of the model, except embedding and output layers. The embedding and output layers are quantized as 8-bit per-channel weight and 8-bit dynamic activation. - -### Llama3 8B and Llama3.1 8B -Llama 3 8B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). - -Note that since Llama3's vocabulary size is 4x that of Llama2, we had to quantize embedding lookup table as well. For these results embedding lookup table was groupwise quantized with 4-bits and group size of 32. - -|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) -|--------| ---------------------- | --------------- -|Galaxy S22 | 7.85 tokens/second | 8.4 tokens/second | -|Galaxy S24 | 10.91 tokens/second | 11.21 tokens/second | -|OnePlus 12 | 10.85 tokens/second | 11.02 tokens/second | - -### Llama2 7B -Llama 2 7B performance was measured on the Samsung Galaxy S22, S24, and OnePlus 12 devices. The performance measurement is expressed in terms of tokens per second using an [adb binary-based approach](#step-5-run-benchmark-on). - -|Device | Groupwise 4-bit (128) | Groupwise 4-bit (256) -|--------| ---------------------- | --------------- -|Galaxy S22 | 8.15 tokens/second | 8.3 tokens/second | -|Galaxy S24 | 10.66 tokens/second | 11.26 tokens/second | -|OnePlus 12 | 11.55 tokens/second | 11.6 tokens/second | - -# Instructions - -## Tested on - -- MacOS M1/M2, Linux. -- For Llama 2 7B, your device may require at least 32GB RAM. If this is a constraint for you, please try the smaller stories model. - -## Step 1: Setup -> :warning: **double check your python environment**: make sure `conda activate ` is run before all the bash and python scripts. - -1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch. For installation run `./install_requirements.sh --pybind xnnpack` -2. Run `examples/models/llama2/install_requirements.sh` to install a few dependencies. - - -## Step 2: Prepare model - -### Option A: Download and export Llama3.2 1B/3B model. - -1. Download `consolidated.00.pth`, `params.json` and `tokenizer.model` from [Llama website](https://www.llama.com/llama-downloads/) or [Hugging Face](https://huggingface.co/meta-llama/Llama-3.2-1B). For chat use-cases, download the instruct models. - -2. Export model and generate `.pte` file. Use original bfloat16 version, without any quantization. 
- -``` -# Set these paths to point to the downloaded files -LLAMA_CHECKPOINT=path/to/checkpoint.pth -LLAMA_PARAMS=path/to/params.json - -python -m examples.models.llama2.export_llama \ - --checkpoint "${LLAMA_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - -kv \ - --use_sdpa_with_kv_cache \ - -X \ - -d bf16 \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ - --output_name="llama3_2.pte" -``` - -Optionally, we can apply SpinQuant to quantize the model without sacrifacing too much accuracy loss. - -To use SpinQuant, follow its [instruction](https://github.com/facebookresearch/SpinQuant/tree/main?tab=readme-ov-file#3-export-to-executorch) for exporting checkpoint to ExecuTorch and then export the SpinQuant checkpoint. - -``` -# Set these paths to point to the exported files -LLAMA_QUANTIZED_CHECKPOINT=path/to/spinquant/checkpoint.pth -LLAMA_PARAMS=path/to/params.json - -python -m examples.models.llama2.export_llama \ - --checkpoint "${LLAMA_QUANTIZED_CHECKPOINT:?}" \ - --params "${LLAMA_PARAMS:?}" \ - --use_sdpa_with_kv_cache \ - -X \ - --preq_mode 8da4w_output_8da8w \ - --preq_group_size 32 \ - --max_seq_length 2048 \ - --output_name "llama3_2.pte" \ - -kv \ - -d fp32 \ - --preq_embedding_quantize 8,0 \ - --use_spin_quant native \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' -``` - -### Option B: Download and export Llama 3 8B instruct model - -You can export and run the original Llama 3 8B instruct model. - -1. Llama 3 pretrained parameters can be downloaded from [Meta's official Llama 3 repository](https://github.com/meta-llama/llama3/). - -2. Export model and generate `.pte` file - ``` - python -m examples.models.llama2.export_llama --checkpoint -p -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' --embedding-quantize 4,32 --output_name="llama3_kv_sdpa_xnn_qe_4_32.pte" - ``` - - Due to the larger vocabulary size of Llama 3, we recommend quantizing the embeddings with `--embedding-quantize 4,32` as shown above to further reduce the model size. - -### Option C: Download and export stories110M model - -If you want to deploy and run a smaller model for educational purposes. From `executorch` root: - -1. Download `stories110M.pt` and `tokenizer.model` from Github. - ``` - wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories110M.pt" - wget "https://raw.githubusercontent.com/karpathy/llama2.c/master/tokenizer.model" - ``` -2. Create params file. - ``` - echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps": 1e-05, "vocab_size": 32000}' > params.json - ``` -3. Export model and generate `.pte` file. - ``` - python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -X -kv - ``` - -### Option D: Download and export Llama 2 7B model - -You can export and run the original Llama 2 7B model. - -1. Llama 2 pretrained parameters can be downloaded from [Meta's official website](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) or from [Hugging Face](https://huggingface.co/meta-llama/Llama-2-7b). - -2. Edit `params.json` file. Replace `"vocab_size": -1` with `"vocab_size": 32000`. This is a short-term workaround. - -3. Export model and generate `.pte` file: - ``` - python -m examples.models.llama2.export_llama --checkpoint --params -kv --use_sdpa_with_kv_cache -X -qmode 8da4w --group_size 128 -d fp32 - ``` -4. Create tokenizer.bin. 
- ``` - python -m extension.llm.tokenizer.tokenizer -t -o tokenizer.bin - ``` - -### Option E: Download models from Hugging Face and convert from safetensor format to state dict - - -You can also download above models from [Hugging Face](https://huggingface.co/). Since ExecuTorch starts from a PyTorch model, a script like below can be used to convert the Hugging Face safetensors format to PyTorch's state dict. It leverages the utils provided by [TorchTune](https://github.com/pytorch/torchtune). - - -```Python -from torchtune.utils import FullModelHFCheckpointer -from torchtune.models import convert_weights -import torch - -# Convert from safetensors to TorchTune. Suppose the model has been downloaded from Hugging Face -checkpointer = FullModelHFCheckpointer( - checkpoint_dir='/home/.cache/huggingface/hub/models/snapshots/hash-number', - checkpoint_files=['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors'], - output_dir='/the/destination/dir' , - model_type='LLAMA3' # or other types that TorchTune supports -) - -print("loading checkpoint") -sd = checkpointer.load_checkpoint() - -# Convert from TorchTune to Meta (PyTorch native) -sd = convert_weights.tune_to_meta(sd['model']) - -print("saving checkpoint") -torch.save(sd, "/the/destination/dir/checkpoint.pth") -``` - -## (Optional) Finetuning - -If you want to finetune your model based on a specific dataset, PyTorch provides [TorchTune](https://github.com/pytorch/torchtune) - a native-Pytorch library for easily authoring, fine-tuning and experimenting with LLMs. - -Once you have [TorchTune installed](https://github.com/pytorch/torchtune?tab=readme-ov-file#get-started) you can finetune Llama2 7B model using LoRA on a single GPU, using the following command. This will produce a checkpoint where the LoRA weights are merged with the base model and so the output checkpoint will be in the same format as the original Llama2 model. - -``` -tune run lora_finetune_single_device \ ---config llama2/7B_lora_single_device \ -checkpointer.checkpoint_dir= \ -tokenizer.path=/tokenizer.model -``` - -To run full finetuning with Llama2 7B on a single device, you can use the following command. - -``` -tune run full_finetune_single_device \ ---config llama2/7B_full_single_device \ -checkpointer.checkpoint_dir= \ -tokenizer.path=/tokenizer.model -``` - -## Step 3: Evaluate model accuracy - -> Forewarning: Model evaluation without a GPU may take a long time, especially on larger models. - -We use [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness) to evaluate model accuracy. - -For base models, use the following example command to calculate its perplexity based on WikiText. -``` -python -m examples.models.llama2.eval_llama \ - -c \ - -p \ - -t \ - -kv \ - -d \ - --max_seq_len \ - --limit -``` - -For instruct models, use the following example command to calculate its MMLU score. -``` -python -m examples.models.llama2.eval_llama \ - -c \ - -p \ - -t \ - -kv \ - -d \ - --tasks mmlu \ - --num_fewshot 5 \ - --max_seq_len -``` - -## Step 4: Run on your computer to validate - -1. Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59). 
- ``` - cmake -DPYTHON_EXECUTABLE=python \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_ENABLE_LOGGING=1 \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-out . - - cmake --build cmake-out -j16 --target install --config Release - ``` -Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the session of Common Issues and Mitigations below for solutions. - -2. Build llama runner. - ``` - cmake -DPYTHON_EXECUTABLE=python \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -Bcmake-out/examples/models/llama2 \ - examples/models/llama2 - - cmake --build cmake-out/examples/models/llama2 -j16 --config Release - ``` - -3. Run model. Run options available [here](https://github.com/pytorch/executorch/blob/main/examples/models/llama2/main.cpp#L18-L40). - ``` - cmake-out/examples/models/llama2/llama_main --model_path= --tokenizer_path= --prompt= - ``` - -For Llama2 models, pass the converted `tokenizer.bin` file instead of `tokenizer.model`. - -To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON` - -## Step 5: Run benchmark on Android phone - -**1. Build llama runner binary for Android** - -*Pre-requisite*: Android NDK (tested with r27b) which can be downloaded from [here](https://developer.android.com/ndk/downloads). Note that the mac binary can be unpackaged and you can locate NDK folder from it. - -**1.1 Set Android NDK** -``` -export ANDROID_NDK= -``` -**1.2 Build executorch and associated libraries for android.** -``` -cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX=cmake-out-android \ - -DCMAKE_BUILD_TYPE=Release \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_ENABLE_LOGGING=1 \ - -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-out-android . - -cmake --build cmake-out-android -j16 --target install --config Release -``` - -**1.2 Build llama runner for android** -``` -cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-23 \ - -DCMAKE_INSTALL_PREFIX=cmake-out-android \ - -DCMAKE_BUILD_TYPE=Release \ - -DPYTHON_EXECUTABLE=python \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -Bcmake-out-android/examples/models/llama2 \ - examples/models/llama2 - -cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release -``` - -**2. 
Run on Android via adb shell** - -*Pre-requisite*: Make sure you enable USB debugging via developer options on your phone - -**2.1 Connect your android phone** - -**2.2 Upload model, tokenizer and llama runner binary to phone** -``` -adb shell mkdir -p /data/local/tmp/llama -adb push /data/local/tmp/llama/ -adb push /data/local/tmp/llama/ -adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/llama/ -``` - -**2.3 Run model** -``` -adb shell "cd /data/local/tmp/llama && ./llama_main --model_path --tokenizer_path --prompt \"Once upon a time\" --seq_len 120" -``` -## Step 6: Build Mobile apps - -### iOS - -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-ios.html) to for full instructions on building the iOS LLAMA Demo App. Rename `tokenizer.model` file to `tokenizer.bin` because the demo app looks for the tokenizer file with .bin extension. - -### Android -Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. - -## Optional: Smaller models delegated to other backends -Currently we supported lowering the stories model to other backends, including, CoreML, MPS and QNN. Please refer to the instruction -for each backend ([CoreML](https://pytorch.org/executorch/main/build-run-coreml.html), [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [QNN](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html)) before trying to lower them. After the backend library is installed, the script to export a lowered model is - -- Lower to CoreML: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --coreml -c stories110M.pt -p params.json ` -- MPS: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --mps -c stories110M.pt -p params.json ` -- QNN: `python -m examples.models.llama2.export_llama -kv --disable_dynamic_shape --qnn -c stories110M.pt -p params.json ` - -The iOS LLAMA app supports the CoreML and MPS model and the Android LLAMA app supports the QNN model. On Android, it also allow to cross compiler the llama runner binary, push to the device and run. - -For CoreML, there are 2 additional optional arguments: -* `--coreml-ios`: Specify the minimum iOS version to deploy (and turn on available optimizations). E.g. `--coreml-ios 18` will turn on [in-place KV cache](https://developer.apple.com/documentation/coreml/mlstate?language=objc) and [fused scaled dot product attention kernel](https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS18.transformers.scaled_dot_product_attention) (the resulting model will then need at least iOS 18 to run, though) -* `--coreml-quantize`: Use [quantization tailored for CoreML](https://apple.github.io/coremltools/docs-guides/source/opt-quantization-overview.html). E.g. `--coreml-quantize b4w` will perform per-block 4-bit weight-only quantization in a way tailored for CoreML - -# What is coming next? -## Quantization -- Enabling FP16 model to leverage smaller groupsize for 4-bit quantization. -- Enabling GPTQ for 4-bit groupwise quantization -- Enabling custom quantization -- Lower bit quantization -## Models -- Enabling more generative AI models and architectures. -- Enable support for mult-modal models like LlaVa. 
-## Performance -- Performance improvement via techniques such as speculative decoding -- Enabling LLama2 7b and other architectures via Vulkan -- Enabling performant execution of widely used quantization schemes. - - -# Notes -This example tries to reuse the Python code, with minimal modifications to make it compatible with current ExecuTorch: -1. Since ExecuTorch does not support complex Tensor data type, use the customized functions to have rotary embedding with real numbers. Please see [GitHub issue: Support complex data type in ExecuTorch](https://github.com/pytorch/executorch/issues/886). -2. No CUDA. ExecuTorch is focused on Edge use cases where CUDA is not available on most of the edge devices. -3. No dependencies on fairscale. The ColumnParallelLinear, ParallelEmbedding and training are not needed and supported in ExecuTorch. - - -# Common Issues and Mitigations: -- To clean your build: -``` -git clean -xfd -pip uninstall executorch -./install_requirements.sh --pybind xnnpack - -rm -rf cmake-out -``` -- If you encounter `pthread` related issues during link time, add `pthread` in `target_link_libraries` in `CMakeLists.txt` -- On Mac, if there is linking error in Step 4 with error message like -``` -0 0x100823648 __assert_rtn + 72 -1 0x10074bc5c ld::Fixup::applyFixup(ld::Atom const*, ld::LayoutLinkedImage const&, unsigned char*) const + 8268 -2 0x1007de7d8 ___ZN2ld16LayoutExecutable27writeContentWithoutLinkEditENSt3__14spanIhLm18446744073709551615EEEy_block_invoke + 332 -3 0x188cca428 _dispatch_client_callout2 + 20 -4 0x188cde850 _dispatch_apply_invoke3 + 336 -5 0x188cca3e8 _dispatch_client_callout + 20 -6 0x188ccbc68 _dispatch_once_callout + 32 -7 0x188cdeeec _dispatch_apply_invoke_and_wait + 372 -8 0x188cdde9c _dispatch_apply_with_attr_f + 1212 -9 0x188cde08c dispatch_apply + 96 -10 0x1007de9e4 void mapReduce(std::__1::span, unsigned long, void (unsigned long, mach_o::Error&, std::__1::span) block_pointer, void (std::__1::span) block_pointer) + 336 -11 0x1007de594 ld::LayoutExecutable::writeContentWithoutLinkEdit(std::__1::span, unsigned long long) + 1180 -12 0x1007e4020 ld::LayoutExecutable::writeToFile(char const*) + 15248 -13 0x1007962e8 main + 9424 -ld: Assertion failed: (extras.otherInstrOffset != 0 && "Kind::arm64_adrp_ldr missing extra info"), function applyFixup, file Fixup.cpp, line 793. -clang: error: linker command failed with exit code 1 (use -v to see invocation) -``` -It's a known issue for Xcode version 15.1. -Mitigation: update to most recent Xcode version, clean and rebuild. +For Llama2, please see the [Llama README page](../llama/README.md) for details. diff --git a/examples/models/llama3/README.md b/examples/models/llama3/README.md index 5ea3e6b9e1..1056f3d93f 100644 --- a/examples/models/llama3/README.md +++ b/examples/models/llama3/README.md @@ -1,2 +1,2 @@ # Summary -For Llama3, use the same example code, minus tokenizer, as Llama2. Please see the [Llama2 README page](../llama2/README.md) for details. +For Llama3, use the same example code, minus tokenizer, as Llama2. Please see the [Llama README page](../llama/README.md) for details. diff --git a/examples/models/llama3_2_vision/cross_attention/TARGETS b/examples/models/llama3_2_vision/cross_attention/TARGETS new file mode 100644 index 0000000000..2341af9282 --- /dev/null +++ b/examples/models/llama3_2_vision/cross_attention/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. 
+ +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/examples/models/llama3_2_vision/preprocess/TARGETS b/examples/models/llama3_2_vision/preprocess/TARGETS new file mode 100644 index 0000000000..2341af9282 --- /dev/null +++ b/examples/models/llama3_2_vision/preprocess/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/examples/models/llama3_2_vision/preprocess/export_preprocess.py b/examples/models/llama3_2_vision/preprocess/export_preprocess.py index a7d944a5b8..58c7909507 100644 --- a/examples/models/llama3_2_vision/preprocess/export_preprocess.py +++ b/examples/models/llama3_2_vision/preprocess/export_preprocess.py @@ -13,17 +13,17 @@ def main(): - # Export - ep = export_preprocess() # ExecuTorch - et = lower_to_executorch_preprocess(ep) + ep_et = export_preprocess() + et = lower_to_executorch_preprocess(ep_et) with open("preprocess_et.pte", "wb") as file: et.write_to_file(file) # AOTInductor + ep_aoti = export_preprocess() torch._inductor.aot_compile( - ep.module(), + ep_aoti.module(), get_example_inputs(), options={"aot_inductor.output_path": "preprocess_aoti.so"}, ) diff --git a/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py b/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py index 53bb2e400d..f3fe8188c0 100644 --- a/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py +++ b/examples/models/llama3_2_vision/preprocess/export_preprocess_lib.py @@ -43,7 +43,6 @@ def export_preprocess( max_num_tiles: int = 4, tile_size: int = 224, antialias: bool = False, - pad_max_tiles: bool = True, ) -> ExportedProgram: # Instantiate eager model. @@ -54,7 +53,6 @@ def export_preprocess( max_num_tiles=max_num_tiles, tile_size=tile_size, antialias=antialias, - pad_max_tiles=pad_max_tiles, ) # Replace non-exportable ops with custom ops. 
diff --git a/examples/models/llama3_2_vision/preprocess/test_preprocess.py b/examples/models/llama3_2_vision/preprocess/test_preprocess.py index 313097020a..73a3fd2960 100644 --- a/examples/models/llama3_2_vision/preprocess/test_preprocess.py +++ b/examples/models/llama3_2_vision/preprocess/test_preprocess.py @@ -15,6 +15,11 @@ from executorch.extension.pybindings import portable_lib # noqa # usort: skip from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # noqa # usort: skip +from executorch.examples.models.llama3_2_vision.preprocess.export_preprocess_lib import ( + export_preprocess, + get_example_inputs, + lower_to_executorch_preprocess, +) from executorch.extension.pybindings.portable_lib import ( _load_for_executorch_from_buffer, ) @@ -37,12 +42,6 @@ ) from torchvision.transforms.v2 import functional as F -from .export_preprocess_lib import ( - export_preprocess, - get_example_inputs, - lower_to_executorch_preprocess, -) - @dataclass class PreprocessConfig: @@ -54,7 +53,6 @@ class PreprocessConfig: tile_size: int = 224 max_num_tiles: int = 4 possible_resolutions = None - pad_max_tiles: bool = True class TestImageTransform(unittest.TestCase): @@ -137,17 +135,6 @@ def prepare_inputs( [1.0, 1.0], # expected_tile_max [0.0, 0.0], # expected_tile_min [1, 2], # expected_aspect_ratio - False, # pad_max_tiles - ), - ( - (100, 400, 3), # image_size - torch.Size([4, 3, 224, 224]), # expected shape - False, # resize_to_max_canvas - [0.2230, 0.1763, 0.0, 0.0], # expected_tile_means - [1.0, 1.0, 0.0, 0.0], # expected_tile_max - [0.0, 0.0, 0.0, 0.0], # expected_tile_min - [1, 2], # expected_aspect_ratio - True, # pad_max_tiles ), ( (1000, 300, 3), # image_size @@ -157,7 +144,6 @@ def prepare_inputs( [0.9976, 0.9940, 0.9936, 0.9906], # expected_tile_max [0.0037, 0.0047, 0.0039, 0.0], # expected_tile_min [4, 1], # expected_aspect_ratio - False, # pad_max_tiles ), ( (200, 200, 3), # image_size @@ -167,7 +153,6 @@ def prepare_inputs( [0.9921, 0.9925, 0.9969, 0.9908], # expected_tile_max [0.0056, 0.0069, 0.0059, 0.0032], # expected_tile_min [2, 2], # expected_aspect_ratio - False, # pad_max_tiles ), ( (600, 200, 3), # image_size @@ -177,17 +162,6 @@ def prepare_inputs( [1.0, 1.0, 1.0], # expected_tile_max [0.0, 0.0, 0.0], # expected_tile_min [3, 1], # expected_aspect_ratio - False, # pad_max_tiles - ), - ( - (600, 200, 3), # image_size - torch.Size([4, 3, 224, 224]), # expected shape - False, # resize_to_max_canvas - [0.4472, 0.4468, 0.3031, 0.0], # expected_tile_means - [1.0, 1.0, 1.0, 0.0], # expected_tile_max - [0.0, 0.0, 0.0, 0.0], # expected_tile_min - [3, 1], # expected_aspect_ratio - True, # pad_max_tiles ), ] ) @@ -200,11 +174,8 @@ def test_preprocess( expected_tile_max: List[float], expected_tile_min: List[float], expected_ar: List[int], - pad_max_tiles: bool, ) -> None: - config = PreprocessConfig( - resize_to_max_canvas=resize_to_max_canvas, pad_max_tiles=pad_max_tiles - ) + config = PreprocessConfig(resize_to_max_canvas=resize_to_max_canvas) reference_model = CLIPImageTransform( image_mean=config.image_mean, @@ -215,7 +186,6 @@ def test_preprocess( tile_size=config.tile_size, max_num_tiles=config.max_num_tiles, possible_resolutions=None, - pad_max_tiles=config.pad_max_tiles, ) eager_model = _CLIPImageTransform( @@ -225,7 +195,6 @@ def test_preprocess( antialias=config.antialias, tile_size=config.tile_size, max_num_tiles=config.max_num_tiles, - pad_max_tiles=config.pad_max_tiles, ) exported_model = export_preprocess( @@ -235,7 +204,6 @@ def test_preprocess( 
antialias=config.antialias, tile_size=config.tile_size, max_num_tiles=config.max_num_tiles, - pad_max_tiles=config.pad_max_tiles, ) executorch_model = lower_to_executorch_preprocess(exported_model) @@ -275,11 +243,8 @@ def test_preprocess( self.assertAlmostEqual(tile.min().item(), expected_tile_min[i], delta=1e-4) # Check num tiles matches the product of the aspect ratio. - if pad_max_tiles: - self.assertEqual(config.max_num_tiles, reference_image.shape[0]) - else: - expected_num_tiles = reference_ar[0] * reference_ar[1] - self.assertEqual(expected_num_tiles, reference_image.shape[0]) + expected_num_tiles = reference_ar[0] * reference_ar[1] + self.assertEqual(expected_num_tiles, reference_image.shape[0]) # Pre-work for eager and exported models. The reference model performs these # calculations and passes the result to _CLIPImageTransform, the exportable model. diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py index cc475f1b19..bdb30db735 100644 --- a/examples/models/llava/export_llava.py +++ b/examples/models/llava/export_llava.py @@ -12,15 +12,15 @@ ConfigPrecisionType, ) from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.examples.models.llama2.export_llama_lib import ( +from executorch.examples.models.llama.export_llama_lib import ( build_args_parser, get_quantizer_and_quant_params, ) -from executorch.examples.models.llama2.source_transformation.quantize import ( +from executorch.examples.models.llama.source_transformation.quantize import ( EmbeddingQuantHandler, get_quant_weight_transform, ) -from executorch.examples.models.llama2.source_transformation.sdpa import ( +from executorch.examples.models.llama.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) from executorch.examples.models.llava.image_util import serialize_image diff --git a/examples/models/llava/install_requirements.sh b/examples/models/llava/install_requirements.sh index facf3032b9..4dcdeea83b 100755 --- a/examples/models/llava/install_requirements.sh +++ b/examples/models/llava/install_requirements.sh @@ -12,4 +12,4 @@ pip install transformers accelerate sentencepiece tiktoken # Run llama2/install requirements for torchao deps SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -bash "$SCRIPT_DIR"/../llama2/install_requirements.sh +bash "$SCRIPT_DIR"/../llama/install_requirements.sh diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py index 8dcf286727..a24249d953 100644 --- a/examples/models/llava/model.py +++ b/examples/models/llava/model.py @@ -12,9 +12,9 @@ import requests import torch -from executorch.examples.models.llama2.llama_transformer import ModelArgs, Transformer +from executorch.examples.models.llama.llama_transformer import ModelArgs, Transformer -from executorch.examples.models.llama2.source_transformation.sdpa import ( +from executorch.examples.models.llama.source_transformation.sdpa import ( replace_sdpa_with_custom_op, ) from executorch.examples.models.llava.image_util import prepare_image diff --git a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama2/model/static_llama.py index 85f018e71f..ca9afb6fa9 100644 --- a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py +++ b/examples/qualcomm/oss_scripts/llama2/model/static_llama.py @@ -9,7 +9,7 @@ import torch import torch.nn as nn -from executorch.examples.models.llama2.llama_transformer import ( +from 
executorch.examples.models.llama.llama_transformer import ( FeedForward, ModelArgs, precompute_freqs_cis, diff --git a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt index 947b3ef975..4c493eb5a5 100644 --- a/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt +++ b/examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt @@ -78,7 +78,7 @@ list( list( APPEND _qaihub_llama3_8b_runner__srcs - ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama2/tokenizer/llama_tiktoken.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama/tokenizer/llama_tiktoken.cpp ) # build qaihub llama3 8b runner diff --git a/examples/qualcomm/qaihub_scripts/llama/README.md b/examples/qualcomm/qaihub_scripts/llama/README.md index d49ca4cc94..b0a3ca4645 100644 --- a/examples/qualcomm/qaihub_scripts/llama/README.md +++ b/examples/qualcomm/qaihub_scripts/llama/README.md @@ -21,7 +21,7 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o ```bash # tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model # tokenizer.bin: -python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin +python -m examples.models.llama.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin ``` #### Step3: Run default examples diff --git a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp index 721c16209c..959f6810ae 100644 --- a/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/qaihub_scripts/llama/runner/runner.cpp @@ -10,7 +10,7 @@ // logic. The module takes in a string as input and emits a string as output. #if defined(QAIHUB_LLAMA3_RUNNER) -#include +#include #else #include #endif diff --git a/examples/xnnpack/targets.bzl b/examples/xnnpack/targets.bzl index 35df8999b4..ce9575e8cc 100644 --- a/examples/xnnpack/targets.bzl +++ b/examples/xnnpack/targets.bzl @@ -40,7 +40,7 @@ def define_common_targets(): name = "aot_compiler", main_module = "executorch.examples.xnnpack.aot_compiler", resources = { - "//executorch/examples/models/llama2/params:params": "params", + "//executorch/examples/models/llama/params:params": "params", }, deps = [ ":xnnpack_aot_lib", diff --git a/extension/android/CMakeLists.txt b/extension/android/CMakeLists.txt index cc08071611..0ee8b042a2 100644 --- a/extension/android/CMakeLists.txt +++ b/extension/android/CMakeLists.txt @@ -152,8 +152,8 @@ if(EXECUTORCH_BUILD_LLAMA_JNI) ) add_subdirectory( - ${EXECUTORCH_ROOT}/examples/models/llama2/runner - ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama2/runner + ${EXECUTORCH_ROOT}/examples/models/llama/runner + ${CMAKE_CURRENT_BINARY_DIR}/../../examples/models/llama/runner ) endif() diff --git a/extension/android/jni/BUCK b/extension/android/jni/BUCK index a400138a04..6f269739c0 100644 --- a/extension/android/jni/BUCK +++ b/extension/android/jni/BUCK @@ -92,7 +92,7 @@ fb_android_cxx_library( "//fbandroid/native/fb:fb", "//third-party/glog:glog", "//xplat/executorch/backends/xnnpack:xnnpack_backend_static", - "//xplat/executorch/examples/models/llama2/runner:runner_static", + "//xplat/executorch/examples/models/llama/runner:runner_static", "//xplat/executorch/examples/models/llava/runner:runner_static", "//xplat/executorch/extension/module:module_static", "//xplat/executorch/extension/runner_util:inputs_static", diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index 
367ed3a966..6ffc88d810 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj index fe25a17384..c43b701e88 100644 --- a/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj +++ b/extension/benchmark/apple/Benchmark/Benchmark.xcodeproj/project.pbxproj @@ -67,10 +67,10 @@ 032A73FD2CAFBB7800932D36 /* tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tiktoken.h; sourceTree = ""; }; 032A73FE2CAFBB7800932D36 /* tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = tiktoken.cpp; sourceTree = ""; }; 032A73FF2CAFBB7800932D36 /* tokenizer.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = tokenizer.h; sourceTree = ""; }; - 032A74212CAFC1B300932D36 /* runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../../examples/models/llama2/runner/runner.h; sourceTree = SOURCE_ROOT; }; - 032A74222CAFC1B300932D36 /* runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../../examples/models/llama2/runner/runner.cpp; sourceTree = SOURCE_ROOT; }; - 032A74242CAFC34800932D36 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = ../../../../examples/models/llama2/tokenizer/llama_tiktoken.h; sourceTree = SOURCE_ROOT; }; - 032A74252CAFC34800932D36 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama_tiktoken.cpp; path = ../../../../examples/models/llama2/tokenizer/llama_tiktoken.cpp; sourceTree = SOURCE_ROOT; }; + 032A74212CAFC1B300932D36 /* runner.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = runner.h; path = ../../../../examples/models/llama/runner/runner.h; sourceTree = SOURCE_ROOT; }; + 032A74222CAFC1B300932D36 /* runner.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = runner.cpp; path = ../../../../examples/models/llama/runner/runner.cpp; sourceTree = SOURCE_ROOT; }; + 032A74242CAFC34800932D36 /* llama_tiktoken.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = llama_tiktoken.h; path = ../../../../examples/models/llama/tokenizer/llama_tiktoken.h; sourceTree = SOURCE_ROOT; }; + 032A74252CAFC34800932D36 /* llama_tiktoken.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = llama_tiktoken.cpp; path = ../../../../examples/models/llama/tokenizer/llama_tiktoken.cpp; sourceTree = SOURCE_ROOT; }; 037C96A02C8A570B00B3DF38 /* Tests.xctestplan */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = Tests.xctestplan; sourceTree = ""; }; 03B0118B2CAC567900054791 /* DynamicTestCase.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DynamicTestCase.h; sourceTree = ""; }; 03B0118C2CAC567900054791 /* DynamicTestCase.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = DynamicTestCase.m; sourceTree = ""; }; diff --git a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm index f3558308c8..c03ad14517 100644 --- 
a/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm +++ b/extension/benchmark/apple/Benchmark/Tests/LLaMA/LLaMATests.mm @@ -8,7 +8,7 @@ #import "ResourceTestCase.h" -#import +#import using namespace ::executorch::extension; using namespace ::executorch::runtime; diff --git a/extension/export_util/export_hf_model.py b/extension/export_util/export_hf_model.py index e45ba8d417..929773fa4d 100644 --- a/extension/export_util/export_hf_model.py +++ b/extension/export_util/export_hf_model.py @@ -12,7 +12,7 @@ from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge from torch.nn.attention import SDPBackend -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM from transformers.generation.configuration_utils import GenerationConfig from transformers.integrations.executorch import convert_and_export_with_cache from transformers.modeling_utils import PreTrainedModel @@ -73,12 +73,11 @@ def main() -> None: print(f"{model.config}") print(f"{model.generation_config}") - tokenizer = AutoTokenizer.from_pretrained(args.hf_model_repo) - input_ids = tokenizer([""], return_tensors="pt").to(device)["input_ids"] + input_ids = torch.tensor([[1]], dtype=torch.long) cache_position = torch.tensor([0], dtype=torch.long) def _get_constant_methods(model: PreTrainedModel): - return { + metadata = { "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6, "get_bos_id": model.config.bos_token_id, "get_eos_id": model.config.eos_token_id, @@ -90,6 +89,7 @@ def _get_constant_methods(model: PreTrainedModel): "get_vocab_size": model.config.vocab_size, "use_kv_cache": model.generation_config.use_cache, } + return {k: v for k, v in metadata.items() if v is not None} with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): diff --git a/extension/gguf_util/converters/llama_converter.py b/extension/gguf_util/converters/llama_converter.py index 463e5a0fcf..63839adc5c 100644 --- a/extension/gguf_util/converters/llama_converter.py +++ b/extension/gguf_util/converters/llama_converter.py @@ -9,7 +9,7 @@ import torch import torch.nn as nn -from executorch.examples.models.llama2.llama_transformer import ( +from executorch.examples.models.llama.llama_transformer import ( ModelArgs as LlamaModelArgs, Transformer as LlamaTransformer, ) diff --git a/extension/llm/README.md b/extension/llm/README.md index 7f4baed7d3..ad50496682 100644 --- a/extension/llm/README.md +++ b/extension/llm/README.md @@ -18,7 +18,7 @@ Commonly used methods in this class include: - _to_executorch_: get the executorch graph with optional optimization passes. - _save_to_pte_: finally, the lowered and optimized graph can be saved into a .pte file for the runtime. -Some usage of LLMEdgeManager can be found in executorch/examples/models/llama2, and executorch/examples/models/llava. +Some usage of LLMEdgeManager can be found in executorch/examples/models/llama, and executorch/examples/models/llava. When the .pte file is exported and saved, we can load and run it in a runner (see below). @@ -44,6 +44,6 @@ Contains custom op, such as: ## runner It hosts the libary components used in a C++ llm runner. Currently, it hosts _stats.h_ on runtime status like token numbers and latency. -With the components above, an actual runner can be built for a model or a series of models. 
An example is in //executorch/examples/models/llama2/runner, where a C++ runner code is built to run Llama 2, 3, 3.1 and other models using the same architecture. +With the components above, an actual runner can be built for a model or a series of models. An example is in //executorch/examples/models/llama/runner, where a C++ runner code is built to run Llama 2, 3, 3.1 and other models using the same architecture. Usages can also be found in the [torchchat repo](https://github.com/pytorch/torchchat/tree/main/runner). diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index d2a413fc79..bd12c374b5 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -216,15 +216,15 @@ def pt2e_calibrate( ): logging.info("Run calibration...") try: - from executorch.examples.models.llama2.eval_llama_lib import ( + from executorch.examples.models.llama.eval_llama_lib import ( GraphModuleEvalWrapper, ) - from executorch.examples.models.llama2.evaluate import ( # pyre-ignore[21] + from executorch.examples.models.llama.evaluate import ( # pyre-ignore[21] evaluate_model, ) except ImportError: raise ImportError( - "Please install the llm eval dependency via examples/models/llama2/install_requirements.sh" + "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" ) tokenizer = get_tokenizer(tokenizer_path) diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py index 87feb080b4..d966de9a25 100644 --- a/extension/llm/export/partitioner_lib.py +++ b/extension/llm/export/partitioner_lib.py @@ -37,9 +37,6 @@ def get_vulkan_partitioner( assert ( dtype_override == "fp32" or dtype_override is None ), "Vulkan backend does not support non fp32 dtypes at the moment" - assert ( - quantization_mode is None - ), "Vulkan backend does not support quantization at the moment" from executorch.backends.vulkan.partitioner.vulkan_partitioner import ( VulkanPartitioner, ) diff --git a/extension/llm/tokenizer/targets.bzl b/extension/llm/tokenizer/targets.bzl index fa6cc915c4..3549083eda 100644 --- a/extension/llm/tokenizer/targets.bzl +++ b/extension/llm/tokenizer/targets.bzl @@ -23,7 +23,7 @@ def define_common_targets(): ], _is_external_target = True, deps = [ - "//executorch/examples/models/llama2/tokenizer:tiktoken_py", + "//executorch/examples/models/llama/tokenizer:tiktoken_py", ], external_deps = [ "sentencepiece-py", diff --git a/extension/llm/tokenizer/utils.py b/extension/llm/tokenizer/utils.py index 763febdf47..126a120327 100644 --- a/extension/llm/tokenizer/utils.py +++ b/extension/llm/tokenizer/utils.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from executorch.examples.models.llama2.tokenizer.tiktoken import Tokenizer as Tiktoken +from executorch.examples.models.llama.tokenizer.tiktoken import Tokenizer as Tiktoken from executorch.extension.llm.tokenizer.tokenizer import ( Tokenizer as SentencePieceTokenizer, ) diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 58ada0c246..0e708ab0b7 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -125,7 +125,7 @@ runtime::Result> Module::method_names() { runtime::Error Module::load_method( const std::string& method_name, - torch::executor::EventTracer* tracer) { + torch::executor::EventTracer* event_tracer) { if (!is_method_loaded(method_name)) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -153,7 +153,9 @@ runtime::Error Module::load_method( method_holder.planned_memory.get(), temp_allocator_.get()); method_holder.method = ET_UNWRAP_UNIQUE(program_->load_method( - method_name.c_str(), method_holder.memory_manager.get(), tracer)); + method_name.c_str(), + method_holder.memory_manager.get(), + event_tracer ? event_tracer : this->event_tracer())); method_holder.inputs.resize(method_holder.method->inputs_size()); methods_.emplace(method_name, std::move(method_holder)); } diff --git a/extension/module/module.h b/extension/module/module.h index f7c9b1c8c5..45ed38a7ff 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -133,7 +133,10 @@ class Module { * needed. The loaded method is cached to reuse the next time it's executed. * * @param[in] method_name The name of the method to load. - * @param[in] event_tracer A EventTracer used for tracking and logging events. + * @param[in] event_tracer Per-method event tracer to profile/trace methods + * individually. When not given, the event tracer passed to the Module + * constructor is used. Otherwise, this per-method event tracer takes + * precedence. * * @returns An Error to indicate success or failure. */ diff --git a/pytest.ini b/pytest.ini index 166890bd25..1ca39f0a50 100644 --- a/pytest.ini +++ b/pytest.ini @@ -15,7 +15,7 @@ addopts = examples/models/test devtools/ # examples - examples/models/llama2/tests + examples/models/llama/tests # examples/models/llava/test TODO: enable this # exir exir/_serialize/test @@ -45,7 +45,7 @@ addopts = --ignore=backends/xnnpack/test/ops/linear.py --ignore=backends/xnnpack/test/models/llama2_et_example.py # T200992559: Add torchao to ET as core dependency - --ignore=examples/models/llama2/tests/test_pre_quantization_transforms.py + --ignore=examples/models/llama/tests/test_pre_quantization_transforms.py --ignore=exir/backend/test/demos --ignore=exir/backend/test/test_backends.py --ignore=exir/backend/test/test_backends_lifted.py diff --git a/runtime/COMPATIBILITY.md b/runtime/COMPATIBILITY.md new file mode 100644 index 0000000000..7dc46115bb --- /dev/null +++ b/runtime/COMPATIBILITY.md @@ -0,0 +1,9 @@ +# Runtime Compatibility Policy + +This document will describe the compatibility guarantees between the [`.pte` file +format](https://pytorch.org/executorch/stable/pte-file-format.html) and the +ExecuTorch runtime. + +> [!IMPORTANT] +> The [canonical version of this document](https://github.com/pytorch/executorch/tree/main/runtime/COMPATIBILITY.md) +> is in the `main` branch of the `pytorch/executorch` GitHub repo. 
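Aside on the `extension/module` hunk above, since it changes observable behavior: `Module::load_method` now falls back to the tracer supplied when the `Module` was constructed whenever no per-method tracer is passed, while an explicitly supplied per-method tracer still takes precedence. The sketch below shows both paths under stated assumptions; the `ETDumpGen` header location, the `Module(file_path, load_mode, event_tracer)` constructor shape, the method names, and the `.pte` path are assumptions for illustration, not part of this patch.

```cpp
// Sketch only: exercises the tracer-precedence behavior described in the
// module.h docblock in this patch. Header path, constructor shape, tracer
// type, method names, and the .pte path are assumptions, not patch content.
#include <executorch/devtools/etdump/etdump_flatcc.h> // assumed location of ETDumpGen
#include <executorch/extension/module/module.h>

#include <memory>
#include <utility>

using executorch::extension::Module;

int main() {
  // Module-wide tracer handed to the constructor. After this patch, any
  // method loaded without its own tracer is traced with this one.
  auto default_tracer = std::make_unique<torch::executor::ETDumpGen>();
  Module module("model.pte", Module::LoadMode::MmapUseMlock,
                std::move(default_tracer));

  // No per-method tracer: falls back to the constructor-provided tracer.
  auto status = module.load_method("forward"); // "forward" is a placeholder name
  (void)status;

  // Per-method tracer: takes precedence over the module-wide one.
  torch::executor::ETDumpGen per_method_tracer;
  status = module.load_method("encode", &per_method_tracer); // placeholder name
  (void)status;

  return 0;
}
```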
diff --git a/test/run_oss_cpp_tests.sh b/test/run_oss_cpp_tests.sh index e771fd4b12..3b786bb49c 100755 --- a/test/run_oss_cpp_tests.sh +++ b/test/run_oss_cpp_tests.sh @@ -90,8 +90,8 @@ build_and_run_test() { -Bcmake-out/"${test_dir}" cmake --build cmake-out/"${test_dir}" -j9 - if [[ "$test_dir" =~ .*examples/models/llama2/tokenizer.* ]]; then - RESOURCES_PATH=$(realpath examples/models/llama2/tokenizer/test/resources) + if [[ "$test_dir" =~ .*examples/models/llama/tokenizer.* ]]; then + RESOURCES_PATH=$(realpath examples/models/llama/tokenizer/test/resources) elif [[ "$test_dir" =~ .*extension/llm/tokenizer.* ]]; then RESOURCES_PATH=$(realpath extension/llm/tokenizer/test/resources) else