From 8222c247e49986186c7aefe7451ff899bf02ec7e Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@meta.com>
Date: Fri, 25 Oct 2024 12:23:38 -0700
Subject: [PATCH 1/3] Add rpath to libcustom_ops_aot_lib.dylib and
 libquantized_ops_aot_lib.dylib

Summary: As titled. The issue was observed in this CI job:
https://github.com/pytorch/torchchat/actions/runs/11523122333/job/32080481174?pr=1312

In that job, loading `libcustom_ops_aot_lib.dylib` into Python fails because
the dynamic loader cannot find `_portable_lib.cpython-310-darwin.so`.
This PR fixes it by adding an `LC_RPATH` entry to each dylib that points,
relative to the dylib's own location, at the directory containing
`_portable_lib.cpython-310-darwin.so`.

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:
---
 extension/llm/custom_ops/CMakeLists.txt | 28 +++++++++++++++++++++----
 kernels/quantized/CMakeLists.txt        | 22 +++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)
diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt
index d42e37f9bd..36b03a480f 100644
--- a/extension/llm/custom_ops/CMakeLists.txt
+++ b/extension/llm/custom_ops/CMakeLists.txt
@@ -59,9 +59,7 @@ target_include_directories(custom_ops PUBLIC "${_common_include_directories}")
 target_include_directories(
   custom_ops PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../include"
 )
-target_link_libraries(
-  custom_ops PUBLIC ${custom_ops_libs} executorch_core
-)
+target_link_libraries(custom_ops PUBLIC ${custom_ops_libs} executorch_core)
 
 target_compile_options(
   custom_ops PUBLIC ${_common_compile_options} -DET_USE_THREADPOOL
@@ -74,7 +72,8 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
   find_package(Torch CONFIG REQUIRED)
   add_library(
     custom_ops_aot_lib SHARED
-    ${_custom_ops__srcs} ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp
+    ${_custom_ops__srcs}
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/op_fast_hadamard_transform_aten.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/op_tile_crop_aot.cpp
@@ -110,5 +109,26 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
            ${_common_compile_options} -DET_USE_THREADPOOL
   )
 
+  # pip wheels will need to be able to find the dependent libraries. On Linux,
+  # the .so has non-absolute dependencies on libs like "_portable_lib.so"
+  # without paths; as long as we `import torch` first, those dependencies will
+  # work. But Apple dylibs do not support non-absolute dependencies, so we need
+  # to tell the loader where to look for its libraries. The LC_LOAD_DYLIB
+  # entries for the portable_lib libraries will look like
+  # "@rpath/_portable_lib.cpython-310-darwin.so", so we add an LC_RPATH entry
+  # that locates the pybindings directory relative to this library's own
+  # installed location (@loader_path). To see these LC_* values, run `otool -l
+  # libcustom_ops_aot_lib.dylib`.
+  if(APPLE)
+    set_target_properties(
+      custom_ops_aot_lib
+      PROPERTIES # Assume this library will be installed in
+                 # <site-packages>/executorch/extension/llm/custom_ops/, and the
+                 # _portable_lib.so is installed in
+                 # <site-packages>/executorch/extension/pybindings/
+                 BUILD_RPATH "@loader_path/../../pybindings"
+                 INSTALL_RPATH "@loader_path/../../pybindings"
+    )
+  endif()
   install(TARGETS custom_ops_aot_lib DESTINATION lib)
 endif()
diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index f073835c93..96305f638e 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -114,6 +114,28 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode"
       target_link_libraries(
         quantized_ops_aot_lib PUBLIC quantized_ops_pybind_lib
       )
+
+      # pip wheels will need to be able to find the dependent libraries. On
+      # Linux, the .so has non-absolute dependencies on libs like
+      # "_portable_lib.so" without paths; as long as we `import torch` first,
+      # those dependencies will work. But Apple dylibs do not support
+      # non-absolute dependencies, so we need to tell the loader where to look
+      # for its libraries. The LC_LOAD_DYLIB entries for the portable_lib
+      # libraries will look like "@rpath/_portable_lib.cpython-310-darwin.so",
+      # so we add an LC_RPATH entry that locates the pybindings directory
+      # relative to this library's own installed location. To see these LC_*
+      # values, run `otool -l libquantized_ops_lib.dylib`.
+      if(APPLE)
+        set_target_properties(
+          quantized_ops_lib
+          PROPERTIES # Assume this library will be installed in
+                     # <site-packages>/executorch/kernels/quantized/, and the
+                     # _portable_lib.so is installed in
+                     # <site-packages>/executorch/extension/pybindings/
+                     BUILD_RPATH "@loader_path/../../extension/pybindings"
+                     INSTALL_RPATH "@loader_path/../../extension/pybindings"
+        )
+      endif()
     endif()
   endif()
 endif()

From 08f4d84709d25f0b29766922bd9d6bf76e360804 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@fb.com>
Date: Fri, 25 Oct 2024 15:59:33 -0700
Subject: [PATCH 2/3] Add smoke test

---
 build/packaging/smoke_test.py    | 17 +++++++++++++++++
 kernels/quantized/CMakeLists.txt |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/build/packaging/smoke_test.py b/build/packaging/smoke_test.py
index be53ae5a37..1c640fb7c5 100644
--- a/build/packaging/smoke_test.py
+++ b/build/packaging/smoke_test.py
@@ -15,6 +15,14 @@
 # will fail and the process will exit.
 from executorch.extension.pybindings import portable_lib  # usort: skip
 
+# Import custom ops. This requires portable_lib to be loaded first.
+from executorch.extension.llm.custom_ops import (
+    sdpa_with_kv_cache,
+)  # usort: skip # noqa: F401, F403
+
+# Import quantized ops. This requires portable_lib to be loaded first.
+from executorch.kernels import quantized  # usort: skip # noqa: F401, F403
+
 # Import this after importing the ExecuTorch pybindings. If the pybindings
 # links against a different torch.so than this uses, there will be a set of
 # symbol comflicts; the process will either exit now, or there will be issues
@@ -75,6 +83,15 @@ def main():
     assert len(ops) > 0, "Empty operator list"
     print(f"Found {len(ops)} operators; first element '{ops[0]}'")
 
+    # Make sure custom ops are registered.
+    assert (
+        "llama::sdpa_with_kv_cache" in ops
+    ), f"sdpa_with_kv_cache not registered, Got ops: {ops}"
+
+    # Make sure quantized ops are registered.
+    assert (
+        "quantized_decomposed::add.out" in ops
+    ), f"quantized_decomposed::add.out not registered, Got ops: {ops}"
     # Export LinearModel to .pte data.
     pte_data: bytes = export_linear_model()
 
diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index 96305f638e..9d2b14d8eb 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -127,7 +127,7 @@ if(NOT CMAKE_GENERATOR STREQUAL "Xcode"
       # values, run `otool -l libquantized_ops_lib.dylib`.
       if(APPLE)
         set_target_properties(
-          quantized_ops_lib
+          quantized_ops_aot_lib
           PROPERTIES # Assume this library will be installed in
                      # <site-packages>/executorch/kernels/quantized/, and the
                      # _portable_lib.so is installed in

From 9113c47c0b71193248033524b7b5274596b77f98 Mon Sep 17 00:00:00 2001
From: Mengwei Liu <larryliu@fb.com>
Date: Fri, 25 Oct 2024 16:21:23 -0700
Subject: [PATCH 3/3] Lint

---
 build/packaging/smoke_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build/packaging/smoke_test.py b/build/packaging/smoke_test.py
index 1c640fb7c5..59778b50d0 100644
--- a/build/packaging/smoke_test.py
+++ b/build/packaging/smoke_test.py
@@ -16,9 +16,9 @@
 from executorch.extension.pybindings import portable_lib  # usort: skip
 
 # Import custom ops. This requires portable_lib to be loaded first.
-from executorch.extension.llm.custom_ops import (
+from executorch.extension.llm.custom_ops import (  # noqa: F401, F403
     sdpa_with_kv_cache,
-)  # usort: skip # noqa: F401, F403
+)  # usort: skip
 
 # Import quantized ops. This requires portable_lib to be loaded first.
 from executorch.kernels import quantized  # usort: skip # noqa: F401, F403