Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit f1bfb18

Browse files
larryliu0820 and claude authored
Add CUDA backend to pybind (pytorch#15544)
This PR integrates CUDA and AOTI support into the pybind build system. The implementation starts by updating setup.py to automatically detect CUDA availability using install_utils.py functions, replacing the problematic sys.path hack with a clean importlib-based approach. This enables automatic building of CUDA and AOTI targets when CUDA is detected on the system. The changes then extend to CUDA runtime shims for SlimTensor support, update AOTI build targets and common shims, and enhance CI workflows for CUDA testing. The implementation ensures proper symbol resolution when loading AOTI-produced shared libraries.

## Test Plan

- [x] Verified setup.py imports work correctly without sys.path modifications
- [x] Confirmed CUDA detection functionality works as expected
- [x] Validated that install_utils functions are accessible via importlib approach
- [x] Tested setup.py basic functionality (--help, --version) still works
- [x] Confirmed Python syntax validation passes for all modified files

Co-Authored-By: Claude Sonnet 4 <[email protected]>
Co-authored-by: Claude Sonnet 4 <[email protected]>
1 parent 50c170c commit f1bfb18

8 files changed

Lines changed: 247 additions & 34 deletions

File tree

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -214,16 +214,39 @@ def test_llm_with_image_modality(
214214
recipe,
215215
"--output_dir",
216216
model_dir,
217-
"--use_custom_sdpa",
218-
"--use_custom_kv_cache",
219-
"--qlinear",
220-
"8da4w",
221-
"--qembedding",
222-
"8w",
223217
]
218+
219+
# Recipe-specific configurations
220+
if "xnnpack" in recipe:
221+
command += ["--use_custom_sdpa", "--use_custom_kv_cache"]
222+
if quantize:
223+
command += ["--qlinear", "8da4w", "--qembedding", "8w"]
224+
elif recipe == "cuda":
225+
command += ["--dtype", "bfloat16", "--device", "cuda"]
226+
if quantize:
227+
command += [
228+
"--qlinear",
229+
"4w",
230+
"--qlinear_encoder",
231+
"4w",
232+
"--qlinear_packing_format",
233+
"tile_packed_to_4d",
234+
"--qlinear_encoder_packing_format",
235+
"tile_packed_to_4d",
236+
]
237+
else:
238+
assert not quantize, f"Quantization not supported for {recipe} recipe"
239+
224240
if not run_only:
225241
cli_export(command, model_dir)
226242

243+
# CUDA artifact validation
244+
if recipe == "cuda":
245+
model_path = Path(model_dir) / "model.pte"
246+
cuda_blob_path = Path(model_dir) / "aoti_cuda_blob.ptd"
247+
assert model_path.exists(), f"Main model file not found: {model_path}"
248+
assert cuda_blob_path.exists(), f"CUDA blob not found: {cuda_blob_path}"
249+
227250
tokenizer = AutoTokenizer.from_pretrained(model_id)
228251
saved_files = tokenizer.save_pretrained(model_dir)
229252
tokenizer_path = get_tokenizer_path(model_dir, saved_files)
@@ -262,7 +285,14 @@ def test_llm_with_image_modality(
262285

263286
from executorch.extension.llm.runner import GenerationConfig, MultimodalRunner
264287

265-
runner = MultimodalRunner(f"{model_dir}/model.pte", tokenizer_path)
288+
# Recipe-specific MultimodalRunner instantiation
289+
if recipe == "cuda":
290+
runner = MultimodalRunner(
291+
f"{model_dir}/model.pte", tokenizer_path, f"{model_dir}/aoti_cuda_blob.ptd"
292+
)
293+
else:
294+
runner = MultimodalRunner(f"{model_dir}/model.pte", tokenizer_path)
295+
266296
generated_text = runner.generate_text_hf(
267297
inputs,
268298
GenerationConfig(max_new_tokens=128, temperature=0, echo=False),

.github/workflows/cuda-windows.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ jobs:
6666
echo "::endgroup::"
6767
6868
echo "::group::Setup ExecuTorch"
69+
# Disable MKL to avoid duplicate target error when conda has multiple MKL installations
70+
export USE_MKL=OFF
6971
PYTHON_EXECUTABLE=python ./install_executorch.sh
7072
echo "::endgroup::"
7173

.github/workflows/cuda.yml

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,8 @@ jobs:
164164
set -eux
165165
166166
echo "::group::Setup ExecuTorch"
167+
# Disable MKL to avoid duplicate target error when conda has multiple MKL installations
168+
export USE_MKL=OFF
167169
./install_executorch.sh
168170
echo "::endgroup::"
169171
@@ -221,3 +223,68 @@ jobs:
221223
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
222224
script: |
223225
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
226+
227+
test-cuda-pybind:
228+
name: test-cuda-pybind
229+
needs: export-model-cuda-artifact
230+
# This job downloads models exported by export-model-cuda-artifact and runs them using pybind.
231+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
232+
permissions:
233+
id-token: write
234+
contents: read
235+
secrets: inherit
236+
strategy:
237+
fail-fast: false
238+
matrix:
239+
model: ["gemma3-4b"]
240+
quantize: ["", "--quantize"]
241+
with:
242+
timeout: 120
243+
secrets-env: EXECUTORCH_HF_TOKEN
244+
download-artifact: google-gemma-3-4b-it-cuda-${{ matrix.quantize && 'quantized-int4-tile-packed' || 'non-quantized' }}
245+
runner: linux.g5.4xlarge.nvidia.gpu
246+
gpu-arch-type: cuda
247+
gpu-arch-version: 12.6
248+
use-custom-docker-registry: false
249+
submodules: recursive
250+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
251+
script: |
252+
set -eux
253+
254+
echo "::group::Setup ExecuTorch"
255+
# Disable MKL to avoid duplicate target error when conda has multiple MKL installations
256+
export USE_MKL=OFF
257+
./install_executorch.sh
258+
echo "::endgroup::"
259+
260+
echo "::group::Fix libstdc++ GLIBCXX version"
261+
# The embedded .so files in the CUDA blob require GLIBCXX_3.4.29
262+
# which the default conda libstdc++ doesn't have. Install a newer
263+
# libstdc++ from conda-forge and use it via LD_LIBRARY_PATH.
264+
conda install -y -c conda-forge 'libstdcxx-ng>=12'
265+
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
266+
# Verify the new libstdc++ has GLIBCXX_3.4.29
267+
strings /opt/conda/lib/libstdc++.so.6 | grep GLIBCXX_3.4.29 || {
268+
echo "Error: GLIBCXX_3.4.29 not found in /opt/conda/lib/libstdc++.so.6"
269+
exit 1
270+
}
271+
echo "::endgroup::"
272+
273+
echo "::group::Setup Huggingface"
274+
pip install -U "huggingface_hub[cli]<1.0"
275+
huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
276+
echo "::endgroup::"
277+
278+
echo "::group::Install optimum-executorch"
279+
OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
280+
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
281+
echo "::endgroup::"
282+
283+
echo "::group::Test CUDA Multimodal: ${{ matrix.model }} ${{ matrix.quantize }}"
284+
python .ci/scripts/test_huggingface_optimum_model.py \
285+
--model ${{ matrix.model }} \
286+
--recipe cuda \
287+
--model_dir "${RUNNER_ARTIFACT_DIR}" \
288+
--run_only \
289+
${{ matrix.quantize }}
290+
echo "::endgroup::"

CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,16 @@ if(EXECUTORCH_BUILD_CORTEX_M)
668668
list(APPEND _executorch_backends coretex_m_backend)
669669
endif()
670670

671+
# If CUDA, Metal, or pybind will need Torch, find it at root scope first so that
672+
# imported targets (e.g. MKL::MKL) aren't created in child directory scopes and
673+
# then duplicated when found again at root scope.
674+
if(EXECUTORCH_BUILD_CUDA
675+
OR EXECUTORCH_BUILD_METAL
676+
OR EXECUTORCH_BUILD_PYBIND
677+
)
678+
find_package_torch()
679+
endif()
680+
671681
# Build common AOTI functionality if needed by CUDA or Metal backends
672682
if(EXECUTORCH_BUILD_CUDA OR EXECUTORCH_BUILD_METAL)
673683
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
@@ -886,6 +896,15 @@ if(EXECUTORCH_BUILD_PYBIND)
886896
torch
887897
)
888898

899+
# Add the AOTI backend libraries to the pybind dependencies when CUDA or Metal is enabled
900+
if(EXECUTORCH_BUILD_CUDA)
901+
# CUDA uses SlimTensor-based shims
902+
list(APPEND _dep_libs aoti_cuda_backend)
903+
elseif(EXECUTORCH_BUILD_METAL)
904+
# Metal still uses ETensor-based shims (for now)
905+
list(APPEND _dep_libs aoti_common)
906+
endif()
907+
889908
# RPATH for _portable_lib.so
890909
set(_portable_lib_rpath "$ORIGIN/../../../torch/lib")
891910

backends/cuda/runtime/platform/platform.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <executorch/runtime/core/error.h>
1212
#include <executorch/runtime/core/result.h>
1313
#include <filesystem>
14+
#include <mutex>
1415
#include <string>
1516

1617
#ifdef _WIN32
@@ -41,6 +42,28 @@ executorch::runtime::Result<void*> load_library(
4142
}
4243

4344
#else
45+
// Before loading the delegate .so, we need to ensure symbols from the current
46+
// process (e.g., _portable_lib.so) are globally visible. Python loads modules
47+
// with RTLD_LOCAL by default, so we re-open the current module with
48+
// RTLD_GLOBAL | RTLD_NOLOAD to promote its symbols to global visibility.
49+
// This allows the delegate .so to resolve symbols like aoti_torch_dtype_*.
50+
static std::once_flag symbols_promoted_flag;
51+
std::call_once(symbols_promoted_flag, []() {
52+
Dl_info info;
53+
// Get info about a symbol we know exists in _portable_lib.so
54+
if (dladdr((void*)&load_library, &info) && info.dli_fname) {
55+
// Re-open with RTLD_GLOBAL | RTLD_NOLOAD to promote symbols
56+
void* handle =
57+
dlopen(info.dli_fname, RTLD_NOW | RTLD_GLOBAL | RTLD_NOLOAD);
58+
if (!handle) {
59+
ET_LOG(Error, "Failed to promote symbols: %s", dlerror());
60+
} else {
61+
// Close the handle after successful promotion
62+
dlclose(handle);
63+
}
64+
}
65+
});
66+
4467
std::string path_str = path.string();
4568
void* lib_handle = dlopen(path_str.c_str(), RTLD_LAZY | RTLD_LOCAL);
4669
if (lib_handle == nullptr) {

install_requirements.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,6 @@
1818
# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
1919
TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
2020

21-
# Supported CUDA versions - modify this to add/remove supported versions
22-
# Format: tuple of (major, minor) version numbers
23-
SUPPORTED_CUDA_VERSIONS = (
24-
(12, 6),
25-
(12, 8),
26-
(12, 9),
27-
(13, 0),
28-
)
29-
3021
# Since ExecuTorch often uses main-branch features of pytorch, only the nightly
3122
# pip versions will have the required features.
3223
#
@@ -39,7 +30,7 @@
3930
# https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ @lint-ignore
4031
#
4132
# NOTE: If you're changing this, update the corresponding supported CUDA versions in
42-
# SUPPORTED_CUDA_VERSIONS above if needed.
33+
# SUPPORTED_CUDA_VERSIONS in install_utils.py if needed.
4334

4435

4536
def install_requirements(use_pytorch_nightly):
@@ -53,7 +44,7 @@ def install_requirements(use_pytorch_nightly):
5344
sys.exit(1)
5445

5546
# Determine the appropriate PyTorch URL based on CUDA delegate status
56-
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS)
47+
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
5748

5849
# pip packages needed by exir.
5950
TORCH_PACKAGE = [
@@ -123,7 +114,7 @@ def install_requirements(use_pytorch_nightly):
123114

124115
def install_optional_example_requirements(use_pytorch_nightly):
125116
# Determine the appropriate PyTorch URL based on CUDA delegate status
126-
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS)
117+
torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE)
127118

128119
print("Installing torch domain libraries")
129120
DOMAIN_LIBRARIES = [

0 commit comments

Comments
 (0)