From 7490ad1a12827ab307817ac3b82b9c8886388baa Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Sep 2025 15:32:00 -0700
Subject: [PATCH 1/3] Updates KernelAttributes to avoid possible dangling handles.

---
 cuda_core/cuda/core/experimental/_module.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 63bb6ff26..61e5d3cb5 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0

+import weakref
 from collections import namedtuple
 from typing import Optional, Union
 from warnings import warn
@@ -60,12 +61,12 @@ class KernelAttributes:
     def __new__(self, *args, **kwargs):
         raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.")

-    slots = ("_handle", "_cache", "_backend_version", "_loader")
+    slots = ("_kernel", "_cache", "_backend_version", "_loader")

     @classmethod
-    def _init(cls, handle):
+    def _init(cls, kernel):
         self = super().__new__(cls)
-        self._handle = handle
+        self._kernel = weakref.ref(kernel)
         self._cache = {}

         self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
@@ -76,15 +77,18 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
         """Helper function to get a cached attribute or fetch and cache it if not present."""
         if device_id in self._cache and attribute in self._cache[device_id]:
             return self._cache[device_id][attribute]
+        kernel = self._kernel()
+        if kernel is None:
+            raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
         if self._backend_version == "new":
-            result = handle_return(self._loader["attribute"](attribute, self._handle, device_id))
+            result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id))
         else:  # "old" backend
             warn(
                 "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ",
                 RuntimeWarning,
                 stacklevel=2,
             )
-            result = handle_return(self._loader["attribute"](attribute, self._handle))
+            result = handle_return(self._loader["attribute"](attribute, kernel._handle))
         if device_id not in self._cache:
             self._cache[device_id] = {}
         self._cache[device_id][attribute] = result
@@ -365,7 +369,7 @@ class Kernel:

     """

-    __slots__ = ("_handle", "_module", "_attributes", "_occupancy")
+    __slots__ = ("_handle", "_module", "_attributes", "_occupancy", "__weakref__")

     def __new__(self, *args, **kwargs):
         raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.")
@@ -385,7 +389,7 @@ def _from_obj(cls, obj, mod):
     def attributes(self) -> KernelAttributes:
         """Get the read-only attributes of this kernel."""
         if self._attributes is None:
-            self._attributes = KernelAttributes._init(self._handle)
+            self._attributes = KernelAttributes._init(self)
         return self._attributes

     def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]:

From 651beb9243360c66f9482c94a98619f8a0be24bf Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Sep 2025 15:46:35 -0700
Subject: [PATCH 2/3] Simplifies the caching logic in KernelAttributes.
---
 cuda_core/cuda/core/experimental/_module.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 61e5d3cb5..4224c8972 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -75,8 +75,8 @@ def _init(cls, kernel):

     def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_attribute) -> int:
         """Helper function to get a cached attribute or fetch and cache it if not present."""
-        if device_id in self._cache and attribute in self._cache[device_id]:
-            return self._cache[device_id][attribute]
+        if (device_id, attribute) in self._cache:
+            return self._cache[device_id, attribute]
         kernel = self._kernel()
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
@@ -89,9 +89,7 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
             stacklevel=2,
         )
         result = handle_return(self._loader["attribute"](attribute, kernel._handle))
-        if device_id not in self._cache:
-            self._cache[device_id] = {}
-        self._cache[device_id][attribute] = result
+        self._cache[device_id, attribute] = result
         return result

     def max_threads_per_block(self, device_id: int = None) -> int:

From 17e4bc5e245fd8bdc0c25a64eb8c7e22c6b5f84f Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Wed, 10 Sep 2025 08:11:14 -0700
Subject: [PATCH 3/3] Slight change to caching logic. Fix in rst format to satisfy pre-commit script.

---
 cuda_bindings/docs/source/release/13.X.Y-notes.rst | 2 +-
 cuda_core/cuda/core/experimental/_module.py        | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/cuda_bindings/docs/source/release/13.X.Y-notes.rst b/cuda_bindings/docs/source/release/13.X.Y-notes.rst
index 4cf9bd940..0e0e82bad 100644
--- a/cuda_bindings/docs/source/release/13.X.Y-notes.rst
+++ b/cuda_bindings/docs/source/release/13.X.Y-notes.rst
@@ -16,7 +16,7 @@ Highlights
 * Automatic CUDA library path detection based on ``CUDA_HOME``, eliminating the need to manually set ``LIBRARY_PATH`` environment variables for installation.
 * The ``[all]`` optional dependencies now use ``cuda-toolkit`` with appropriate extras instead of individual packages. The NVCC compiler is no longer automatically installed with ``pip install cuda-python[all]`` as it was previously included only to access the NVVM library, which now has its own dedicated wheel. Users who need the NVCC compiler should explicitly install it with ``pip install cuda-toolkit[nvcc]==X.Y`` with the appropriate version for their needs.
-* The Python overhead of calling functions in CUDA bindings in `driver`, `runtime` and `nvrtc` has been reduced by approximately 30%.
+* The Python overhead of calling functions in CUDA bindings in ``driver``, ``runtime`` and ``nvrtc`` has been reduced by approximately 30%.
 
 Known issues
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 4224c8972..c659a8d78 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -75,8 +75,10 @@ def _init(cls, kernel):

     def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_attribute) -> int:
         """Helper function to get a cached attribute or fetch and cache it if not present."""
-        if (device_id, attribute) in self._cache:
-            return self._cache[device_id, attribute]
+        cache_key = device_id, attribute
+        result = self._cache.get(cache_key, cache_key)
+        if result is not cache_key:
+            return result
         kernel = self._kernel()
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
@@ -89,7 +91,7 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
             stacklevel=2,
         )
         result = handle_return(self._loader["attribute"](attribute, kernel._handle))
-        self._cache[device_id, attribute] = result
+        self._cache[cache_key] = result
         return result

     def max_threads_per_block(self, device_id: int = None) -> int:
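
The series converges on two small patterns that are easier to see in isolation: KernelAttributes keeps only a weakref back to its Kernel, so the cached-attributes helper can neither keep a released kernel handle alive nor dereference it after the kernel is gone, and the attribute cache becomes one flat dict keyed by (device_id, attribute) tuples, probed once via dict.get with the key itself serving as the miss sentinel. The sketch below is a minimal standalone illustration of those two patterns under simplified assumptions, not the cuda.core code; Attributes, Kernel, and _query_driver are made-up stand-ins for KernelAttributes, Kernel, and the real driver attribute call.

import weakref


def _query_driver(handle, attribute, device_id):
    # Stand-in for the real driver query; returns a deterministic fake value.
    return hash((handle, attribute, device_id)) % 1024


class Attributes:
    def __init__(self, kernel):
        self._kernel = weakref.ref(kernel)  # back-reference that does not keep the kernel alive
        self._cache = {}                    # flat cache: {(device_id, attribute): value}

    def get(self, device_id, attribute):
        cache_key = device_id, attribute
        result = self._cache.get(cache_key, cache_key)  # the key itself doubles as a "miss" sentinel
        if result is not cache_key:
            return result                   # cache hit: no kernel access needed
        kernel = self._kernel()             # dereference the weakref only on a miss
        if kernel is None:
            raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
        result = _query_driver(kernel._handle, attribute, device_id)
        self._cache[cache_key] = result
        return result


class Kernel:
    def __init__(self, handle):
        self._handle = handle
        self.attributes = Attributes(self)  # Attributes -> Kernel edge is weak, so no reference cycle


kernel = Kernel(handle=0xBEEF)
print(kernel.attributes.get(0, "MAX_THREADS_PER_BLOCK"))  # queries the fake driver and caches the value
print(kernel.attributes.get(0, "MAX_THREADS_PER_BLOCK"))  # served from the cache

attrs = kernel.attributes
del kernel                                  # drops the only strong reference to the Kernel
try:
    attrs.get(1, "MAX_THREADS_PER_BLOCK")   # uncached key: needs the kernel, which is gone
except RuntimeError as exc:
    print(exc)

Reusing the key tuple as the sentinel avoids both a second dictionary probe (compared with an "in" check followed by indexing) and the allocation of a dedicated sentinel object; the identity test is safe because a freshly built non-empty tuple is never the same object as any previously cached value.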