From 7490ad1a12827ab307817ac3b82b9c8886388baa Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Sep 2025 15:32:00 -0700
Subject: [PATCH 1/3] Updates KernelAttributes to avoid possible dangling handles.

---
 cuda_core/cuda/core/experimental/_module.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 63bb6ff26..61e5d3cb5 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -2,6 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0

+import weakref
 from collections import namedtuple
 from typing import Optional, Union
 from warnings import warn
@@ -60,12 +61,12 @@ class KernelAttributes:
     def __new__(self, *args, **kwargs):
         raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.")

-    slots = ("_handle", "_cache", "_backend_version", "_loader")
+    slots = ("_kernel", "_cache", "_backend_version", "_loader")

     @classmethod
-    def _init(cls, handle):
+    def _init(cls, kernel):
         self = super().__new__(cls)
-        self._handle = handle
+        self._kernel = weakref.ref(kernel)
         self._cache = {}

         self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
@@ -76,15 +77,18 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
         """Helper function to get a cached attribute or fetch and cache it if not present."""
         if device_id in self._cache and attribute in self._cache[device_id]:
             return self._cache[device_id][attribute]
+        kernel = self._kernel()
+        if kernel is None:
+            raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
         if self._backend_version == "new":
-            result = handle_return(self._loader["attribute"](attribute, self._handle, device_id))
+            result = handle_return(self._loader["attribute"](attribute, kernel._handle, device_id))
         else:  # "old" backend
             warn(
                 "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ",
                 RuntimeWarning,
                 stacklevel=2,
             )
-            result = handle_return(self._loader["attribute"](attribute, self._handle))
+            result = handle_return(self._loader["attribute"](attribute, kernel._handle))
         if device_id not in self._cache:
             self._cache[device_id] = {}
         self._cache[device_id][attribute] = result
@@ -365,7 +369,7 @@ class Kernel:

     """

-    __slots__ = ("_handle", "_module", "_attributes", "_occupancy")
+    __slots__ = ("_handle", "_module", "_attributes", "_occupancy", "__weakref__")

     def __new__(self, *args, **kwargs):
         raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.")
@@ -385,7 +389,7 @@ def _from_obj(cls, obj, mod):
     def attributes(self) -> KernelAttributes:
         """Get the read-only attributes of this kernel."""
         if self._attributes is None:
-            self._attributes = KernelAttributes._init(self._handle)
+            self._attributes = KernelAttributes._init(self)
         return self._attributes

     def _get_arguments_info(self, param_info=False) -> tuple[int, list[ParamInfo]]:

From 651beb9243360c66f9482c94a98619f8a0be24bf Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Tue, 9 Sep 2025 15:46:35 -0700
Subject: [PATCH 2/3] Simplifies the caching logic in KernelAttributes.
---
 cuda_core/cuda/core/experimental/_module.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 61e5d3cb5..4224c8972 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -75,8 +75,8 @@ def _init(cls, kernel):

     def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_attribute) -> int:
         """Helper function to get a cached attribute or fetch and cache it if not present."""
-        if device_id in self._cache and attribute in self._cache[device_id]:
-            return self._cache[device_id][attribute]
+        if (device_id, attribute) in self._cache:
+            return self._cache[device_id, attribute]
         kernel = self._kernel()
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
@@ -89,9 +89,7 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
             stacklevel=2,
         )
         result = handle_return(self._loader["attribute"](attribute, kernel._handle))
-        if device_id not in self._cache:
-            self._cache[device_id] = {}
-        self._cache[device_id][attribute] = result
+        self._cache[device_id, attribute] = result
         return result

     def max_threads_per_block(self, device_id: int = None) -> int:

From 17e4bc5e245fd8bdc0c25a64eb8c7e22c6b5f84f Mon Sep 17 00:00:00 2001
From: Andy Jost
Date: Wed, 10 Sep 2025 08:11:14 -0700
Subject: [PATCH 3/3] Slight change to caching logic. Fix in rst format to satisfy pre-commit script.

---
 cuda_bindings/docs/source/release/13.X.Y-notes.rst | 2 +-
 cuda_core/cuda/core/experimental/_module.py        | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/cuda_bindings/docs/source/release/13.X.Y-notes.rst b/cuda_bindings/docs/source/release/13.X.Y-notes.rst
index 4cf9bd940..0e0e82bad 100644
--- a/cuda_bindings/docs/source/release/13.X.Y-notes.rst
+++ b/cuda_bindings/docs/source/release/13.X.Y-notes.rst
@@ -16,7 +16,7 @@ Highlights
 * Automatic CUDA library path detection based on ``CUDA_HOME``, eliminating the need to manually set ``LIBRARY_PATH`` environment variables for installation.
 * The ``[all]`` optional dependencies now use ``cuda-toolkit`` with appropriate extras instead of individual packages. The NVCC compiler is no longer automatically installed with ``pip install cuda-python[all]`` as it was previously included only to access the NVVM library, which now has its own dedicated wheel. Users who need the NVCC compiler should explicitly install it with ``pip install cuda-toolkit[nvcc]==X.Y`` with the appropriate version for their needs.
-* The Python overhead of calling functions in CUDA bindings in `driver`, `runtime` and `nvrtc` has been reduced by approximately 30%.
+* The Python overhead of calling functions in CUDA bindings in ``driver``, ``runtime`` and ``nvrtc`` has been reduced by approximately 30%.
 
 Known issues
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
index 4224c8972..c659a8d78 100644
--- a/cuda_core/cuda/core/experimental/_module.py
+++ b/cuda_core/cuda/core/experimental/_module.py
@@ -75,8 +75,10 @@ def _init(cls, kernel):

     def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_attribute) -> int:
         """Helper function to get a cached attribute or fetch and cache it if not present."""
-        if (device_id, attribute) in self._cache:
-            return self._cache[device_id, attribute]
+        cache_key = device_id, attribute
+        result = self._cache.get(cache_key, cache_key)
+        if result is not cache_key:
+            return result
         kernel = self._kernel()
         if kernel is None:
             raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
@@ -89,7 +91,7 @@ def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_att
             stacklevel=2,
         )
         result = handle_return(self._loader["attribute"](attribute, kernel._handle))
-        self._cache[device_id, attribute] = result
+        self._cache[cache_key] = result
         return result

     def max_threads_per_block(self, device_id: int = None) -> int:
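
The series converges on two small patterns that are easier to see in isolation: KernelAttributes keeps only a weakref back to its Kernel, so the cached-attributes helper can neither keep a released kernel handle alive nor dereference it after the kernel is gone, and the attribute cache becomes one flat dict keyed by (device_id, attribute) tuples, probed once via dict.get with the key itself serving as the miss sentinel. The sketch below is a minimal standalone illustration of those two patterns under simplified assumptions, not the cuda.core code; Attributes, Kernel, and _query_driver are made-up stand-ins for KernelAttributes, Kernel, and the real driver attribute call.

import weakref


def _query_driver(handle, attribute, device_id):
    # Stand-in for the real driver query; returns a deterministic fake value.
    return hash((handle, attribute, device_id)) % 1024


class Attributes:
    def __init__(self, kernel):
        self._kernel = weakref.ref(kernel)  # back-reference that does not keep the kernel alive
        self._cache = {}                    # flat cache: {(device_id, attribute): value}

    def get(self, device_id, attribute):
        cache_key = device_id, attribute
        result = self._cache.get(cache_key, cache_key)  # the key itself doubles as a "miss" sentinel
        if result is not cache_key:
            return result                   # cache hit: no kernel access needed
        kernel = self._kernel()             # dereference the weakref only on a miss
        if kernel is None:
            raise RuntimeError("Cannot access kernel attributes for expired Kernel object")
        result = _query_driver(kernel._handle, attribute, device_id)
        self._cache[cache_key] = result
        return result


class Kernel:
    def __init__(self, handle):
        self._handle = handle
        self.attributes = Attributes(self)  # Attributes -> Kernel edge is weak, so no reference cycle


kernel = Kernel(handle=0xBEEF)
print(kernel.attributes.get(0, "MAX_THREADS_PER_BLOCK"))  # queries the fake driver and caches the value
print(kernel.attributes.get(0, "MAX_THREADS_PER_BLOCK"))  # served from the cache

attrs = kernel.attributes
del kernel                                  # drops the only strong reference to the Kernel
try:
    attrs.get(1, "MAX_THREADS_PER_BLOCK")   # uncached key: needs the kernel, which is gone
except RuntimeError as exc:
    print(exc)

Reusing the key tuple as the sentinel avoids both a second dictionary probe (compared with an "in" check followed by indexing) and the allocation of a dedicated sentinel object; the identity test is safe because a freshly built non-empty tuple is never the same object as any previously cached value.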