Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
befa768
Implement non-pooling memory allocation.
Andy-Jost Nov 3, 2025
88834f7
Add GraphMemoryResource.
Andy-Jost Nov 5, 2025
57855a1
Remove mempool_enabled option now that GraphMemoryResource is ready.
Andy-Jost Nov 5, 2025
0375941
Add docstring and make GraphMemoryResource a singleton.
Andy-Jost Nov 5, 2025
0b82b1f
Move tests to a separate file.
Andy-Jost Nov 5, 2025
53b1c58
Add errors for DeviceMemoryResource and GraphMemoryResource when grap…
Andy-Jost Nov 6, 2025
1b8409b
Add tests for attributes and memory allocation escaping graphs.
Andy-Jost Nov 7, 2025
e9422b2
Simplify logic for converting IsStreamT arguments.
Andy-Jost Nov 10, 2025
3e21b9b
Standardize Stream arguments to IsStreamT. Update Buffer and MemoryRe…
Andy-Jost Nov 10, 2025
98ccfc7
Add tests for IsStreamT conversions.
Andy-Jost Nov 10, 2025
183f7af
Expand files named _gmr.*. Add __eq__ and __hash__ support to StreamW…
Andy-Jost Nov 12, 2025
13e3dfb
Fix format/lint issues.
Andy-Jost Nov 12, 2025
7408fe8
Minor clean up.
Andy-Jost Nov 13, 2025
c1dbc6a
Change public signatures to accept "Stream | GraphBuilder" where only …
Andy-Jost Nov 19, 2025
2f14443
Add deprecation warning when stream protocol is used with launch.
Andy-Jost Nov 20, 2025
02935fd
Merge branch 'main' into dmr-graph-support-2
Andy-Jost Nov 20, 2025
4ac5c99
Fix builds post-merge.
Andy-Jost Nov 20, 2025
e5ea645
Simplify GraphMemoryResourceAttributes.
Andy-Jost Nov 20, 2025
7042437
Simplify Stream_accept. Default arguments can more easily be handled …
Andy-Jost Nov 20, 2025
45db4ff
Adjust tests for platform-dependent behavior.
Andy-Jost Nov 20, 2025
7c3ad5f
Disable additional tests for platform-dependent behavior.
Andy-Jost Nov 20, 2025
af22c81
Merge branch 'main' into dmr-graph-support-2
Andy-Jost Nov 20, 2025
556c6bf
Adjust deallocation stream for legacy memory resources to avoid platf…
Andy-Jost Nov 21, 2025
1d07da1
Adjust test_graph_alloc to launch the graph multiple times.
Andy-Jost Nov 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
Buffer,
DeviceMemoryResource,
DeviceMemoryResourceOptions,
GraphMemoryResource,
LegacyPinnedMemoryResource,
MemoryResource,
VirtualMemoryResource,
Expand Down
6 changes: 2 additions & 4 deletions cuda_core/cuda/core/experimental/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ from cuda.bindings cimport cydriver
from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN

import threading
from typing import Union, TYPE_CHECKING
from typing import Optional, TYPE_CHECKING, Union

from cuda.core.experimental._context import Context, ContextOptions
from cuda.core.experimental._event import Event, EventOptions
Expand Down Expand Up @@ -1306,7 +1306,7 @@ class Device:
ctx = self._get_current_context()
return Event._init(self._id, ctx, options, True)

def allocate(self, size, stream: Stream | None = None) -> Buffer:
def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer:
"""Allocate device memory from a specified stream.

Allocates device memory of `size` bytes on the specified `stream`
Expand All @@ -1333,8 +1333,6 @@ class Device:

"""
self._check_context_initialized()
if stream is None:
stream = default_stream()
return self.memory_resource.allocate(size, stream)

def sync(self):
Expand Down
32 changes: 9 additions & 23 deletions cuda_core/cuda/core/experimental/_launcher.pyx
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
from cuda.core.experimental._stream cimport Stream_accept

from libc.stdint cimport uintptr_t

from cuda.core.experimental._stream cimport _try_to_get_stream_ptr

from typing import Union

from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config
from cuda.core.experimental._module import Kernel
from cuda.core.experimental._stream import IsStreamT, Stream
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
_reduce_3_tuple,
Expand Down Expand Up @@ -39,13 +35,13 @@ def _lazy_init():
_inited = True


def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kernel, *kernel_args):
def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kernel: Kernel, *kernel_args):
"""Launches a :obj:`~_module.Kernel`
object with launch-time configuration.

Parameters
----------
stream : :obj:`~_stream.Stream`
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
The stream establishing the stream ordering semantic of a
launch.
config : :obj:`LaunchConfig`
Expand All @@ -58,17 +54,7 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
launching kernel.

"""
if stream is None:
raise ValueError("stream cannot be None, stream must either be a Stream object or support __cuda_stream__")
try:
stream_handle = stream.handle
except AttributeError:
try:
stream_handle = driver.CUstream(<uintptr_t>(_try_to_get_stream_ptr(stream)))
except Exception:
raise ValueError(
f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
) from None
stream = Stream_accept(stream, allow_stream_protocol=True)
assert_type(kernel, Kernel)
_lazy_init()
config = check_or_create_options(LaunchConfig, config, "launch config")
Expand All @@ -85,20 +71,20 @@ def launch(stream: Union[Stream, IsStreamT], config: LaunchConfig, kernel: Kerne
# rich.
if _use_ex:
drv_cfg = _to_native_launch_config(config)
drv_cfg.hStream = stream_handle
drv_cfg.hStream = stream.handle
if config.cooperative_launch:
_check_cooperative_launch(kernel, config, stream)
handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
else:
# TODO: check if config has any unsupported attrs
handle_return(
driver.cuLaunchKernel(
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream_handle, args_ptr, 0
int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
)
)


def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
cdef _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
dev = stream.device
num_sm = dev.properties.multiprocessor_count
max_grid_size = (
Expand Down
1 change: 1 addition & 0 deletions cuda_core/cuda/core/experimental/_memory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ._buffer import * # noqa: F403
from ._device_memory_resource import * # noqa: F403
from ._graph_memory_resource import * # noqa: F403
from ._ipc import * # noqa: F403
from ._legacy import * # noqa: F403
from ._virtual_memory_resource import * # noqa: F403
41 changes: 15 additions & 26 deletions cuda_core/cuda/core/experimental/_memory/_buffer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ from libc.stdint cimport uintptr_t
from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor
from cuda.core.experimental._memory cimport _ipc
from cuda.core.experimental._stream cimport default_stream, Stream
from cuda.core.experimental._stream cimport Stream_accept, Stream
from cuda.core.experimental._utils.cuda_utils cimport (
_check_driver_error as raise_if_driver_error,
)
Expand Down Expand Up @@ -102,21 +102,21 @@ cdef class Buffer:
"""Export a buffer allocated for sharing between processes."""
return _ipc.Buffer_get_ipc_descriptor(self)

def close(self, stream: Stream = None):
def close(self, stream: Stream | GraphBuilder | None = None):
"""Deallocate this buffer asynchronously on the given stream.

This buffer is released back to their memory resource
asynchronously on the given stream.

Parameters
----------
stream : Stream, optional
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
The stream object to use for asynchronous deallocation. If None,
the behavior depends on the underlying memory resource.
"""
Buffer_close(self, stream)

def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer:
def copy_to(self, dst: Buffer = None, *, stream: Stream | GraphBuilder) -> Buffer:
"""Copy from this buffer to the dst buffer asynchronously on the given stream.

Copies the data from this buffer to the provided dst buffer.
Expand All @@ -127,14 +127,12 @@ cdef class Buffer:
----------
dst : :obj:`~_memory.Buffer`
Source buffer to copy data from
stream : Stream
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
Keyword argument specifying the stream for the
asynchronous copy

"""
if stream is None:
raise ValueError("stream must be provided")

stream = Stream_accept(stream)
cdef size_t src_size = self._size

if dst is None:
Expand All @@ -152,21 +150,19 @@ cdef class Buffer:
raise_if_driver_error(err)
return dst

def copy_from(self, src: Buffer, *, stream: Stream):
def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder):
"""Copy from the src buffer to this buffer asynchronously on the given stream.

Parameters
----------
src : :obj:`~_memory.Buffer`
Source buffer to copy data from
stream : Stream
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`
Keyword argument specifying the stream for the
asynchronous copy

"""
if stream is None:
raise ValueError("stream must be provided")

stream = Stream_accept(stream)
cdef size_t dst_size = self._size
cdef size_t src_size = src._size

Expand Down Expand Up @@ -274,17 +270,10 @@ cdef class Buffer:

# Buffer Implementation
# ---------------------
cdef Buffer_close(Buffer self, stream):
cdef inline void Buffer_close(Buffer self, stream):
cdef Stream s
if self._ptr and self._memory_resource is not None:
if stream is None:
if self._alloc_stream is not None:
s = self._alloc_stream
else:
# TODO: remove this branch when from_handle takes a stream
s = <Stream>(default_stream())
else:
s = <Stream>stream
s = Stream_accept(stream) if stream is not None else self._alloc_stream
self._memory_resource.deallocate(self._ptr, self._size, s)
self._ptr = 0
self._memory_resource = None
Expand All @@ -305,14 +294,14 @@ cdef class MemoryResource:
"""

@abc.abstractmethod
def allocate(self, size_t size, stream: Stream = None) -> Buffer:
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
"""Allocate a buffer of the requested size.

Parameters
----------
size : int
The size of the buffer to allocate, in bytes.
stream : Stream, optional
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
The stream on which to perform the allocation asynchronously.
If None, it is up to each memory resource implementation to decide
and document the behavior.
Expand All @@ -326,7 +315,7 @@ cdef class MemoryResource:
...

@abc.abstractmethod
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
"""Deallocate a buffer previously allocated by this resource.

Parameters
Expand All @@ -335,7 +324,7 @@ cdef class MemoryResource:
The pointer or handle to the buffer to deallocate.
size : int
The size of the buffer to deallocate, in bytes.
stream : Stream, optional
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
The stream on which to perform the deallocation asynchronously.
If None, it is up to each memory resource implementation to decide
and document the behavior.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ from cuda.bindings cimport cydriver
from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
from cuda.core.experimental._memory cimport _ipc
from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData
from cuda.core.experimental._stream cimport default_stream, Stream
from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
from cuda.core.experimental._utils.cuda_utils cimport (
check_or_create_options,
HANDLE_RETURN,
Expand Down Expand Up @@ -65,6 +65,12 @@ cdef class DeviceMemoryResourceAttributes:
self._mr_weakref = mr
return self

def __repr__(self):
    """Return a readable summary of every public attribute of this object."""
    public_attrs = [name for name in dir(self) if not name.startswith("_")]
    rendered = ", ".join(f"{name}={getattr(self, name)}" for name in public_attrs)
    return f"{self.__class__.__name__}({rendered})"

cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1:
cdef DeviceMemoryResource mr = <DeviceMemoryResource>(self._mr_weakref())
if mr is None:
Expand Down Expand Up @@ -133,7 +139,7 @@ cdef class DeviceMemoryResourceAttributes:

cdef class DeviceMemoryResource(MemoryResource):
"""
Create a device memory resource managing a stream-ordered memory pool.
A device memory resource managing a stream-ordered memory pool.

Parameters
----------
Expand Down Expand Up @@ -309,14 +315,14 @@ cdef class DeviceMemoryResource(MemoryResource):
raise RuntimeError("Imported memory resource cannot be exported")
return self._ipc_data._alloc_handle

def allocate(self, size_t size, stream: Stream = None) -> Buffer:
def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
"""Allocate a buffer of the requested size.

Parameters
----------
size : int
The size of the buffer to allocate, in bytes.
stream : Stream, optional
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
The stream on which to perform the allocation asynchronously.
If None, an internal stream is used.

Expand All @@ -328,11 +334,10 @@ cdef class DeviceMemoryResource(MemoryResource):
"""
if self.is_mapped:
raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
if stream is None:
stream = default_stream()
return DMR_allocate(self, size, <Stream>stream)
stream = Stream_accept(stream) if stream is not None else default_stream()
return DMR_allocate(self, size, <Stream> stream)

def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None):
def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
"""Deallocate a buffer previously allocated by this resource.

Parameters
Expand All @@ -341,15 +346,17 @@ cdef class DeviceMemoryResource(MemoryResource):
The pointer or handle to the buffer to deallocate.
size : int
The size of the buffer to deallocate, in bytes.
stream : Stream, optional
stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
The stream on which to perform the deallocation asynchronously.
If the buffer is deallocated without an explicit stream, the allocation stream
is used.
"""
DMR_deallocate(self, <uintptr_t>ptr, size, <Stream>stream)
stream = Stream_accept(stream) if stream is not None else default_stream()
DMR_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)

@property
def attributes(self) -> DeviceMemoryResourceAttributes:
"""Memory pool attributes."""
if self._attributes is None:
ref = weakref.ref(self)
self._attributes = DeviceMemoryResourceAttributes._init(ref)
Expand Down Expand Up @@ -467,10 +474,21 @@ cdef void DMR_init_create(
self._ipc_data = IPCData(alloc_handle, mapped=False)


cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
# Raise an exception if the given stream is capturing.
# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
# Declared `nogil` so it can be called from the allocation fast path without
# reacquiring the GIL; Cython reacquires the GIL automatically if the raise
# statement (or HANDLE_RETURN's error path) fires.
cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
    cdef cydriver.CUstreamCaptureStatus capturing
    # Query the driver for the stream's capture status; HANDLE_RETURN raises
    # on a non-success CUresult.
    HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
    # Any status other than NONE (i.e. ACTIVE or INVALIDATED) means memory
    # ops from this pool-backed resource would be captured into a graph,
    # which this resource does not support.
    if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE:
        raise RuntimeError("DeviceMemoryResource cannot perform memory operations on "
                           "a capturing stream (consider using GraphMemoryResource).")


cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
cdef cydriver.CUstream s = stream._handle
cdef cydriver.CUdeviceptr devptr
with nogil:
check_not_capturing(s)
HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
cdef Buffer buf = Buffer.__new__(Buffer)
buf._ptr = <uintptr_t>(devptr)
Expand All @@ -481,16 +499,19 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
return buf


cdef void DMR_deallocate(
cdef inline void DMR_deallocate(
DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream
) noexcept:
cdef cydriver.CUstream s = stream._handle
cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
cdef cydriver.CUresult r
with nogil:
HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s))
r = cydriver.cuMemFreeAsync(devptr, s)
if r != cydriver.CUDA_ERROR_INVALID_CONTEXT:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q: mempools are not tied to a CUDA context, so when will this happen?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a reactive fix as I was seeing many of these errors at the end of a full pytest run. I think this issue exists prior to this PR, so I could separate this if you prefer.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Andy-Jost Let's save this to a separate PR. I think this one is clean, and I'd like to merge it asap (after I read it one more time).

HANDLE_RETURN(r)


cdef DMR_close(DeviceMemoryResource self):
cdef inline DMR_close(DeviceMemoryResource self):
if self._handle == NULL:
return

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cuda.core.experimental._memory._buffer cimport MemoryResource


# C-level declaration for the graph memory resource; the implementation
# lives in the matching .pyx file.
cdef class cyGraphMemoryResource(MemoryResource):
    cdef:
        # CUDA device ordinal this resource allocates on.
        int _dev_id
Loading
Loading