cuda.cccl.parallel API Reference#

Warning

Python exposure of parallel algorithms is in public beta. The API is subject to change without notice.

Algorithms#

cuda.cccl.parallel.experimental.algorithms.reduce_into(d_in, d_out, op, num_items, h_init, stream=None)#

Performs device-wide reduction.

This function automatically handles temporary storage allocation and execution.

Example

Below, reduce_into is used to compute the sum of a sequence of integers.

"""
Sum all values in an array using reduction with PLUS operation.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_init = np.array([0], dtype=dtype)
d_input = cp.array([1, 2, 3, 4, 5], dtype=dtype)
d_output = cp.empty(1, dtype=dtype)

# Perform the reduction.
parallel.reduce_into(d_input, d_output, parallel.OpKind.PLUS, len(d_input), h_init)

# Verify the result.
expected_output = 15
assert (d_output == expected_output).all()
result = d_output[0]
print(f"Sum reduction result: {result}")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array to store the result of the reduction

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Binary reduction operator

  • num_items (int) – Number of items to reduce

  • h_init (ndarray | Any) – Initial value for the reduction

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_reduce_into(d_in, d_out, op, h_init)#

Computes a device-wide reduction using the specified binary op and initial value init.

Example

Below, make_reduce_into is used to create a reduction object that can be reused.

"""
Reduction example using the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
init_value = 5
h_init = np.array([init_value], dtype=dtype)
h_input = np.array([1, 2, 3, 4], dtype=dtype)
d_input = cp.asarray(h_input)
d_output = cp.empty(1, dtype=dtype)

# Create a reducer object.
reducer = parallel.make_reduce_into(d_input, d_output, parallel.OpKind.PLUS, h_init)

# Get the temporary storage size.
temp_storage_size = reducer(None, d_input, d_output, len(h_input), h_init)

# Allocate temporary storage using any user-defined allocator.
# The result must be an object exposing `__cuda_array_interface__`.
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the reduction.
reducer(d_temp_storage, d_input, d_output, len(h_input), h_init)

expected_result = np.sum(h_input) + init_value
actual_result = d_output.get()[0]
assert actual_result == expected_result
print("Reduce object example completed successfully")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array (of size 1) that will store the result of the reduction

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the binary operator to apply

  • h_init (ndarray) – Numpy array storing the initial value of the reduction

Returns:

A callable object that can be used to perform the reduction

cuda.cccl.parallel.experimental.algorithms.inclusive_scan(d_in, d_out, op, h_init, num_items, stream=None)#

Performs device-wide inclusive scan.

This function automatically handles temporary storage allocation and execution.

Example

Below, inclusive_scan is used to compute an inclusive scan (prefix sum) with a custom binary operator that accumulates only even values.

"""
Inclusive scan with custom operation (prefix sum of even values).
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_init = np.array([0], dtype="int32")
d_input = cp.array([1, 2, 3, 4, 5], dtype="int32")
d_output = cp.empty_like(d_input, dtype="int32")

# Define the binary operation for the scan.


def add_op(a, b):
    return (a if a % 2 == 0 else 0) + (b if b % 2 == 0 else 0)


# Perform the inclusive scan.
parallel.inclusive_scan(d_input, d_output, add_op, h_init, d_input.size)

# Verify the result.
expected = np.asarray([0, 2, 2, 6, 6])
assert np.array_equal(d_output.get(), expected)
result = d_output.get()
print(f"Inclusive scan custom result: {result}")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array or iterator to store the result of the scan

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Binary scan operator

  • h_init (ndarray | Any) – Initial value for the scan

  • num_items (int) – Number of items to scan

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_inclusive_scan(d_in, d_out, op, h_init)#

Computes a device-wide scan using the specified binary op and initial value init.

Example

Below, make_inclusive_scan is used to create an inclusive scan object that can be reused.

"""
Inclusive scan example demonstrating the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_init = np.array([0], dtype=dtype)
h_input = np.array([1, 2, 3, 4], dtype=dtype)
d_input = cp.asarray(h_input)
d_output = cp.empty(len(h_input), dtype=dtype)

# Create the scanner object and allocate temporary storage.
scanner = parallel.make_inclusive_scan(d_input, d_output, parallel.OpKind.PLUS, h_init)
temp_storage_size = scanner(None, d_input, d_output, len(h_input), h_init)
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the inclusive scan.
scanner(d_temp_storage, d_input, d_output, len(h_input), h_init)

# Verify the result.
expected_result = np.array([1, 3, 6, 10], dtype=dtype)
actual_result = d_output.get()
np.testing.assert_array_equal(actual_result, expected_result)
print("Inclusive scan object example completed successfully")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array that will store the result of the scan

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the binary operator to apply

  • h_init (ndarray) – Numpy array storing the initial value of the scan

Returns:

A callable object that can be used to perform the scan

cuda.cccl.parallel.experimental.algorithms.exclusive_scan(d_in, d_out, op, h_init, num_items, stream=None)#

Performs device-wide exclusive scan.

This function automatically handles temporary storage allocation and execution.

Example

Below, exclusive_scan is used to compute an exclusive scan with a custom maximum operator.

"""
Exclusive scan using custom maximum operation.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Define the binary operation for the scan.


def max_op(a, b):
    return max(a, b)


# Prepare the input and output arrays.
h_init = np.array([1], dtype="int32")
d_input = cp.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
d_output = cp.empty_like(d_input, dtype="int32")

# Perform the exclusive scan.
parallel.exclusive_scan(d_input, d_output, max_op, h_init, d_input.size)

# Verify the result.
expected = np.asarray([1, 1, 1, 2, 2, 2, 4, 4, 4, 4])
result = d_output.get()

np.testing.assert_equal(result, expected)
print(f"Exclusive scan max result: {result}")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array or iterator to store the result of the scan

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Binary scan operator

  • h_init (ndarray | Any) – Initial value for the scan

  • num_items (int) – Number of items to scan

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_exclusive_scan(d_in, d_out, op, h_init)#

Computes a device-wide scan using the specified binary op and initial value init.

Example

Below, make_exclusive_scan is used to create an exclusive scan object that can be reused.

"""
Exclusive scan example demonstrating the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_init = np.array([0], dtype=dtype)
h_input = np.array([1, 2, 3, 4], dtype=dtype)
d_input = cp.asarray(h_input)
d_output = cp.empty(len(h_input), dtype=dtype)

# Create the scanner object and allocate temporary storage.
scanner = parallel.make_exclusive_scan(d_input, d_output, parallel.OpKind.PLUS, h_init)
temp_storage_size = scanner(None, d_input, d_output, len(h_input), h_init)
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the exclusive scan.
scanner(d_temp_storage, d_input, d_output, len(h_input), h_init)

# Verify the result.
expected_result = np.array([0, 1, 3, 6], dtype=dtype)
actual_result = d_output.get()
np.testing.assert_array_equal(actual_result, expected_result)
print("Exclusive scan object example completed successfully")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array that will store the result of the scan

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the binary operator to apply

  • h_init (ndarray) – Numpy array storing the initial value of the scan

Returns:

A callable object that can be used to perform the scan

cuda.cccl.parallel.experimental.algorithms.unary_transform(d_in, d_out, op, num_items, stream=None)#

Performs device-wide unary transform.

This function automatically handles temporary storage allocation and execution.

Example

Below, unary_transform is used to apply a transformation to each element of the input.

"""
Example showing how to use unary_transform to apply a unary operation to each element.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
input_data = np.array([1, 2, 3, 4, 5], dtype=np.int32)
d_in = cp.asarray(input_data)
d_out = cp.empty_like(d_in)


# Define the unary operation.
def op(a):
    return a + 1


# Perform the unary transform.
parallel.unary_transform(d_in, d_out, op, len(d_in))

# Verify the result.
result = d_out.get()
expected = input_data + 1

np.testing.assert_array_equal(result, expected)
print(f"Unary transform result: {result}")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items.

  • d_out (DeviceArrayLike | IteratorBase) – Device array or iterator to store the result of the transformation.

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the unary operation to apply to each element of the input.

  • num_items (int) – Number of items to transform.

  • stream – CUDA stream to use for the operation.

cuda.cccl.parallel.experimental.algorithms.make_unary_transform(d_in, d_out, op)#

Create a unary transform object that can be called to apply a transformation to each element of the input according to the unary operation op.

This is the object-oriented API that allows explicit control over temporary storage allocation. For simpler usage, consider using unary_transform().

Example

"""
Unary transform examples demonstrating the object API and well-known operations.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_input = np.array([1, 2, 3, 4], dtype=dtype)
d_input = cp.asarray(h_input)
d_output = cp.empty_like(d_input)


# Define the unary operation.
def add_one_op(a):
    return a + 1


# Create the unary transform object.
transformer = parallel.make_unary_transform(d_input, d_output, add_one_op)

# Perform the unary transform.
transformer(d_input, d_output, len(h_input))

# Verify the result.
expected_result = np.array([2, 3, 4, 5], dtype=dtype)
actual_result = d_output.get()
np.testing.assert_array_equal(actual_result, expected_result)
print("Unary transform object example completed successfully")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items.

  • d_out (DeviceArrayLike | IteratorBase) – Device array or iterator to store the result of the transformation.

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the unary operation to apply to each element of the input.

Returns:

A callable object that performs the transformation.

cuda.cccl.parallel.experimental.algorithms.binary_transform(d_in1, d_in2, d_out, op, num_items, stream=None)#

Performs device-wide binary transform.

This function automatically handles temporary storage allocation and execution.

Example

Below, binary_transform is used to apply a transformation to pairs of elements from two input sequences.

"""
Example showing how to use binary_transform to perform elementwise addition.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
input1_data = np.array([1, 2, 3, 4], dtype=np.int32)
input2_data = np.array([10, 20, 30, 40], dtype=np.int32)
d_in1 = cp.asarray(input1_data)
d_in2 = cp.asarray(input2_data)
d_out = cp.empty_like(d_in1)

# Perform the binary transform.
parallel.binary_transform(d_in1, d_in2, d_out, parallel.OpKind.PLUS, len(d_in1))

# Verify the result.
result = d_out.get()
expected = input1_data + input2_data

np.testing.assert_array_equal(result, expected)
print(f"Binary transform result: {result}")
Parameters:
  • d_in1 (DeviceArrayLike | IteratorBase) – Device array or iterator containing the first input sequence of data items.

  • d_in2 (DeviceArrayLike | IteratorBase) – Device array or iterator containing the second input sequence of data items.

  • d_out (DeviceArrayLike | IteratorBase) – Device array or iterator to store the result of the transformation.

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the binary operation to apply to each pair of items from the input sequences.

  • num_items (int) – Number of items to transform.

  • stream – CUDA stream to use for the operation.

cuda.cccl.parallel.experimental.algorithms.make_binary_transform(d_in1, d_in2, d_out, op)#

Create a binary transform object that can be called to apply a transformation to the given pair of input sequences according to the binary operation op.

This is the object-oriented API that allows explicit control over temporary storage allocation. For simpler usage, consider using binary_transform().

Example

"""
Binary transform examples demonstrating the transform object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_input1 = np.array([1, 2, 3, 4], dtype=dtype)
h_input2 = np.array([10, 20, 30, 40], dtype=dtype)
d_input1 = cp.asarray(h_input1)
d_input2 = cp.asarray(h_input2)
d_output = cp.empty_like(d_input1)

# Create the binary transform object.
transformer = parallel.make_binary_transform(
    d_input1, d_input2, d_output, parallel.OpKind.PLUS
)

# Perform the binary transform.
transformer(d_input1, d_input2, d_output, len(h_input1))

# Verify the result.
expected_result = np.array([11, 22, 33, 44], dtype=dtype)
actual_result = d_output.get()
np.testing.assert_array_equal(actual_result, expected_result)
print("Binary transform object example completed successfully")
Parameters:
  • d_in1 (DeviceArrayLike | IteratorBase) – Device array or iterator containing the first input sequence of data items.

  • d_in2 (DeviceArrayLike | IteratorBase) – Device array or iterator containing the second input sequence of data items.

  • d_out (DeviceArrayLike | IteratorBase) – Device array or iterator to store the result of the transformation.

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the binary operation to apply to each pair of items from the input sequences.

Returns:

A callable object that performs the transformation.

cuda.cccl.parallel.experimental.algorithms.histogram_even(
d_samples,
d_histogram,
num_output_levels,
lower_level,
upper_level,
num_samples,
stream=None,
)#

Performs device-wide histogram computation with evenly-spaced bins.

This function automatically handles temporary storage allocation and execution.

Example

Below, histogram_even is used to compute a histogram with evenly-spaced bins.

"""
Example showing how to use histogram_even to bin a sequence of samples.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
num_samples = 10
h_samples = np.array(
    [2.2, 6.1, 7.1, 2.9, 3.5, 0.3, 2.9, 2.1, 6.1, 999.5], dtype="float32"
)
d_samples = cp.asarray(h_samples)
num_levels = 7
d_histogram = cp.empty(num_levels - 1, dtype="int32")
lower_level = np.float64(0)
upper_level = np.float64(12)

# Compute the histogram with evenly-spaced bins.
parallel.histogram_even(
    d_samples,
    d_histogram,
    num_levels,
    lower_level,
    upper_level,
    num_samples,
)

# Verify the result.
h_actual_histogram = cp.asnumpy(d_histogram)
h_expected_histogram, _ = np.histogram(
    h_samples, bins=num_levels - 1, range=(lower_level, upper_level)
)
h_expected_histogram = h_expected_histogram.astype("int32")

np.testing.assert_array_equal(h_actual_histogram, h_expected_histogram)
print(f"Histogram even basic result: {h_actual_histogram}")
Parameters:
  • d_samples (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data samples

  • d_histogram (DeviceArrayLike) – Device array to store the computed histogram

  • num_output_levels (int) – Number of histogram bin levels (num_bins = num_output_levels - 1)

  • lower_level (floating | integer) – Lower sample value bound (inclusive)

  • upper_level (floating | integer) – Upper sample value bound (exclusive)

  • num_samples (int) – Number of input samples

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_histogram_even(
d_samples,
d_histogram,
h_num_output_levels,
h_lower_level,
h_upper_level,
num_samples,
)#

Implements a device-wide histogram that places d_samples into evenly-spaced bins.

Example

Below, make_histogram_even is used to create a histogram object that can be reused.

"""
Example showing how to use histogram object API to bin a sequence of samples.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_samples = np.array(
    [1.5, 2.3, 4.7, 6.2, 7.8, 3.1, 5.5, 8.9, 2.7, 6.4], dtype="float32"
)
d_samples = cp.asarray(h_samples)

num_levels = 6

# note that the object API requires passing numpy arrays
# rather than scalars:
h_num_output_levels = np.array([num_levels], dtype=np.int32)
h_lower_level = np.array([0.0], dtype=np.float64)
h_upper_level = np.array([10.0], dtype=np.float64)

d_histogram = cp.zeros(num_levels - 1, dtype="int32")

# Create the histogram object.
histogrammer = parallel.make_histogram_even(
    d_samples,
    d_histogram,
    h_num_output_levels,
    h_lower_level,
    h_upper_level,
    len(h_samples),
)

# Get the temporary storage size.
temp_storage_size = histogrammer(
    None,
    d_samples,
    d_histogram,
    h_num_output_levels,
    h_lower_level,
    h_upper_level,
    len(h_samples),
)

# Allocate the temporary storage.
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the histogram.
histogrammer(
    d_temp_storage,
    d_samples,
    d_histogram,
    h_num_output_levels,
    h_lower_level,
    h_upper_level,
    len(h_samples),
)

# Verify the result.
h_result = cp.asnumpy(d_histogram)
expected_histogram = np.array([1, 3, 2, 3, 1], dtype="int32")

np.testing.assert_array_equal(h_result, expected_histogram)
print("Histogram object example completed successfully")
Parameters:
  • d_samples (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input samples to be histogrammed

  • d_histogram (DeviceArrayLike) – Device array to store the histogram

  • h_num_output_levels (ndarray) – Host array containing the number of output levels

  • h_lower_level (ndarray) – Host array containing the lower level

  • h_upper_level (ndarray) – Host array containing the upper level

  • num_samples (int) – Number of samples to be histogrammed

Returns:

A callable object that can be used to perform the histogram

cuda.cccl.parallel.experimental.algorithms.merge_sort(
d_in_keys,
d_in_items,
d_out_keys,
d_out_items,
op,
num_items,
stream=None,
)#

Performs device-wide merge sort.

This function automatically handles temporary storage allocation and execution.

Example

Below, merge_sort is used to sort a sequence of keys in place. It also rearranges the items according to the keys’ order.

"""
Demonstrate basic merge sort with keys and values.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
h_in_values = np.array(
    [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32"
)

d_in_keys = cp.asarray(h_in_keys)
d_in_values = cp.asarray(h_in_values)

# Perform the merge sort.
parallel.merge_sort(
    d_in_keys,
    d_in_values,
    d_in_keys,
    d_in_values,
    parallel.OpKind.LESS,
    d_in_keys.size,
)

# Verify the result.
h_out_keys = cp.asnumpy(d_in_keys)
h_out_values = cp.asnumpy(d_in_values)

argsort = np.argsort(h_in_keys, stable=True)
expected_keys = np.array(h_in_keys)[argsort]
expected_values = np.array(h_in_values)[argsort]

assert np.array_equal(h_out_keys, expected_keys)
assert np.array_equal(h_out_values, expected_values)
print(f"Merge sort basic result - keys: {h_out_keys}, values: {h_out_values}")
Parameters:
  • d_in_keys (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of keys

  • d_in_items (DeviceArrayLike | IteratorBase | None) – Device array or iterator containing the input sequence of items (optional)

  • d_out_keys (DeviceArrayLike) – Device array to store the sorted keys

  • d_out_items (DeviceArrayLike | None) – Device array to store the sorted items (optional)

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Comparison operator for sorting

  • num_items (int) – Number of items to sort

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_merge_sort(d_in_keys, d_in_items, d_out_keys, d_out_items, op)#

Implements a device-wide merge sort using d_in_keys and the comparison operator op.

Example

Below, make_merge_sort is used to create a merge sort object that can be reused.

"""
Merge sort example demonstrating the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_input_keys = np.array([4, 2, 3, 1], dtype=dtype)
h_input_values = np.array([40, 20, 30, 10], dtype=dtype)
d_input_keys = cp.asarray(h_input_keys)
d_input_values = cp.asarray(h_input_values)
d_output_keys = cp.empty_like(d_input_keys)
d_output_values = cp.empty_like(d_input_values)

# Create the merge sort object.
sorter = parallel.make_merge_sort(
    d_input_keys,
    d_input_values,
    d_output_keys,
    d_output_values,
    parallel.OpKind.LESS,
)

# Get the temporary storage size.
temp_storage_size = sorter(
    None,
    d_input_keys,
    d_input_values,
    d_output_keys,
    d_output_values,
    len(h_input_keys),
)

# Allocate the temporary storage.
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the merge sort.
sorter(
    d_temp_storage,
    d_input_keys,
    d_input_values,
    d_output_keys,
    d_output_values,
    len(h_input_keys),
)

# Verify the result.
expected_keys = np.array([1, 2, 3, 4], dtype=dtype)
expected_values = np.array([10, 20, 30, 40], dtype=dtype)
actual_keys = d_output_keys.get()
actual_values = d_output_values.get()
np.testing.assert_array_equal(actual_keys, expected_keys)
np.testing.assert_array_equal(actual_values, expected_values)
print("Merge sort object example completed successfully")
Parameters:
  • d_in_keys (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input keys to be sorted

  • d_in_items (DeviceArrayLike | IteratorBase | None) – Optional device array or iterator that contains each key’s corresponding item

  • d_out_keys (DeviceArrayLike) – Device array to store the sorted keys

  • d_out_items (DeviceArrayLike | None) – Device array to store the sorted items

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the comparison operator

Returns:

A callable object that can be used to perform the merge sort

cuda.cccl.parallel.experimental.algorithms.radix_sort(
d_in_keys,
d_out_keys,
d_in_values,
d_out_values,
order,
num_items,
begin_bit=None,
end_bit=None,
stream=None,
)#

Performs device-wide radix sort.

This function automatically handles temporary storage allocation and execution.

Example

Below, radix_sort is used to sort a sequence of keys. It also rearranges the values according to the keys’ order.

"""
Example showing how to use radix_sort to sort keys and values.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
h_in_values = np.array(
    [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32"
)

d_in_keys = cp.asarray(h_in_keys)
d_in_values = cp.asarray(h_in_values)

# Prepare the output arrays.
d_out_keys = cp.empty_like(d_in_keys)
d_out_values = cp.empty_like(d_in_values)

# Perform the radix sort.
parallel.radix_sort(
    d_in_keys,
    d_out_keys,
    d_in_values,
    d_out_values,
    parallel.SortOrder.ASCENDING,
    d_in_keys.size,
)

# Verify the result.
h_out_keys = cp.asnumpy(d_out_keys)
h_out_values = cp.asnumpy(d_out_values)

argsort = np.argsort(h_in_keys, stable=True)
expected_keys = np.array(h_in_keys)[argsort]
expected_values = np.array(h_in_values)[argsort]

assert np.array_equal(h_out_keys, expected_keys)
assert np.array_equal(h_out_values, expected_values)
print(f"Radix sort basic result - keys: {h_out_keys}, values: {h_out_values}")

In the following example, radix_sort is used to sort a sequence of keys with a DoubleBuffer for reduced temporary storage.

"""
Example showing how to use radix_sort with DoubleBuffer for reduced temporary storage.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_in_keys = np.array([-5, 0, 2, -3, 2, 4, 0, -1, 2, 8], dtype="int32")
h_in_values = np.array(
    [-3.2, 2.2, 1.9, 4.0, -3.9, 2.7, 0, 8.3 - 1, 2.9, 5.4], dtype="float32"
)

d_in_keys = cp.asarray(h_in_keys)
d_in_values = cp.asarray(h_in_values)

d_out_keys = cp.empty_like(d_in_keys)
d_out_values = cp.empty_like(d_in_values)

# Create the double buffer.
keys_double_buffer = parallel.DoubleBuffer(d_in_keys, d_out_keys)
values_double_buffer = parallel.DoubleBuffer(d_in_values, d_out_values)

# Perform the radix sort.
parallel.radix_sort(
    keys_double_buffer,
    None,
    values_double_buffer,
    None,
    parallel.SortOrder.ASCENDING,
    d_in_keys.size,
)

# Verify the result.
h_out_keys = cp.asnumpy(keys_double_buffer.current())
h_out_values = cp.asnumpy(values_double_buffer.current())

argsort = np.argsort(h_in_keys, stable=True)
h_expected_keys = np.array(h_in_keys)[argsort]
h_expected_values = np.array(h_in_values)[argsort]

assert np.array_equal(h_out_keys, h_expected_keys)
assert np.array_equal(h_out_values, h_expected_values)
print(f"Radix sort buffer result - keys: {h_out_keys}, values: {h_out_values}")
Parameters:
  • d_in_keys (DeviceArrayLike | DoubleBuffer) – Device array or DoubleBuffer containing the input sequence of keys

  • d_out_keys (DeviceArrayLike | None) – Device array to store the sorted keys (optional)

  • d_in_values (DeviceArrayLike | DoubleBuffer | None) – Device array or DoubleBuffer containing the input sequence of values (optional)

  • d_out_values (DeviceArrayLike | None) – Device array to store the sorted values (optional)

  • order (SortOrder) – Sort order (ascending or descending)

  • num_items (int) – Number of items to sort

  • begin_bit (int | None) – Beginning bit position for comparison (optional)

  • end_bit (int | None) – Ending bit position for comparison (optional)

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_radix_sort(
d_in_keys,
d_out_keys,
d_in_values,
d_out_values,
order,
)#

Implements a device-wide radix sort using d_in_keys in the requested order.

Example

Below, make_radix_sort is used to create a radix sort object that can be reused.

"""
Example showing how to use radix_sort with the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_input_keys = np.array([4, 2, 3, 1], dtype=dtype)
h_input_values = np.array([40, 20, 30, 10], dtype=dtype)
d_input_keys = cp.asarray(h_input_keys)
d_input_values = cp.asarray(h_input_values)
d_output_keys = cp.empty_like(d_input_keys)
d_output_values = cp.empty_like(d_input_values)

# Create the radix sort object.
sorter = parallel.make_radix_sort(
    d_input_keys,
    d_output_keys,
    d_input_values,
    d_output_values,
    parallel.SortOrder.ASCENDING,
)

# Get the temporary storage size.
temp_storage_size = sorter(
    None,
    d_input_keys,
    d_output_keys,
    d_input_values,
    d_output_values,
    len(h_input_keys),
)
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the radix sort.
sorter(
    d_temp_storage,
    d_input_keys,
    d_output_keys,
    d_input_values,
    d_output_values,
    len(h_input_keys),
)

# Verify the result.
expected_keys = np.array([1, 2, 3, 4], dtype=dtype)
expected_values = np.array([10, 20, 30, 40], dtype=dtype)
actual_keys = d_output_keys.get()
actual_values = d_output_values.get()
np.testing.assert_array_equal(actual_keys, expected_keys)
np.testing.assert_array_equal(actual_values, expected_values)
print("Radix sort object example completed successfully")
Parameters:
  • d_in_keys (DeviceArrayLike | DoubleBuffer) – Device array or DoubleBuffer containing the input keys to be sorted

  • d_out_keys (DeviceArrayLike | None) – Device array to store the sorted keys

  • d_in_values (DeviceArrayLike | DoubleBuffer | None) – Optional device array or DoubleBuffer containing the input sequence of values to be sorted

  • d_out_values (DeviceArrayLike | None) – Device array to store the sorted values

  • order (SortOrder) – Sort order (ascending or descending)

Returns:

A callable object that can be used to perform the radix sort

cuda.cccl.parallel.experimental.algorithms.segmented_reduce(
d_in,
d_out,
start_offsets_in,
end_offsets_in,
op,
h_init,
num_segments,
stream=None,
)#

Performs device-wide segmented reduction.

This function automatically handles temporary storage allocation and execution.

Example

Below, segmented_reduce is used to compute the minimum value of segments in a sequence of integers.

"""
Example showing how to use segmented_reduce to find the minimum in each segment.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel


def min_op(a, b):
    return a if a < b else b


dtype = np.dtype(np.int32)
max_val = np.iinfo(dtype).max
h_init = np.asarray(max_val, dtype=dtype)

# Prepare the offsets.
offsets = cp.array([0, 7, 11, 16], dtype=np.int64)
first_segment = (8, 6, 7, 5, 3, 0, 9)
second_segment = (-4, 3, 0, 1)
third_segment = (3, 1, 11, 25, 8)

# Prepare the input array.
d_input = cp.array(
    [*first_segment, *second_segment, *third_segment],
    dtype=dtype,
)

# Prepare the start and end offsets.
start_o = offsets[:-1]
end_o = offsets[1:]

# Prepare the output array.
n_segments = start_o.size
d_output = cp.empty(n_segments, dtype=dtype)

# Perform the segmented reduce.
parallel.segmented_reduce(d_input, d_output, start_o, end_o, min_op, h_init, n_segments)

# Verify the result.
expected_output = cp.asarray([0, -4, 1], dtype=d_output.dtype)
assert (d_output == expected_output).all()
print(f"Segmented reduce basic result: {d_output.get()}")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array to store the result of the reduction for each segment

  • start_offsets_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the sequence of beginning offsets

  • end_offsets_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the sequence of ending offsets

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Binary reduction operator

  • h_init (ndarray | Any) – Initial value for the reduction

  • num_segments (int) – Number of segments to reduce

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_segmented_reduce(
d_in,
d_out,
start_offsets_in,
end_offsets_in,
op,
h_init,
)#

Computes a device-wide segmented reduction using the specified binary op and initial value init.

Example

Below, make_segmented_reduce is used to create a segmented reduction object that can be reused.

"""
Segmented reduction using the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
dtype = np.int32
h_init = np.array([0], dtype=dtype)
h_input = np.array([1, 2, 3, 4, 5, 6], dtype=dtype)
d_input = cp.asarray(h_input)
d_output = cp.empty(2, dtype=dtype)

start_offsets = cp.array([0, 3], dtype=np.int64)
end_offsets = cp.array([3, 6], dtype=np.int64)

# Create the segmented reduce object.
reducer = parallel.make_segmented_reduce(
    d_input, d_output, start_offsets, end_offsets, parallel.OpKind.PLUS, h_init
)

# Get the temporary storage size.
temp_storage_size = reducer(
    None, d_input, d_output, 2, start_offsets, end_offsets, h_init
)

# Allocate the temporary storage.
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the segmented reduce.
reducer(d_temp_storage, d_input, d_output, 2, start_offsets, end_offsets, h_init)

# Verify the result.
expected_result = np.array([6, 15], dtype=dtype)
actual_result = d_output.get()
np.testing.assert_array_equal(actual_result, expected_result)
print("Segmented reduce object example completed successfully")
Parameters:
  • d_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of data items

  • d_out (DeviceArrayLike | IteratorBase) – Device array that will store the result of the reduction

  • start_offsets_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing offsets to start of segments

  • end_offsets_in (DeviceArrayLike | IteratorBase) – Device array or iterator containing offsets to end of segments

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the binary operator to apply

  • h_init (ndarray) – Numpy array storing the initial value of the reduction

Returns:

A callable object that can be used to perform the reduction

cuda.cccl.parallel.experimental.algorithms.unique_by_key(
d_in_keys,
d_in_items,
d_out_keys,
d_out_items,
d_out_num_selected,
op,
num_items,
stream=None,
)#

Performs device-wide unique by key operation using the single-phase API.

This function automatically handles temporary storage allocation and execution.

Example

Below, unique_by_key is used to populate the arrays of output keys and items with the first key and its corresponding item from each sequence of equal keys. It also outputs the number of items selected.

"""
Example showing how to use unique_by_key to remove all
but the first value for each group of consecutive keys.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_in_keys = np.array([0, 2, 2, 9, 5, 5, 5, 8], dtype="int32")
h_in_values = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype="float32")

d_in_keys = cp.asarray(h_in_keys)
d_in_values = cp.asarray(h_in_values)
d_out_keys = cp.empty_like(d_in_keys)
d_out_values = cp.empty_like(d_in_values)
d_out_num_selected = cp.empty(1, np.int32)

# Perform the unique by key operation.
parallel.unique_by_key(
    d_in_keys,
    d_in_values,
    d_out_keys,
    d_out_values,
    d_out_num_selected,
    parallel.OpKind.EQUAL_TO,
    d_in_keys.size,
)

# Verify the result.
num_selected = cp.asnumpy(d_out_num_selected)[0]
h_out_keys = cp.asnumpy(d_out_keys)[:num_selected]
h_out_values = cp.asnumpy(d_out_values)[:num_selected]

expected_keys = np.array([0, 2, 9, 5, 8])
expected_values = np.array([1, 2, 4, 5, 8])

assert np.array_equal(h_out_keys, expected_keys)
assert np.array_equal(h_out_values, expected_values)
print(
    f"Unique by key basic result - keys: {h_out_keys}, values: {h_out_values}, count: {num_selected}"
)
Parameters:
  • d_in_keys (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of keys

  • d_in_items (DeviceArrayLike | IteratorBase) – Device array or iterator that contains each key’s corresponding item

  • d_out_keys (DeviceArrayLike | IteratorBase) – Device array or iterator to store the outputted keys

  • d_out_items (DeviceArrayLike | IteratorBase) – Device array or iterator to store each outputted key’s item

  • d_out_num_selected (DeviceArrayLike) – Device array to store how many items were selected

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the equality operator

  • num_items (int) – Number of items to process

  • stream – CUDA stream for the operation (optional)

cuda.cccl.parallel.experimental.algorithms.make_unique_by_key(
d_in_keys,
d_in_items,
d_out_keys,
d_out_items,
d_out_num_selected,
op,
)#

Implements a device-wide unique-by-key operation using d_in_keys and the equality operator op. Only the first key and its corresponding value from each run of equal keys are selected, and the total number of selected items is also reported.

Example

Below, make_unique_by_key is used to create a unique by key object that can be reused.

"""
Example showing how to use unique_by_key with the object API.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Unique by key example demonstrating the object API
dtype = np.int32
h_input_keys = np.array([1, 1, 2, 3, 3], dtype=dtype)
h_input_values = np.array([10, 20, 30, 40, 50], dtype=dtype)
d_input_keys = cp.asarray(h_input_keys)
d_input_values = cp.asarray(h_input_values)
d_output_keys = cp.empty_like(d_input_keys)
d_output_values = cp.empty_like(d_input_values)
d_num_selected = cp.empty(1, dtype=np.int32)

# Create the unique by key object.
uniquer = parallel.make_unique_by_key(
    d_input_keys,
    d_input_values,
    d_output_keys,
    d_output_values,
    d_num_selected,
    parallel.OpKind.EQUAL_TO,
)

# Get the temporary storage size.
temp_storage_size = uniquer(
    None,
    d_input_keys,
    d_input_values,
    d_output_keys,
    d_output_values,
    d_num_selected,
    len(h_input_keys),
)

# Allocate the temporary storage.
d_temp_storage = cp.empty(temp_storage_size, dtype=np.uint8)

# Perform the unique by key operation.
uniquer(
    d_temp_storage,
    d_input_keys,
    d_input_values,
    d_output_keys,
    d_output_values,
    d_num_selected,
    len(h_input_keys),
)

# Verify the result.
num_selected = d_num_selected.get()[0]
expected_keys = np.array([1, 2, 3], dtype=dtype)
expected_values = np.array([10, 30, 40], dtype=dtype)
actual_keys = d_output_keys.get()[:num_selected]
actual_values = d_output_values.get()[:num_selected]
np.testing.assert_array_equal(actual_keys, expected_keys)
np.testing.assert_array_equal(actual_values, expected_values)
print("Unique by key object example completed successfully")
Parameters:
  • d_in_keys (DeviceArrayLike | IteratorBase) – Device array or iterator containing the input sequence of keys

  • d_in_items (DeviceArrayLike | IteratorBase) – Device array or iterator that contains each key’s corresponding item

  • d_out_keys (DeviceArrayLike | IteratorBase) – Device array or iterator to store the outputted keys

  • d_out_items (DeviceArrayLike | IteratorBase) – Device array or iterator to store each outputted key’s item

  • d_out_num_selected (DeviceArrayLike) – Device array to store how many items were selected

  • op (Callable | cuda.cccl.parallel.experimental._bindings.OpKind) – Callable or OpKind representing the equality operator

Returns:

A callable object that can be used to perform unique by key

class cuda.cccl.parallel.experimental.algorithms.DoubleBuffer(d_current, d_alternate)#
Parameters:
  • d_current (DeviceArrayLike)

  • d_alternate (DeviceArrayLike)

__init__(d_current, d_alternate)#
Parameters:
  • d_current (DeviceArrayLike)

  • d_alternate (DeviceArrayLike)

current()#
alternate()#
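
Example

The radix_sort example above shows DoubleBuffer in context. The sketch below is a minimal, hedged illustration of the same pattern; the keys-only call (values arguments set to None) is an assumption based on the optional-values signature of radix_sort. After the algorithm runs, current() refers to the buffer holding the valid (sorted) data and alternate() to the other buffer.

"""
Minimal DoubleBuffer sketch (see the radix_sort example above for full context).
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Two device buffers of equal shape and dtype: one holds the input keys,
# the other provides scratch space.
d_keys = cp.array([3, 1, 2], dtype=np.int32)
d_keys_alt = cp.empty_like(d_keys)

keys_double_buffer = parallel.DoubleBuffer(d_keys, d_keys_alt)

# Sort the keys; the algorithm may swap which buffer is "current".
parallel.radix_sort(
    keys_double_buffer,
    None,
    None,
    None,
    parallel.SortOrder.ASCENDING,
    d_keys.size,
)

# After sorting, current() refers to the buffer holding the sorted keys.
print(cp.asnumpy(keys_double_buffer.current()))
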
class cuda.cccl.parallel.experimental.algorithms.SortOrder(
value,
names=<not given>,
*values,
module=None,
qualname=None,
type=None,
start=1,
boundary=None,
)#
ASCENDING = 0#
DESCENDING = 1#

Iterators#

cuda.cccl.parallel.experimental.iterators.CacheModifiedInputIterator(device_array, modifier)#

Random Access Cache Modified Iterator that wraps a native device pointer.

Similar to https://nvidia.github.io/cccl/cub/api/classcub_1_1CacheModifiedInputIterator.html

Currently the only supported modifier is “stream” (LOAD_CS).

Example

The code snippet below demonstrates the usage of a CacheModifiedInputIterator:

"""
Example showing how to use cache_modified_iterator.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input array.
h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32)
d_input = cp.asarray(h_input)

# Create the cache modified iterator.
cache_it = parallel.CacheModifiedInputIterator(d_input, "stream")

# Prepare the initial value for the reduction.
h_init = np.array([0], dtype=np.int32)

# Prepare the output array.
d_output = cp.empty(1, dtype=np.int32)

# Perform the reduction.
parallel.reduce_into(cache_it, d_output, parallel.OpKind.PLUS, len(d_input), h_init)

# Verify the result.
expected_output = sum(h_input)  # 1 + 2 + 3 + 4 + 5 = 15
assert (d_output == expected_output).all()
print(f"Cache modified iterator result: {d_output[0]} (expected: {expected_output})")
Parameters:
  • device_array – Array storing the input sequence of data items

  • modifier – The PTX cache load modifier

Returns:

A CacheModifiedInputIterator object initialized with device_array

cuda.cccl.parallel.experimental.iterators.ConstantIterator(value)#

Returns an Iterator representing a sequence of constant values.

Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1constant__iterator.html

Example

The code snippet below demonstrates the usage of a ConstantIterator representing a sequence of constant values:

"""
Example showing how to use constant_iterator.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
constant_value = 42
num_items = 5

# Create the constant iterator.
constant_it = parallel.ConstantIterator(np.int32(constant_value))

# Prepare the initial value for the reduction.
h_init = np.array([0], dtype=np.int32)

# Prepare the output array.
d_output = cp.empty(1, dtype=np.int32)

# Perform the reduction.
parallel.reduce_into(constant_it, d_output, parallel.OpKind.PLUS, num_items, h_init)

# Verify the result.
expected_output = constant_value * num_items
assert (d_output == expected_output).all()
print(f"Constant iterator result: {d_output[0]} (expected: {expected_output})")
Parameters:

value – The value of every item in the sequence

Returns:

A ConstantIterator object initialized to value

cuda.cccl.parallel.experimental.iterators.CountingIterator(offset)#

Returns an Iterator representing a sequence of incrementing values.

Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1counting__iterator.html

Example

The code snippet below demonstrates the usage of a CountingIterator representing the sequence [10, 11, 12]:

"""
Example showing how to use counting_iterator.
"""

import functools

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
first_item = 10
num_items = 3

# Create the counting iterator.
first_it = parallel.CountingIterator(np.int32(first_item))

# Prepare the initial value for the reduction.
h_init = np.array([0], dtype=np.int32)

# Prepare the output array.
d_output = cp.empty(1, dtype=np.int32)

# Perform the reduction.
parallel.reduce_into(first_it, d_output, parallel.OpKind.PLUS, num_items, h_init)

# Verify the result.
expected_output = functools.reduce(
    lambda a, b: a + b, range(first_item, first_item + num_items)
)
assert (d_output == expected_output).all()
print(f"Counting iterator result: {d_output[0]} (expected: {expected_output})")
Parameters:

offset – The initial value of the sequence

Returns:

A CountingIterator object initialized to offset

cuda.cccl.parallel.experimental.iterators.ReverseIterator(sequence)#

Returns an Iterator over an array or another iterator in reverse.

Similar to [std::reverse_iterator](https://en.cppreference.com/w/cpp/iterator/reverse_iterator).

Examples

The code snippet below demonstrates the usage of a ReverseIterator as an input iterator:

"""
Example showing how to use reverse_input_iterator.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32)
d_input = cp.asarray(h_input)

# Create the reverse input iterator.
reverse_it = parallel.ReverseIterator(d_input)
d_output = cp.empty(len(d_input), dtype=np.int32)

# Prepare the initial value for the scan.
h_init = np.array(0, dtype=np.int32)

# Perform the inclusive scan.
parallel.inclusive_scan(
    reverse_it, d_output, parallel.OpKind.PLUS, h_init, len(d_input)
)

# Verify the result.
expected_output = np.array([5, 9, 12, 14, 15], dtype=np.int32)
result = d_output.get()

np.testing.assert_array_equal(result, expected_output)
print(f"Original input: {h_input}")
print(f"Reverse scan result: {result}")
print(f"Expected result: {expected_output}")

The code snippet below demonstrates the usage of a ReverseIterator as an output iterator:

"""
Example showing how to use reverse_output_iterator.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
h_input = np.array([1, 2, 3, 4, 5], dtype=np.int32)
d_input = cp.asarray(h_input)

# Prepare the output array.
d_output = cp.empty(len(d_input), dtype=np.int32)
h_init = np.array(0, dtype=np.int32)

# Create the reverse output iterator.
reverse_out_it = parallel.ReverseIterator(d_output)

# Perform the inclusive scan.
parallel.inclusive_scan(
    d_input, reverse_out_it, parallel.OpKind.PLUS, h_init, len(d_input)
)

# Verify the result.
expected_output = np.array([15, 10, 6, 3, 1], dtype=np.int32)
result = d_output.get()

np.testing.assert_array_equal(result, expected_output)
print(f"Original input: {h_input}")
print(f"Reverse output result: {result}")
print(f"Expected result: {expected_output}")
Parameters:

sequence – The iterator or array to be reversed

Returns:

A ReverseIterator object

cuda.cccl.parallel.experimental.iterators.TransformIterator(it, op)#

An iterator that applies a user-defined unary function to the elements of an underlying iterator as they are read.

Similar to [thrust::transform_iterator](https://nvidia.github.io/cccl/thrust/api/classthrust_1_1transform__iterator.html)

Example

The code snippet below demonstrates the usage of a TransformIterator composed with a CountingIterator to transform the input before performing a reduction.

"""
Demonstrate reduction with transform iterator.
"""

import functools

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel


def transform_op(a):
    return -a if a % 2 == 0 else a


# Prepare the input and output arrays.
first_item = 10
num_items = 100

transform_it = parallel.TransformIterator(
    parallel.CountingIterator(np.int32(first_item)), transform_op
)  # Input sequence
h_init = np.array([0], dtype=np.int64)  # Initial value for the reduction
d_output = cp.empty(1, dtype=np.int64)  # Storage for output

# Perform the reduction.
parallel.reduce_into(transform_it, d_output, parallel.OpKind.PLUS, num_items, h_init)

# Verify the result.
expected_output = functools.reduce(
    lambda a, b: a + b,
    [-a if a % 2 == 0 else a for a in range(first_item, first_item + num_items)],
)

# Test assertions
print(f"Transform iterator result: {d_output[0]} (expected: {expected_output})")
assert (d_output == expected_output).all()
assert d_output[0] == expected_output
Parameters:
  • it – The underlying iterator

  • op – The unary operation to be applied to values as they are read from it

Returns:

A TransformIterator object to transform the items in it using op

cuda.cccl.parallel.experimental.iterators.TransformOutputIterator(it, op)#

An iterator that applies a user-defined unary function to values before writing them to an underlying iterator.

Similar to [thrust::transform_output_iterator](https://nvidia.github.io/cccl/thrust/api/classthrust_1_1transform__output__iterator.html).

Example

The code snippet below demonstrates the usage of a TransformOutputIterator to transform the output of a reduction before writing to an output array.
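
A minimal sketch of that usage, assuming the reduce_into and OpKind.PLUS APIs shown elsewhere on this page; the operator name scale_op is illustrative.

"""
Sketch: transform the result of a reduction before it is written to the output.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input and output arrays.
d_input = cp.array([1.0, 2.0, 3.0, 4.0], dtype=np.float64)
d_output = cp.empty(1, dtype=np.float64)


# The unary operation applied to the reduced value before it is written.
def scale_op(x):
    return 0.5 * x


# Wrap the output array so that writes pass through scale_op first.
out_it = parallel.TransformOutputIterator(d_output, scale_op)

# Prepare the initial value for the reduction.
h_init = np.array([0.0], dtype=np.float64)

# Perform the reduction; the sum (10.0) is scaled to 5.0 before being stored.
parallel.reduce_into(d_input, out_it, parallel.OpKind.PLUS, len(d_input), h_init)

# Verify the result.
assert np.isclose(d_output.get()[0], 5.0)
print(f"Transform output iterator result: {d_output.get()[0]}")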

Parameters:
  • it – The underlying iterator

  • op – The operation to be applied to values before they are written to it

Returns:

A TransformOutputIterator object that applies op to transform values before writing them to it

cuda.cccl.parallel.experimental.iterators.ZipIterator(*iterators)#

Returns an Iterator representing a zipped sequence of values from N iterators.

Similar to https://nvidia.github.io/cccl/thrust/api/classthrust_1_1zip__iterator.html

The resulting iterator yields gpu_struct objects with fields corresponding to each input iterator. For 2 iterators, fields are named ‘first’ and ‘second’. For N iterators, fields are indexed as field_0, field_1, …, field_N-1.

Example

The code snippet below demonstrates the usage of a ZipIterator combining two device arrays:

"""
Example showing how to use zip_iterator to perform elementwise sum of two arrays.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel

# Prepare the input arrays.
d_input1 = cp.array([1, 2, 3, 4, 5], dtype=np.int32)
d_input2 = cp.array([10, 20, 30, 40, 50], dtype=np.int32)

# Create the zip iterator.
zip_it = parallel.ZipIterator(d_input1, d_input2)

# Prepare the output array.
num_items = len(d_input1)
d_output = cp.empty(num_items, dtype=np.int32)


def sum_paired_values(pair):
    """Extract values from the zip iterator pair and sum them."""
    return pair[0] + pair[1]


# Perform the unary transform.
parallel.unary_transform(zip_it, d_output, sum_paired_values, num_items)

# Calculate the expected results.
expected = d_input1.get() + d_input2.get()
result = d_output.get()

# Verify the result.
np.testing.assert_allclose(result, expected)

print(f"Input array 1: {d_input1.get()}")
print(f"Input array 2: {d_input2.get()}")
print(f"Elementwise sum result: {result}")
print(f"Expected result: {expected}")
Parameters:

*iterators – Variable number of iterators to zip (at least 1)

Returns:

A ZipIterator object that yields combined values from all input iterators

Utilities#

cuda.cccl.parallel.experimental.struct.gpu_struct_from_numba_types(name, field_names, field_types)#

Create a struct type from tuples of field names and numba types.

Parameters:
  • name (str) – The name of the struct class

  • field_names (tuple) – Tuple of field names

  • field_types (tuple) – Tuple of corresponding numba types

Returns:

A dynamically created struct class with the specified fields

Return type:

Type
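
Example

A minimal sketch of one plausible usage under the documented signature; the struct name, field names, and the dtype attribute shown at the end (mirroring gpu_struct-decorated classes below) are assumptions rather than taken from this reference.

"""
Sketch: create a struct type from field names and numba types.
"""

from numba import types

from cuda.cccl.parallel.experimental.struct import gpu_struct_from_numba_types

# Build a struct type with two int32 fields named "first" and "second".
Pair = gpu_struct_from_numba_types(
    "Pair", ("first", "second"), (types.int32, types.int32)
)

# The resulting class is expected to behave like a gpu_struct-decorated class
# (see gpu_struct below), e.g. exposing a numpy-compatible dtype.
print(Pair.dtype)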

cuda.cccl.parallel.experimental.struct.gpu_struct(this)#

Decorate a class as a GPU struct.

A GpuStruct represents a value composed of one or more other values, and is defined as a class with annotated fields (similar to a dataclass). The type of each field must be a subclass of np.number, like np.int32 or np.float64.

Arrays of GpuStruct objects can be used as inputs to cuda.cccl.parallel algorithms.

Example

The code snippet below shows how to use gpu_struct to define a MinMax type (composed of min_val and max_val fields), and perform a reduction on an input array of floating-point values to compute its smallest and largest absolute values:

"""
Simultaneously computing the minimum and maximum values of a sequence using `reduce_into`
with a custom data type.
"""

import cupy as cp
import numpy as np

import cuda.cccl.parallel.experimental as parallel


# Define a custom data type for the accumulator.
@parallel.gpu_struct
class MinMax:
    min_val: np.float64
    max_val: np.float64


# Define the binary operation for the reduction.
def minmax_op(v1: MinMax, v2: MinMax):
    c_min = min(v1.min_val, v2.min_val)
    c_max = max(v1.max_val, v2.max_val)
    return MinMax(c_min, c_max)


# Define a transform operation to convert a value `x` to MinMax(abs(x), abs(x)).
def transform_op(v):
    av = abs(v)
    return MinMax(av, av)


# Prepare the input and output data.
nelems = 4096
d_in = cp.random.randn(nelems)
tr_it = parallel.TransformIterator(d_in, transform_op)

d_out = cp.empty(tuple(), dtype=MinMax.dtype)

h_init = MinMax(np.inf, -np.inf)

# Perform the reduction.
parallel.reduce_into(tr_it, d_out, minmax_op, nelems, h_init)

# Verify the result.
actual = d_out.get()
h = np.abs(d_in.get())
expected = np.asarray([(h.min(), h.max())], dtype=MinMax.dtype)

assert actual == expected
print(f"MinMax reduction result: {actual}")
Parameters:

this (type)

Return type:

Type[Any]

cuda.cccl.parallel.experimental.struct.gpu_struct_from_numpy_dtype(name, np_dtype)#

Create a GPU struct from a numpy record dtype.
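
Example

A minimal sketch of one plausible usage under the documented signature; the record dtype and struct name are illustrative assumptions.

"""
Sketch: create a GPU struct class from a numpy record dtype.
"""

import numpy as np

from cuda.cccl.parallel.experimental.struct import gpu_struct_from_numpy_dtype

# A record dtype with two float64 fields.
minmax_dtype = np.dtype([("min_val", np.float64), ("max_val", np.float64)])

# Build the corresponding GPU struct class from the dtype.
MinMax = gpu_struct_from_numpy_dtype("MinMax", minmax_dtype)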