This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Use P2322R6 to determine intermediate types for relevant algorithms #428

@codecircuit

Description

The reduction_op is called with a type derived from the output iterator. Example:

#include <cub/cub.cuh>
#include <climits>
#include <iostream>

struct CustomMin {
	template <typename T>
	__device__ __host__ __forceinline__ T operator()(const T& a,
	                                                 const T& b) const {
		return (b < a) ? b : a;
	}
};

struct result_t {
	result_t() = default;
	__device__ __host__ result_t(int i) : data(i) {
	}

	double data = 0;
};

inline std::ostream& operator<<(std::ostream& os, const result_t& r) {
	os << r.data;
	return os;
}

int main() {
	int* d_offsets;
	int* d_in;
	using OT = result_t;
	//using OT = int; // compiles
	OT* d_out;
	CustomMin min_op;

	int initial_value = INT_MAX;

	const int num_segments = 3;
	const int num_values = 7;

	cudaMallocManaged(&d_offsets, sizeof(int) * (num_segments + 1));
	cudaMallocManaged(&d_in, sizeof(int) * num_values);
	cudaMallocManaged(&d_out, sizeof(OT) * num_segments);

	d_offsets[0] = 0;
	d_offsets[1] = 3;
	d_offsets[2] = 3;
	d_offsets[3] = 7;

	d_in[0] = 8;
	d_in[1] = 6;
	d_in[2] = 7;
	d_in[3] = 5;
	d_in[4] = 3;
	d_in[5] = 0;
	d_in[6] = 9;

	void* d_temp_storage = NULL;
	size_t temp_storage_bytes = 0;

	cub::DeviceSegmentedReduce::Reduce(d_temp_storage,
	                                   temp_storage_bytes,
	                                   d_in,
	                                   d_out,
	                                   num_segments,
	                                   d_offsets,
	                                   d_offsets + 1,
	                                   min_op,
	                                   initial_value);

	cudaMalloc(&d_temp_storage, temp_storage_bytes);

	cub::DeviceSegmentedReduce::Reduce(d_temp_storage,
	                                   temp_storage_bytes,
	                                   d_in,
	                                   d_out,
	                                   num_segments,
	                                   d_offsets,
	                                   d_offsets + 1,
	                                   min_op,
	                                   initial_value);
	cudaDeviceSynchronize();

	for (int i = 0; i < num_segments; ++i) {
		std::cout << "d_out[" << i << "] = " << d_out[i] << std::endl;
	}

	cudaFree(d_offsets);
	cudaFree(d_in);
	cudaFree(d_out);
	cudaFree(d_temp_storage);
}

If this snippet is compiled with CUDA 11.4.48, CUB at commit 93f26ab, and Thrust at commit 0b00326becfdd7a78182b36d0752c41b341863b2 (the current tips of the default branches of CUB and Thrust), you receive the error:

reduction.cu(8): error: no operator "<" matches these operands
            operand types are: const cub::detail::non_void_value_t<result_t *, cub::detail::value_t<int *>> < const cub::detail::non_void_value_t<result_t *, cub::detail::value_t<int *>>
          detected during:
            instantiation of "T CustomMin::operator()(const T &, const T &) const [with T=cub::detail::non_void_value_t<result_t *, cub::detail::value_t<int *>>]" 
/home/cklein2/repositories/cub/cub/device/dispatch/../../agent/agent_reduce.cuh(297): here
            instantiation of "void cub::AgentReduce<AgentReducePolicy, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::ConsumeTile<IS_FIRST_TILE,CAN_VECTORIZE>(cub::AgentReduce<AgentReducePolicy, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::OutputT &, OffsetT, int, cub::Int2Type<0>, cub::Int2Type<CAN_VECTORIZE>) [with AgentReducePolicy=cub::AgentReducePolicy<256, 16, cub::detail::value_t<int *>, 4, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_LDG, cub::MemBoundScaling<256, 16, cub::detail::value_t<int *>>>, InputIteratorT=int *, OutputIteratorT=result_t *, OffsetT=int, ReductionOp=CustomMin, IS_FIRST_TILE=1, CAN_VECTORIZE=1]"
...

The compilation also fails with the CUB and Thrust versions that ship with CUDA Toolkit 11.4.48.
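For what it's worth, the snippet appears to compile once result_t is given an operator< (a hypothetical addition for illustration, not part of the reproducer):

__host__ __device__ inline bool operator<(const result_t& a,
                                          const result_t& b) {
	return a.data < b.data;
}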
Clearly, the reduction operator is invoked with a type based on result_t. The documentation currently says:

ReductionOp | [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
T | [inferred] Data element type that is convertible to the value type of InputIteratorT

I think it is intuitive to assume that the reduction operator is called with a type derived from the input iterator, not from the output iterator. At the very least, the documentation should state more precisely how the type T is derived; adopting the P2322R6 rule for the intermediate (accumulator) type, as the title suggests, would make this behavior both explicit and intuitive.
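For comparison, here is a minimal host-only sketch of the P2322R6-style deduction: the intermediate (accumulator) type is derived from the initial value and the input iterator via the reduction op, never from the output iterator. accum_t and the simplified CustomMin below are illustrative only and not part of CUB:

#include <iterator>
#include <type_traits>

// Host-only stand-in for the CustomMin functor from the reproducer above.
struct CustomMin {
	template <typename T>
	T operator()(const T& a, const T& b) const {
		return (b < a) ? b : a;
	}
};

// P2322R6-style accumulator type:
//   decay_t<invoke_result_t<Op&, InitT, /* input reference */>>
// (approximated here with the input iterator's value type).
template <typename InputIt, typename InitT, typename BinaryOp>
using accum_t = std::decay_t<std::invoke_result_t<
	BinaryOp&, InitT, typename std::iterator_traits<InputIt>::value_type>>;

// For the reproducer (InputIt = int*, InitT = int) the op would only ever be
// invoked with ints, regardless of the output value type result_t.
static_assert(std::is_same_v<accum_t<int*, int, CustomMin>, int>, "");

Under such a rule the reproducer above should compile as-is, since the conversion from int to result_t would only happen when the final aggregate is written to d_out.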
