- 
                Notifications
    You must be signed in to change notification settings 
- Fork 463
Use P2322R6 to determine intermediate types for relevant algorithms #428
Description
The reduction_op is called with a type derived from the output iterator.  Example:
#include <cub/cub.cuh>
#include <iostream>
// Binary "minimum" functor handed to the reduction as its reduction operator.
// It is generic over T and only requires that `b < a` is well-formed for T.
struct CustomMin {
	// Returns `b` when it compares strictly less than `a`, otherwise `a`.
	template <typename T>
	__device__ __host__ __forceinline__ T operator()(const T& a,
	                                                 const T& b) const {
		if (b < a) {
			return b;
		}
		return a;
	}
};
// Output element type of the example: a wrapper around a single double.
// It is implicitly constructible from int and declares no comparison operators.
struct result_t {
	// Stored value; zero unless set by the converting constructor.
	double data = 0;

	result_t() = default;

	// Converting constructor: stores the integer as a double.
	__device__ __host__ result_t(int i) : data(static_cast<double>(i)) {
	}
};
// Streams a result_t by writing its `data` member; returns the stream so
// insertions can be chained.
inline std::ostream& operator<<(std::ostream& os, const result_t& r) {
	return os << r.data;
}
// Reproducer: segmented min-reduction over int input written to an OT output.
// With OT = result_t the reduction call is what triggers the reported error.
int main() {
	using OT = result_t;
	//using OT = int; // compiles

	const int num_segments = 3;
	const int num_values = 7;

	// Host-side tables for the example data. Consecutive offset entries are
	// later passed as the begin/end offsets of each segment.
	const int offset_table[num_segments + 1] = {0, 3, 3, 7};
	const int value_table[num_values] = {8, 6, 7, 5, 3, 0, 9};

	// Managed allocations: filled from the host below, read back after the sync.
	int* d_offsets;
	int* d_in;
	OT* d_out;
	cudaMallocManaged(&d_offsets, sizeof(int) * (num_segments + 1));
	cudaMallocManaged(&d_in, sizeof(int) * num_values);
	cudaMallocManaged(&d_out, sizeof(OT) * num_segments);

	for (int s = 0; s <= num_segments; ++s) {
		d_offsets[s] = offset_table[s];
	}
	for (int v = 0; v < num_values; ++v) {
		d_in[v] = value_table[v];
	}

	CustomMin min_op;
	int initial_value = INT_MAX;

	void* d_temp_storage = NULL;
	size_t temp_storage_bytes = 0;

	// The same reduction call is issued twice; it sees the current values of
	// d_temp_storage and temp_storage_bytes on each invocation.
	auto run_reduce = [&]() {
		cub::DeviceSegmentedReduce::Reduce(d_temp_storage,
		                                   temp_storage_bytes,
		                                   d_in,
		                                   d_out,
		                                   num_segments,
		                                   d_offsets,
		                                   d_offsets + 1,
		                                   min_op,
		                                   initial_value);
	};

	// First call is made with a null temp-storage pointer; temp_storage_bytes
	// is then used as the size of the temporary allocation.
	run_reduce();
	cudaMalloc(&d_temp_storage, temp_storage_bytes);
	// Second call runs with the allocated temporary storage.
	run_reduce();

	// Wait for the device before reading d_out on the host.
	cudaDeviceSynchronize();
	for (int seg = 0; seg < num_segments; ++seg) {
		std::cout << "d_out[" << seg << "] = " << d_out[seg] << std::endl;
	}

	cudaFree(d_offsets);
	cudaFree(d_in);
	cudaFree(d_out);
	cudaFree(d_temp_storage);
}
If this snippet is compiled with CUDA 11.4.48, CUB on commit 93f26ab and Thrust on commit 0b00326becfdd7a78182b36d0752c41b341863b2, which represent the current state of the default branches of CUB and Thrust, you receive the error:
reduction.cu(8): error: no operator "<" matches these operands
            operand types are: const cub::detail::non_void_value_t<result_t *, cub::detail::value_t<int *>> < const cub::detail::non_void_value_t<result_t *, cub::detail::value_t<int *>>
          detected during:
            instantiation of "T CustomMin::operator()(const T &, const T &) const [with T=cub::detail::non_void_value_t<result_t *, cub::detail::value_t<int *>>]" 
/home/cklein2/repositories/cub/cub/device/dispatch/../../agent/agent_reduce.cuh(297): here
            instantiation of "void cub::AgentReduce<AgentReducePolicy, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::ConsumeTile<IS_FIRST_TILE,CAN_VECTORIZE>(cub::AgentReduce<AgentReducePolicy, InputIteratorT, OutputIteratorT, OffsetT, ReductionOp>::OutputT &, OffsetT, int, cub::Int2Type<0>, cub::Int2Type<CAN_VECTORIZE>) [with AgentReducePolicy=cub::AgentReducePolicy<256, 16, cub::detail::value_t<int *>, 4, cub::BLOCK_REDUCE_WARP_REDUCTIONS, cub::LOAD_LDG, cub::MemBoundScaling<256, 16, cub::detail::value_t<int *>>>, InputIteratorT=int *, OutputIteratorT=result_t *, OffsetT=int, ReductionOp=CustomMin, IS_FIRST_TILE=1, CAN_VECTORIZE=1]"
...
Compilation also fails with the CUB and Thrust versions that ship with CUDA Toolkit 11.4.48.
Clearly, the reduction operator is invoked with a type based on result_t. The documentation currently says:
ReductionOp | [inferred] Binary reduction functor type having member `T operator()(const T &a, const T &b)`
T | [inferred] Data element type that is convertible to the value type of InputIteratorT
I think it is intuitive to assume that the reduction operator is called with a type derived from the input iterator and not from the output iterator. So maybe the documentation can state more precisely how type T is derived.