Fix CUDA bf16 median filter (OpenNMT#1972)

sssshhhhhh · web-flow · commit d09369dce260 · 2026-01-10T10:22:49.000+01:00
diff --git a/include/ctranslate2/ops/median_filter.h b/include/ctranslate2/ops/median_filter.h
@@ -1,12 +1,13 @@
 #pragma once
+
 #include "op.h"
 
 namespace ctranslate2 {
   namespace ops {
 
     class MedianFilter : public Op {
     public:
-      explicit MedianFilter(dim_t width);
+      explicit MedianFilter(const dim_t width);  // explicit: forbid implicit dim_t -> MedianFilter conversion
       void operator()(const StorageView& input, StorageView& output) const;
 
     private:
diff --git a/src/ops/median_filter.cc b/src/ops/median_filter.cc
@@ -5,7 +5,7 @@
 namespace ctranslate2 {
   namespace ops {
 
-    MedianFilter::MedianFilter(dim_t width)
+    MedianFilter::MedianFilter(const dim_t width)
       : _width(width)
       {
       }
diff --git a/src/ops/median_filter_gpu.cu b/src/ops/median_filter_gpu.cu
@@ -1,35 +1,12 @@
 #include "ctranslate2/ops/median_filter.h"
 
-#include <cuda_fp16.h>
-#ifdef CUDA_BF16_AVAILABLE
-#include <cuda_bf16.h>
-#endif
-
-#include "type_dispatch.h"
 #include "cuda/helpers.h"
-#include <type_traits>
 
 namespace ctranslate2 {
   namespace ops {
 
     constexpr dim_t num_threads = 256;
-
-    // Conversion helpers
-    __device__ __forceinline__ float to_float(float v) { return v; }
-    __device__ __forceinline__ float to_float(const half v) { return __half2float(v); }
-#ifdef CUDA_BF16_AVAILABLE
-    __device__ __forceinline__ float to_float(const __nv_bfloat16 v) { return __bfloat162float(v); }
-#endif
-
-    __device__ __forceinline__ float from_float(float v) { return v; }
-    __device__ __forceinline__ half from_float_half(float v) { return __float2half(v); }
-#ifdef CUDA_BF16_AVAILABLE
-    __device__ __forceinline__ __nv_bfloat16 from_float_bf16(float v) { return __float2bfloat16(v); }
-#endif
-
-    namespace {
-      constexpr int kMaxWindow = 129; // supports window widths up to 129 (rank 64)
-    }
+    constexpr int kMaxWindow = 129; // supports window widths up to 129 (rank 64)
 
     template <typename DeviceT, int kMax>
     __global__ void sliding_median_lastdim_kernel(const DeviceT* input,
@@ -45,15 +22,6 @@ namespace ctranslate2 {
       int col = tid % depth;
       const int rank = width / 2;
 
-      if (depth <= rank) {
-        output[tid] = input[tid];
-        return;
-      }
-      if (width > kMax) {
-        output[tid] = input[tid];
-        return;
-      }
-
       float window[kMax];
 
       const int row_offset = row * depth;
@@ -62,7 +30,7 @@ namespace ctranslate2 {
         int read = col + k;
         if (read < 0) read = -read;
         if (read >= depth) read = 2 * depth - read - 2;
-        window[k + rank] = to_float(input[row_offset + read]);
+        window[k + rank] = float(input[row_offset + read]);
       }
 
       // Insertion sort (width is small: <= kMax, typically < 129).
@@ -75,24 +43,13 @@ namespace ctranslate2 {
         }
         window[j + 1] = key;
       }
-      float median = window[rank];
-
-      if constexpr (std::is_same<DeviceT, float>::value) {
-        output[tid] = median;
-      } else if constexpr (std::is_same<DeviceT, half>::value) {
-        output[tid] = from_float_half(median);
-#ifdef CUDA_BF16_AVAILABLE
-      } else if constexpr (std::is_same<DeviceT, __nv_bfloat16>::value) {
-        output[tid] = from_float_bf16(median);
-#endif
-      }
+      output[tid] = DeviceT(window[rank]);
     }
 
     template <Device D, typename T>
     void MedianFilter::compute(const StorageView& input,
                               const dim_t axis_size,
                               StorageView& output) const {
-      output.resize_as(input);
       const int depth = static_cast<int>(axis_size);
       const int rows = static_cast<int>(input.size() / depth);
       const int width = static_cast<int>(_width);
@@ -130,12 +87,10 @@ namespace ctranslate2 {
         rows,
         depth,
         width);
-      CUDA_CHECK(cudaGetLastError());
-      CUDA_CHECK(cudaDeviceSynchronize());
     }
 
-#define DECLARE_IMPL(T)                                         \
-    template void                                               \
+#define DECLARE_IMPL(T)                                                 \
+    template void                                                       \
     MedianFilter::compute<Device::CUDA, T>(const StorageView& input,    \
                                            const dim_t axis_size,       \
                                            StorageView& output) const;
diff --git a/tests/ops_test.cc b/tests/ops_test.cc
@@ -125,8 +125,10 @@ class OpDeviceFPTest : public ::testing::TestWithParam<FloatType> {
 };
 
 
-TEST_P(OpDeviceTest, MedianFilter) {
-  Device device = GetParam();
+TEST_P(OpDeviceFPTest, MedianFilter) {
+  Device device = GetParam().device;
+  const DataType dtype = GetParam().dtype;
+  const float error = GetParam().error;
   StorageView x({2, 8}, std::vector<float>{
       0.2556743323802948, 0.8028775453567505, 0.3514494299888611, 0.3542254865169525,
       0.5881291031837463, 0.1458204835653305, 0.6845740675926208, 0.543143630027771,
@@ -139,9 +141,9 @@ TEST_P(OpDeviceTest, MedianFilter) {
       0.9039326310157776, 0.4063926637172699, 0.7943458557128906, 0.4063926637172699,
       0.7943458557128906, 0.4063926637172699, 0.7943458557128906, 0.289182186126709},
       device);
-  StorageView y(device);
-  ops::MedianFilter(5)(x, y);
-  expect_storage_eq(y, expected);
+  StorageView y(dtype, device);
+  ops::MedianFilter(5)(x.to(dtype), y);
+  expect_storage_eq(y.to_float32(), expected, error);
 }
 
 TEST_P(OpDeviceTest, Add) {

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`namespace ctranslate2 {`
`6`	`6`	`namespace ops {`
`7`	`7`
`8`		`- MedianFilter::MedianFilter(dim_t width)`
	`8`	`+ MedianFilter::MedianFilter(const dim_t width)`
`9`	`9`	`: _width(width)`
`10`	`10`	`{`
`11`	`11`	`}`