Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 9f3d13f

Browse files
Isalia20 authored and malfet committed
[MPS] nanmedian with dims (pytorch#149680)
Third most voted op from pytorch#77764. Tests were deleted because they are covered by the regular test_output_match tests, so those were redundant; they had been added in the last PR before the nanmedian dim version was implemented. Pull Request resolved: pytorch#149680. Approved by: https://github.com/malfet. Co-authored-by: Nikita Shulga <[email protected]>
1 parent 8fbcab8 commit 9f3d13f

File tree

3 files changed

+120
-124
lines changed

3 files changed

+120
-124
lines changed

aten/src/ATen/native/mps/operations/ReduceOps.mm

+119-90
Original file line numberDiff line numberDiff line change
@@ -617,8 +617,6 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t,
617617
static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) {
618618
bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
619619
MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, nanmedian ? "nanmedian" : "median");
620-
TORCH_CHECK(!nanmedian || isFloatingType(input_t.scalar_type()),
621-
"Only floating point tensors can have Nans in the tensor");
622620

623621
IntArrayRef input_shape = input_t.sizes();
624622
int64_t num_in_elements = c10::multiply_integers(input_shape);
@@ -1507,19 +1505,63 @@ Tensor median_mps(const Tensor& input_t) {
15071505
return median_common_mps(input_t, /*nanmedian=*/false);
15081506
}
15091507

1510-
static void median_out_mps(const Tensor& input_t,
1511-
int64_t dim,
1512-
bool keepdim,
1513-
const Tensor& output_t,
1514-
const Tensor& indices_t,
1515-
const std::string& func_name) {
1516-
if (output_t.numel() == 0) {
1508+
static void median_out_mps_common(const Tensor& input_t,
1509+
int64_t dim,
1510+
bool keepdim,
1511+
Tensor& values,
1512+
Tensor& indices,
1513+
const std::string& func_name,
1514+
bool nanmedian) {
1515+
bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
1516+
MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "median_out");
1517+
1518+
int64_t dim_ = maybe_wrap_dim(dim, input_t.dim());
1519+
native::zero_numel_check_dims(input_t, dim_, "max()");
1520+
1521+
// Calculate the output shape according to keepdim=True
1522+
// If there is no dim argument, the input shape is flattened
1523+
IntArrayRef input_shape = input_t.sizes();
1524+
int64_t num_input_dims = input_shape.size();
1525+
NSMutableArray<NSNumber*>* apparent_out_shape = nil;
1526+
// Use this if keepdim is false
1527+
int64_t num_output_dims = num_input_dims - 1 < 0 ? 0 : num_input_dims - 1;
1528+
1529+
std::vector<int64_t> vec_apparent_out_shape(num_input_dims);
1530+
std::vector<int64_t> vec_out_shape(num_output_dims);
1531+
1532+
apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
1533+
// Counter for shape when keepdim is false
1534+
int out_i = 0;
1535+
for (const auto i : c10::irange(num_input_dims)) {
1536+
if (dim_ == i) {
1537+
apparent_out_shape[i] = @1;
1538+
vec_apparent_out_shape[i] = 1;
1539+
} else {
1540+
apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
1541+
vec_apparent_out_shape[i] = input_shape[i];
1542+
vec_out_shape[out_i] = input_shape[i];
1543+
out_i++;
1544+
}
1545+
}
1546+
1547+
if (!keepdim) {
1548+
values =
1549+
at::empty(IntArrayRef(vec_out_shape), input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
1550+
indices = at::empty(IntArrayRef(vec_out_shape), ScalarType::Long, std::nullopt, kMPS, std::nullopt, std::nullopt);
1551+
} else {
1552+
values = at::empty(
1553+
IntArrayRef(vec_apparent_out_shape), input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
1554+
indices = at::empty(
1555+
IntArrayRef(vec_apparent_out_shape), ScalarType::Long, std::nullopt, kMPS, std::nullopt, std::nullopt);
1556+
}
1557+
1558+
if (values.numel() == 0 || input_t.numel() == 0) {
15171559
return;
15181560
}
15191561

15201562
if (input_t.numel() == 1 && input_t.dim() == 0) {
1521-
output_t.fill_(input_t);
1522-
indices_t.fill_(0);
1563+
values.fill_(input_t);
1564+
indices.fill_(0);
15231565
return;
15241566
}
15251567

@@ -1531,18 +1573,6 @@ static void median_out_mps(const Tensor& input_t,
15311573
MPSGraphTensor* indicesTensor_ = nil;
15321574
};
15331575

1534-
bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
1535-
MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "median_out");
1536-
1537-
int64_t dim_ = maybe_wrap_dim(dim, input_t.dim());
1538-
1539-
// Calculate the output shape according to keepdim=True
1540-
// If there is no dim argument, the input shape is flattened
1541-
IntArrayRef input_shape = input_t.sizes();
1542-
int64_t num_input_dims = input_shape.size();
1543-
NSMutableArray<NSNumber*>* apparent_out_shape = nil;
1544-
1545-
apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
15461576
for (const int i : c10::irange(num_input_dims)) {
15471577
apparent_out_shape[i] = dim_ == i ? @1 : [NSNumber numberWithInt:input_shape[i]];
15481578
}
@@ -1552,35 +1582,67 @@ static void median_out_mps(const Tensor& input_t,
15521582

15531583
@autoreleasepool {
15541584
string key = func_name + ":" + std::to_string(dim_) + ":" + getTensorsStringKey(input_t) + ":" +
1555-
getTensorsStringKey(indices_t);
1585+
getTensorsStringKey(indices);
15561586
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
15571587
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
15581588
MPSGraphTensor* castInputTensor =
15591589
castToIHFTypes(mpsGraph, inputTensor, input_t, /*includesInt64=*/macOS13_3_plus);
15601590

1561-
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor axis:((NSUInteger)(int)dim_)name:nil];
1562-
1563-
MPSGraphTensor* outputTensor = [mpsGraph sliceTensor:sortedTensor
1564-
dimension:dim_
1565-
start:((NSUInteger)(int)((dim_total_elements + 1) / 2) - 1)
1566-
length:1
1567-
name:nil];
1568-
MPSGraphTensor* argreduceOutTensor = nil;
1569-
argreduceOutTensor = [mpsGraph argSortWithTensor:castInputTensor axis:(NSInteger)dim_ name:@"argmax_out"];
1570-
MPSGraphTensor* argOutputTensor = [mpsGraph sliceTensor:argreduceOutTensor
1571-
dimension:dim_
1572-
start:((NSUInteger)(int)((dim_total_elements + 1) / 2) - 1)
1573-
length:1
1574-
name:nil];
1591+
MPSGraphTensor* effectiveLengthTensor = nil;
1592+
if (nanmedian) {
1593+
MPSGraphTensor* isNanTensor = [mpsGraph isNaNWithTensor:castInputTensor name:nil];
1594+
MPSGraphTensor* nanCountTensor = [mpsGraph reductionSumWithTensor:isNanTensor
1595+
axis:(NSInteger)dim_
1596+
name:@"nanCount"];
1597+
MPSGraphTensor* nanCountTensorInt = [mpsGraph castTensor:nanCountTensor
1598+
toType:MPSDataTypeInt32
1599+
name:@"nanCountInt"];
1600+
MPSGraphTensor* dimSizeTensor = [mpsGraph constantWithScalar:dim_total_elements
1601+
shape:@[]
1602+
dataType:MPSDataTypeInt32];
1603+
// effective count: effectiveLength = dim_size - nan_count.
1604+
effectiveLengthTensor = [mpsGraph subtractionWithPrimaryTensor:dimSizeTensor
1605+
secondaryTensor:nanCountTensorInt
1606+
name:@"effectiveLength"];
1607+
} else {
1608+
effectiveLengthTensor = [mpsGraph constantWithScalar:dim_total_elements
1609+
shape:apparent_out_shape
1610+
dataType:MPSDataTypeInt32];
1611+
}
1612+
// median index = ((effectiveLength + 1) / 2) - 1.
1613+
MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1 shape:@[] dataType:MPSDataTypeInt32];
1614+
MPSGraphTensor* twoTensor = [mpsGraph constantWithScalar:2 shape:@[] dataType:MPSDataTypeInt32];
1615+
MPSGraphTensor* effectivePlusOne = [mpsGraph additionWithPrimaryTensor:effectiveLengthTensor
1616+
secondaryTensor:oneTensor
1617+
name:@"effectivePlusOne"];
1618+
MPSGraphTensor* halfEffective = [mpsGraph divisionWithPrimaryTensor:effectivePlusOne
1619+
secondaryTensor:twoTensor
1620+
name:@"halfEffective"];
1621+
MPSGraphTensor* medianIdxTensor = [mpsGraph subtractionWithPrimaryTensor:halfEffective
1622+
secondaryTensor:oneTensor
1623+
name:@"medianIdx"];
15751624

1625+
MPSGraphTensor* sortedTensor = [mpsGraph sortWithTensor:castInputTensor axis:((NSUInteger)(int)dim_)name:nil];
1626+
MPSGraphTensor* sortedIndicesTensor = [mpsGraph argSortWithTensor:castInputTensor
1627+
axis:(NSInteger)dim_
1628+
name:@"argsort_out"];
1629+
1630+
MPSGraphTensor* medianValueTensor = [mpsGraph gatherAlongAxis:dim_
1631+
withUpdatesTensor:sortedTensor
1632+
indicesTensor:medianIdxTensor
1633+
name:@"gather_medianValue"];
1634+
MPSGraphTensor* medianIndexTensor = [mpsGraph gatherAlongAxis:dim_
1635+
withUpdatesTensor:sortedIndicesTensor
1636+
indicesTensor:medianIdxTensor
1637+
name:@"gather_medianValue"];
15761638
newCachedGraph->inputTensor_ = inputTensor;
1577-
newCachedGraph->outputTensor_ = outputTensor;
1578-
newCachedGraph->indicesTensor_ = argOutputTensor;
1639+
newCachedGraph->outputTensor_ = medianValueTensor;
1640+
newCachedGraph->indicesTensor_ = medianIndexTensor;
15791641
});
15801642

15811643
auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t);
1582-
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape);
1583-
auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape);
1644+
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, values, apparent_out_shape);
1645+
auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices, apparent_out_shape);
15841646

15851647
auto feeds = dictionaryFromPlaceholders(inputPlaceholder);
15861648
auto results = dictionaryFromPlaceholders(outputPlaceholder, indicesPlaceholder);
@@ -1617,59 +1679,26 @@ static void median_out_mps(const Tensor& input_t,
16171679
bool keepdim,
16181680
at::Tensor& values,
16191681
at::Tensor& indices) {
1620-
bool macOS13_3_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_13_3_PLUS);
1621-
MPS_CHECK_INT64_OP_SUPPORTED(input_t, macOS13_3_plus, "median_out");
1622-
1623-
int64_t dim_ = maybe_wrap_dim(dim, input_t.dim());
1624-
native::zero_numel_check_dims(input_t, dim_, "max()");
1625-
1626-
// Calculate the output shape according to keepdim=True
1627-
// If there is no dim argument, the input shape is flattened
1628-
IntArrayRef input_shape = input_t.sizes();
1629-
int64_t num_input_dims = input_shape.size();
1630-
NSMutableArray<NSNumber*>* apparent_out_shape = nil;
1631-
// Use this if keepdim is false
1632-
int64_t num_output_dims = num_input_dims - 1 < 0 ? 0 : num_input_dims - 1;
1633-
1634-
std::vector<int64_t> vec_apparent_out_shape(num_input_dims);
1635-
std::vector<int64_t> vec_out_shape(num_output_dims);
1636-
1637-
apparent_out_shape = [NSMutableArray<NSNumber*> arrayWithCapacity:num_input_dims];
1638-
// Counter for shape when keepdim is false
1639-
int out_i = 0;
1640-
for (const auto i : c10::irange(num_input_dims)) {
1641-
if (dim_ == i) {
1642-
apparent_out_shape[i] = @1;
1643-
vec_apparent_out_shape[i] = 1;
1644-
} else {
1645-
apparent_out_shape[i] = [NSNumber numberWithInt:input_shape[i]];
1646-
vec_apparent_out_shape[i] = input_shape[i];
1647-
vec_out_shape[out_i] = input_shape[i];
1648-
out_i++;
1649-
}
1650-
}
1651-
1652-
if (!keepdim) {
1653-
values =
1654-
at::empty(IntArrayRef(vec_out_shape), input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
1655-
indices = at::empty(IntArrayRef(vec_out_shape), ScalarType::Long, std::nullopt, kMPS, std::nullopt, std::nullopt);
1656-
} else {
1657-
values = at::empty(
1658-
IntArrayRef(vec_apparent_out_shape), input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
1659-
indices = at::empty(
1660-
IntArrayRef(vec_apparent_out_shape), ScalarType::Long, std::nullopt, kMPS, std::nullopt, std::nullopt);
1661-
}
1682+
median_out_mps_common(input_t, dim, keepdim, values, indices, "median_out_mps", false);
1683+
return std::tuple<Tensor&, Tensor&>{values, indices};
1684+
}
16621685

1663-
if (values.numel() == 0 || input_t.numel() == 0) {
1664-
return std::tuple<Tensor&, Tensor&>{values, indices};
1686+
std::tuple<Tensor&, Tensor&> nanmedian_out_mps(const at::Tensor& self,
1687+
int64_t dim,
1688+
bool keepdim,
1689+
at::Tensor& values,
1690+
at::Tensor& indices) {
1691+
if (c10::isIntegralType(self.scalar_type(), true)) {
1692+
return median_out_mps(self, dim, keepdim, values, indices);
16651693
}
1666-
1667-
median_out_mps(input_t, dim, keepdim, values, indices, "median_out_mps");
1668-
1669-
return std::tuple<Tensor&, Tensor&>{values, indices};
1694+
median_out_mps_common(self, dim, keepdim, values, indices, "nanmedian_out_mps", true);
1695+
return std::tie(values, indices);
16701696
}
16711697

16721698
Tensor nanmedian_mps(const Tensor& self) {
1699+
if (c10::isIntegralType(self.scalar_type(), true)) {
1700+
return median_mps(self);
1701+
}
16731702
return median_common_mps(self, /*nanmedian=*/true);
16741703
}
16751704

aten/src/ATen/native/native_functions.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -4011,6 +4011,7 @@
40114011
dispatch:
40124012
CPU: nanmedian_out_cpu
40134013
CUDA: nanmedian_out_cuda
4014+
MPS: nanmedian_out_mps
40144015

40154016
- func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
40164017
variants: function, method

test/test_mps.py

-34
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,6 @@ def mps_ops_modifier(ops):
613613
'masked.median': None,
614614
'matrix_exp': None,
615615
'mode': None,
616-
'nanmedian': None,
617616
'native_dropout_backward': None,
618617
'normnuc': None,
619618
'nn.functional.fractional_max_pool2d': None,
@@ -5490,39 +5489,6 @@ def helper_dtype_float32(n1, n2, n3):
54905489
helper_dtype_float32(3, 3, 3)
54915490
helper_dtype_float32(1, 1, 1)
54925491

5493-
@parametrize("dtype", [torch.float32, torch.float16])
5494-
def test_nanmedian(self, dtype):
5495-
def helper(n1, n2, n3, dtype, add_nans=False):
5496-
cpu_x = torch.randn(n1, n2, n3, device='cpu', dtype=dtype)
5497-
5498-
if add_nans and dtype in [torch.float32, torch.float16]:
5499-
nan_mask = torch.rand(n1, n2, n3) < 0.2
5500-
cpu_x = cpu_x.clone()
5501-
cpu_x[nan_mask] = float('nan')
5502-
5503-
mps_x = cpu_x.clone().to('mps')
5504-
5505-
y_cpu = torch.nanmedian(cpu_x)
5506-
y_mps = torch.nanmedian(mps_x)
5507-
self.assertEqual(y_cpu, y_mps)
5508-
5509-
# test with no nans(to test the caching of the graph and behaviour when there are no nans)
5510-
helper(10, 10, 10, dtype)
5511-
helper(3, 3, 3, dtype)
5512-
helper(1, 1, 1, dtype)
5513-
helper(1, 2, 3, dtype)
5514-
# test with some random nans added
5515-
helper(10, 10, 10, dtype, add_nans=True)
5516-
helper(3, 3, 3, dtype, add_nans=True)
5517-
helper(2, 2, 3, dtype, add_nans=True)
5518-
5519-
# mix of NaNs and regular values where a median would output 3.0 while nanmedian outputs 2.0
5520-
cpu_x = torch.tensor([float('nan'), 1.0, 2.0, float('nan'), 3.0], device='cpu', dtype=dtype)
5521-
mps_x = cpu_x.detach().clone().to('mps')
5522-
y_cpu = torch.nanmedian(cpu_x)
5523-
y_mps = torch.nanmedian(mps_x)
5524-
self.assertEqual(y_cpu, y_mps)
5525-
55265492
def test_any(self):
55275493
def helper(shape):
55285494
input_xs = []

0 commit comments

Comments (0)