@@ -80,20 +80,18 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
8080 // General case with floor division semantics
8181 const auto vec_scalar = hn::Set (d, scalar);
8282 const auto vec_zero = hn::Zero (d);
83+ const auto one = hn::Set (d, static_cast <T>(1 ));
8384 size_t i = 0 ;
8485
8586 for (; i + N <= static_cast <size_t >(len); i += N) {
8687 const auto vec_src = hn::LoadU (d, src + i);
8788 auto vec_div = hn::Div (vec_src, vec_scalar);
8889 const auto vec_mul = hn::Mul (vec_div, vec_scalar);
89- const auto has_remainder = hn::Ne (vec_src, vec_mul);
90- const auto src_sign = hn::Lt (vec_src, vec_zero);
91- const auto scalar_sign = hn::Lt (vec_scalar, vec_zero);
92- const auto different_signs = hn::Xor (src_sign, scalar_sign);
93-
94- auto adjustment = hn::And (different_signs, has_remainder);
95- vec_div = hn::IfThenElse (adjustment, hn::Sub (vec_div, hn::Set (d, static_cast <T>(1 ))), vec_div);
90+ const auto eq_mask = hn::Eq (vec_src, vec_mul);
91+ const auto diff_signs = hn::Lt (hn::Xor (vec_src, vec_scalar), vec_zero);
92+ const auto adjust = hn::AndNot (eq_mask, diff_signs);
9693
94+ vec_div = hn::MaskedSubOr (vec_div, adjust, vec_div, one);
9795 hn::StoreU (vec_div, d, dst + i);
9896 }
9997
@@ -102,7 +100,7 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
102100 T n = src[i];
103101 T r = n / scalar;
104102 if (((n > 0 ) != (scalar > 0 )) && ((r * scalar) != n)) {
105- r-- ;
103+ --r ;
106104 }
107105 dst[i] = r;
108106 }
@@ -162,7 +160,7 @@ T floor_div(T n, T d) {
162160 }
163161 T r = n / d;
164162 if (((n > 0 ) != (d > 0 )) && ((r * d) != n)) {
165- r-- ;
163+ --r ;
166164 }
167165 return r;
168166}
0 commit comments