Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 617e493

Browse files
optimise further
1 parent ef0f010 commit 617e493

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

numpy/_core/src/umath/loops_arithmetic.dispatch.cpp

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -80,20 +80,18 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
8080
// General case with floor division semantics
8181
const auto vec_scalar = hn::Set(d, scalar);
8282
const auto vec_zero = hn::Zero(d);
83+
const auto one = hn::Set(d, static_cast<T>(1));
8384
size_t i = 0;
8485

8586
for (; i + N <= static_cast<size_t>(len); i += N) {
8687
const auto vec_src = hn::LoadU(d, src + i);
8788
auto vec_div = hn::Div(vec_src, vec_scalar);
8889
const auto vec_mul = hn::Mul(vec_div, vec_scalar);
89-
const auto has_remainder = hn::Ne(vec_src, vec_mul);
90-
const auto src_sign = hn::Lt(vec_src, vec_zero);
91-
const auto scalar_sign = hn::Lt(vec_scalar, vec_zero);
92-
const auto different_signs = hn::Xor(src_sign, scalar_sign);
93-
94-
auto adjustment = hn::And(different_signs, has_remainder);
95-
vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))), vec_div);
90+
const auto eq_mask = hn::Eq(vec_src, vec_mul);
91+
const auto diff_signs = hn::Lt(hn::Xor(vec_src, vec_scalar), vec_zero);
92+
const auto adjust = hn::AndNot(eq_mask, diff_signs);
9693

94+
vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one);
9795
hn::StoreU(vec_div, d, dst + i);
9896
}
9997

@@ -102,7 +100,7 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
102100
T n = src[i];
103101
T r = n / scalar;
104102
if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) {
105-
r--;
103+
--r;
106104
}
107105
dst[i] = r;
108106
}
@@ -162,7 +160,7 @@ T floor_div(T n, T d) {
162160
}
163161
T r = n / d;
164162
if (((n > 0) != (d > 0)) && ((r * d) != n)) {
165-
r--;
163+
--r;
166164
}
167165
return r;
168166
}

0 commit comments

Comments
 (0)