Commit c3ecde7

now correct logic
1 parent 6ce1862 commit c3ecde7

1 file changed: numpy/_core/src/umath/loops_arithmetic.dispatch.cpp (+83 −69)
@@ -45,7 +45,9 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
     }
     else if (scalar == 1) {
         // Special case for division by 1
-        memcpy(dst, src, len * sizeof(T));
+        if (src != dst) {
+            std::memcpy(dst, src, len * sizeof(T));
+        }
     }
     else if (scalar == static_cast<T>(-1)) {
         const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
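A note on the src != dst guard added above: std::memcpy requires that its source and destination buffers not overlap, so calling it with src == dst (an in-place divide by 1) is undefined behavior even though no bytes would change. A minimal sketch of the pattern, with a hypothetical helper name:

    #include <cstddef>
    #include <cstring>

    // Sketch: skip the copy when the operation is in-place, since
    // memcpy's contract forbids overlapping (here: identical) buffers.
    template <typename T>
    void copy_unless_inplace(const T* src, T* dst, std::size_t len) {
        if (src != dst) {
            std::memcpy(dst, src, len * sizeof(T));
        }
        // else: dividing by 1 in place leaves dst correct as-is
    }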
@@ -59,54 +61,48 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
                 raise_overflow = true;
             }
         }
-        if (i < static_cast<size_t>(len)) {
-            const size_t num = len - i;
-            const auto vec_src = hn::LoadN(d, src + i, num);
-            const auto is_min_val = hn::Eq(vec_src, vec_min_val);
-            const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
-            hn::StoreN(vec_res, d, dst + i, num);
-            if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            T val = src[i];
+            if (val == std::numeric_limits<T>::min()) {
+                dst[i] = std::numeric_limits<T>::min();
                 raise_overflow = true;
+            } else {
+                dst[i] = -val;
             }
         }
     }
     else {
+        // General case with floor division semantics
         const auto vec_scalar = hn::Set(d, scalar);
-        const auto zero = hn::Zero(d);
+        const auto vec_zero = hn::Zero(d);
         size_t i = 0;
+
         for (; i + N <= static_cast<size_t>(len); i += N) {
             const auto vec_src = hn::LoadU(d, src + i);
-            auto vec_res = hn::Div(vec_src, vec_scalar);
-            const auto vec_mul = hn::Mul(vec_res, vec_scalar);
-            const auto remainder_check = hn::Ne(vec_src, vec_mul);
-            const auto vec_nsign_src = hn::Lt(vec_src, zero);
-            const auto vec_nsign_scalar = hn::Lt(vec_scalar, zero);
-            const auto diff_sign = hn::Xor(vec_nsign_src, vec_nsign_scalar);
-            vec_res = hn::IfThenElse(
-                hn::And(remainder_check, diff_sign),
-                hn::Sub(vec_res, hn::Set(d, 1)),
-                vec_res
-            );
-            hn::StoreU(vec_res, d, dst + i);
+            auto vec_div = hn::Div(vec_src, vec_scalar);
+            const auto vec_mul = hn::Mul(vec_div, vec_scalar);
+            const auto has_remainder = hn::Ne(vec_src, vec_mul);
+            const auto src_sign = hn::Lt(vec_src, vec_zero);
+            const auto scalar_sign = hn::Lt(vec_scalar, vec_zero);
+            const auto different_signs = hn::Xor(src_sign, scalar_sign);
+
+            auto adjustment = hn::And(different_signs, has_remainder);
+            vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))), vec_div);
+
+            hn::StoreU(vec_div, d, dst + i);
         }
-        if (i < static_cast<size_t>(len)) {
-            const size_t num = len - i;
-            const auto vec_src = hn::LoadN(d, src + i, num);
-            auto vec_res = hn::Div(vec_src, vec_scalar);
-            const auto vec_mul = hn::Mul(vec_res, vec_scalar);
-            const auto remainder_check = hn::Ne(vec_src, vec_mul);
-            const auto vec_nsign_src = hn::Lt(vec_src, zero);
-            const auto vec_nsign_scalar = hn::Lt(vec_scalar, zero);
-            const auto diff_sign = hn::Xor(vec_nsign_src, vec_nsign_scalar);
-            vec_res = hn::IfThenElse(
-                hn::And(remainder_check, diff_sign),
-                hn::Sub(vec_res, hn::Set(d, 1)),
-                vec_res
-            );
-            hn::StoreN(vec_res, d, dst + i, num);
+
+        // Handle remaining elements with scalar code
+        for (; i < static_cast<size_t>(len); i++) {
+            T n = src[i];
+            T r = n / scalar;
+            if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) {
+                r--;
+            }
+            dst[i] = r;
         }
     }
-
     set_float_status(raise_overflow, raise_divbyzero);
 }

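Note on the floor-division adjustment above: C++ integer division truncates toward zero, while NumPy follows Python and rounds toward negative infinity. When the operands have different signs and the division is inexact, the truncated quotient is one too large, so both the SIMD path and the scalar tail subtract 1 in that case. A self-contained sketch of the same rule (hypothetical function name, not from this file):

    #include <cassert>

    // Floor division for signed integers, mirroring the scalar tail above:
    // truncate, then round down when signs differ and a remainder exists.
    template <typename T>
    T floor_divide(T n, T d) {
        T q = n / d;                               // truncates toward zero
        if (((n < 0) != (d < 0)) && (q * d != n)) {
            q -= 1;                                // push down to the floor
        }
        return q;
    }

    int main() {
        assert(floor_divide(-7, 2) == -4);  // plain C++ -7 / 2 gives -3
        assert(floor_divide(7, -2) == -4);
        assert(floor_divide(-8, 2) == -4);  // exact division: no adjustment
        assert(floor_divide(7, 2) == 3);    // same signs: no adjustment
    }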
@@ -126,7 +122,9 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
     }
     else if (scalar == 1) {
         // Special case for division by 1
-        memcpy(dst, src, len * sizeof(T));
+        if (src != dst) {
+            std::memcpy(dst, src, len * sizeof(T));
+        }
     }
     else {
         const auto vec_scalar = hn::Set(d, scalar);
@@ -136,11 +134,9 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
             const auto vec_res = hn::Div(vec_src, vec_scalar);
             hn::StoreU(vec_res, d, dst + i);
         }
-        if (i < static_cast<size_t>(len)) {
-            const size_t num = len - i;
-            const auto vec_src = hn::LoadN(d, src + i, num);
-            const auto vec_res = hn::Div(vec_src, vec_scalar);
-            hn::StoreN(vec_res, d, dst + i, num);
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            dst[i] = src[i] / scalar;
         }
     }

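Both remainder loops in this commit follow the same shape: the main loop consumes full N-lane vectors, and a plain scalar loop finishes the final len % N elements, replacing the masked LoadN/StoreN tail used before. A generic sketch of that structure (all names hypothetical):

    #include <cstddef>

    // Sketch: process full SIMD-width blocks, then finish with a scalar tail.
    // vec_op / scalar_op stand in for the Highway kernel and its scalar twin.
    template <typename T, typename VecOp, typename ScalarOp>
    void blocked_apply(const T* src, T* dst, std::size_t len, std::size_t N,
                       VecOp vec_op, ScalarOp scalar_op) {
        std::size_t i = 0;
        for (; i + N <= len; i += N) {      // whole vectors only
            vec_op(src + i, dst + i);
        }
        for (; i < len; i++) {              // at most N - 1 leftovers
            dst[i] = scalar_op(src[i]);
        }
    }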
@@ -185,8 +181,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
         }
         *reinterpret_cast<T*>(iop1) = io1;
         return;
-    }
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
+    }
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
         *reinterpret_cast<T*>(args[1]) != 0)
     {
         bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]);
@@ -204,7 +200,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
         const T dividend = *reinterpret_cast<T*>(ip1);
         const T divisor = *reinterpret_cast<T*>(ip2);
         T* result = reinterpret_cast<T*>(op1);
-
+
         if (HWY_UNLIKELY(divisor == 0)) {
             npy_set_floatstatus_divbyzero();
             *result = 0;
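On the divisor == 0 branch visible above: rather than letting the CPU trap, the loop writes 0 to the output and records the condition through the floating-point status word, which NumPy later surfaces as a divide-by-zero RuntimeWarning. A sketch of that convention (hypothetical helper; a plain flag stands in for the real status call):

    // Sketch: NumPy-style integer division by zero -> result 0 plus a flag,
    // instead of a hardware exception.
    template <typename T>
    T div_or_zero(T dividend, T divisor, bool& raise_divbyzero) {
        if (divisor == 0) {
            raise_divbyzero = true;  // stands in for npy_set_floatstatus_divbyzero()
            return static_cast<T>(0);
        }
        return dividend / divisor;
    }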
@@ -233,7 +229,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
         *reinterpret_cast<T*>(iop1) = io1;
         return;
     }
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
         *reinterpret_cast<T*>(args[1]) != 0)
     {
         bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]);
@@ -245,7 +241,6 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
             return;
         }
     }
-
     // Fallback for non-blockable, in-place, or zero divisor cases
     BINARY_LOOP {
         const T in1 = *reinterpret_cast<T*>(ip1);
@@ -261,46 +256,48 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
 
 // Indexed division for signed integers
 template <typename T>
-int TYPE_divide_indexed(char * const*args, npy_intp const *dimensions,
+int TYPE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                        char * const*args, npy_intp const *dimensions,
                         npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) {
     char *ip1 = args[0];
     char *indxp = args[1];
     char *value = args[2];
     npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
     npy_intp shape = steps[3];
     npy_intp n = dimensions[0];
-
+
     for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) {
         npy_intp indx = *(npy_intp *)indxp;
         if (indx < 0) {
            indx += shape;
        }
-        T* indexed = reinterpret_cast<T*>(ip1 + is1 * indx);
-        T divisor = *reinterpret_cast<T*>(value);
+        T* indexed = (T*)(ip1 + is1 * indx);
+        T divisor = *(T*)value;
        *indexed = floor_div(*indexed, divisor);
    }
    return 0;
 }
 
 // Indexed division for unsigned integers
 template <typename T>
-int TYPE_divide_unsigned_indexed(char * const*args, npy_intp const *dimensions,
+int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                                 char * const*args, npy_intp const *dimensions,
                                  npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) {
     char *ip1 = args[0];
     char *indxp = args[1];
     char *value = args[2];
     npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
     npy_intp shape = steps[3];
     npy_intp n = dimensions[0];
-
+
     for(npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) {
         npy_intp indx = *(npy_intp *)indxp;
         if (indx < 0) {
             indx += shape;
         }
-        T* indexed = reinterpret_cast<T*>(ip1 + is1 * indx);
-        T divisor = *reinterpret_cast<T*>(value);
-
+        T* indexed = (T*)(ip1 + is1 * indx);
+        T divisor = *(T*)value;
+
         if (HWY_UNLIKELY(divisor == 0)) {
             npy_set_floatstatus_divbyzero();
             *indexed = 0;
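The indexed loops above wrap negative indices the way NumPy's fancy indexing does: an index of -1 addresses the last element along the axis, so negative values are shifted by the axis length before the pointer arithmetic. In isolation (hypothetical helper, std::ptrdiff_t standing in for npy_intp):

    #include <cstddef>

    // Sketch: Python-style negative index normalization, as in the loops above.
    std::ptrdiff_t wrap_index(std::ptrdiff_t indx, std::ptrdiff_t shape) {
        if (indx < 0) {
            indx += shape;  // e.g. indx = -1, shape = 5  ->  4 (last element)
        }
        return indx;
    }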
@@ -317,17 +314,26 @@ int TYPE_divide_unsigned_indexed(char * const*args, npy_intp const *dimensions,
         TYPE_divide<SCALAR_TYPE>(args, dimensions, steps, func); \
     } \
     NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \
-        return TYPE_divide_indexed<SCALAR_TYPE>(args, dimensions, steps, func); \
+        return TYPE_divide_indexed<SCALAR_TYPE>(context, args, dimensions, steps, func); \
     } \
 } // extern "C"
 
 
 #ifdef NPY_CPU_DISPATCH_CURFX
-DEFINE_DIVIDE_FUNCTION(BYTE, int8_t)
-DEFINE_DIVIDE_FUNCTION(SHORT, int16_t)
-DEFINE_DIVIDE_FUNCTION(INT, int32_t)
-DEFINE_DIVIDE_FUNCTION(LONG, int64_t)
-DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
+// On Linux and macOS (LP64 model), long is 64 bits, but on 32-bit Windows (LLP64 model), long is 32 bits. Meanwhile, long long is guaranteed at least 64 bits
+#if (defined(_WIN32) && !defined(_WIN64)) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__))))
+DEFINE_DIVIDE_FUNCTION(BYTE, int8_t)
+DEFINE_DIVIDE_FUNCTION(SHORT, int16_t)
+DEFINE_DIVIDE_FUNCTION(INT, int32_t)
+DEFINE_DIVIDE_FUNCTION(LONG, int32_t) // LONG is 32-bit on 32-bit platforms
+DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
+#else
+DEFINE_DIVIDE_FUNCTION(BYTE, int8_t)
+DEFINE_DIVIDE_FUNCTION(SHORT, int16_t)
+DEFINE_DIVIDE_FUNCTION(INT, int32_t)
+DEFINE_DIVIDE_FUNCTION(LONG, int64_t) // LONG is 64-bit on 64-bit platforms
+DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
+#endif
 #endif
 
 #define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \
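For context on the data-model comment above: long is 32 bits on Windows regardless of pointer width (ILP32 on 32-bit, LLP64 on 64-bit) and 64 bits on LP64 Unix systems, while long long is at least 64 bits everywhere. A compile-time check of the underlying type sizes could complement such an #if ladder; one possible sketch (not part of the commit, and __LP64__ is a GCC/Clang convention):

    #include <climits>

    // Sketch: assert the integer-width facts the dispatch ladder relies on.
    static_assert(sizeof(long long) * CHAR_BIT >= 64,
                  "long long must be at least 64 bits");
    #if defined(_WIN32) || !defined(__LP64__)
    static_assert(sizeof(long) * CHAR_BIT == 32, "expected 32-bit long here");
    #else
    static_assert(sizeof(long) * CHAR_BIT == 64, "expected 64-bit long here");
    #endif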
@@ -336,16 +342,24 @@ DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
         TYPE_divide_unsigned<SCALAR_TYPE>(args, dimensions, steps, func); \
     } \
     NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \
-        return TYPE_divide_unsigned_indexed<SCALAR_TYPE>(args, dimensions, steps, func); \
+        return TYPE_divide_unsigned_indexed<SCALAR_TYPE>(context, args, dimensions, steps, func); \
     } \
 }
 
 #ifdef NPY_CPU_DISPATCH_CURFX
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
+#if (defined(_WIN32) && !defined(_WIN64)) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__))))
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t) // ULONG is 32-bit on 32-bit platforms
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
+#else
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t) // ULONG is 64-bit on 64-bit platforms
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
+#endif
 #endif
 
 #undef DEFINE_DIVIDE_FUNCTION
