@@ -45,7 +45,9 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
     }
     else if (scalar == 1) {
         // Special case for division by 1
-        memcpy(dst, src, len * sizeof(T));
+        if (src != dst) {
+            std::memcpy(dst, src, len * sizeof(T));
+        }
     }
     else if (scalar == static_cast<T>(-1)) {
         const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
@@ -59,54 +61,48 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
                 raise_overflow = true;
             }
         }
-        if (i < static_cast<size_t>(len)) {
-            const size_t num = len - i;
-            const auto vec_src = hn::LoadN(d, src + i, num);
-            const auto is_min_val = hn::Eq(vec_src, vec_min_val);
-            const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
-            hn::StoreN(vec_res, d, dst + i, num);
-            if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            T val = src[i];
+            if (val == std::numeric_limits<T>::min()) {
+                dst[i] = std::numeric_limits<T>::min();
                 raise_overflow = true;
+            } else {
+                dst[i] = -val;
             }
         }
     }
     else {
+        // General case with floor division semantics
         const auto vec_scalar = hn::Set(d, scalar);
-        const auto zero = hn::Zero(d);
+        const auto vec_zero = hn::Zero(d);
         size_t i = 0;
+
         for (; i + N <= static_cast<size_t>(len); i += N) {
             const auto vec_src = hn::LoadU(d, src + i);
-            auto vec_res = hn::Div(vec_src, vec_scalar);
-            const auto vec_mul = hn::Mul(vec_res, vec_scalar);
-            const auto remainder_check = hn::Ne(vec_src, vec_mul);
-            const auto vec_nsign_src = hn::Lt(vec_src, zero);
-            const auto vec_nsign_scalar = hn::Lt(vec_scalar, zero);
-            const auto diff_sign = hn::Xor(vec_nsign_src, vec_nsign_scalar);
-            vec_res = hn::IfThenElse(
-                hn::And(remainder_check, diff_sign),
-                hn::Sub(vec_res, hn::Set(d, 1)),
-                vec_res
-            );
-            hn::StoreU(vec_res, d, dst + i);
+            auto vec_div = hn::Div(vec_src, vec_scalar);
+            const auto vec_mul = hn::Mul(vec_div, vec_scalar);
+            const auto has_remainder = hn::Ne(vec_src, vec_mul);
+            const auto src_sign = hn::Lt(vec_src, vec_zero);
+            const auto scalar_sign = hn::Lt(vec_scalar, vec_zero);
+            const auto different_signs = hn::Xor(src_sign, scalar_sign);
+
+            auto adjustment = hn::And(different_signs, has_remainder);
+            vec_div = hn::IfThenElse(adjustment, hn::Sub(vec_div, hn::Set(d, static_cast<T>(1))), vec_div);
+
+            hn::StoreU(vec_div, d, dst + i);
         }
-        if (i < static_cast<size_t>(len)) {
-            const size_t num = len - i;
-            const auto vec_src = hn::LoadN(d, src + i, num);
-            auto vec_res = hn::Div(vec_src, vec_scalar);
-            const auto vec_mul = hn::Mul(vec_res, vec_scalar);
-            const auto remainder_check = hn::Ne(vec_src, vec_mul);
-            const auto vec_nsign_src = hn::Lt(vec_src, zero);
-            const auto vec_nsign_scalar = hn::Lt(vec_scalar, zero);
-            const auto diff_sign = hn::Xor(vec_nsign_src, vec_nsign_scalar);
-            vec_res = hn::IfThenElse(
-                hn::And(remainder_check, diff_sign),
-                hn::Sub(vec_res, hn::Set(d, 1)),
-                vec_res
-            );
-            hn::StoreN(vec_res, d, dst + i, num);
+
+        // Handle remaining elements with scalar code
+        for (; i < static_cast<size_t>(len); i++) {
+            T n = src[i];
+            T r = n / scalar;
+            if (((n > 0) != (scalar > 0)) && ((r * scalar) != n)) {
+                r--;
+            }
+            dst[i] = r;
         }
     }
-
     set_float_status(raise_overflow, raise_divbyzero);
 }
 
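Note on the scalar tail loop in the general case above: C++ integer division truncates toward zero, while NumPy's floor division rounds toward negative infinity, so the quotient is decremented by one when the operands have opposite signs and the division is inexact. A minimal standalone sketch of that adjustment (floor_div_scalar is a name used here only for illustration; divisor == 0 and the min()/-1 case are assumed to be handled by the earlier branches):

    // Floor division for signed T, mirroring the tail loop above.
    // Example: -7 / 2 truncates to -3; the sign/remainder check turns it into -4.
    template <typename T>
    T floor_div_scalar(T n, T d) {
        T r = n / d;
        if (((n > 0) != (d > 0)) && ((r * d) != n)) {
            r--;
        }
        return r;
    }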
@@ -126,7 +122,9 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
     }
     else if (scalar == 1) {
         // Special case for division by 1
-        memcpy(dst, src, len * sizeof(T));
+        if (src != dst) {
+            std::memcpy(dst, src, len * sizeof(T));
+        }
     }
     else {
         const auto vec_scalar = hn::Set(d, scalar);
@@ -136,11 +134,9 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
             const auto vec_res = hn::Div(vec_src, vec_scalar);
             hn::StoreU(vec_res, d, dst + i);
         }
-        if (i < static_cast<size_t>(len)) {
-            const size_t num = len - i;
-            const auto vec_src = hn::LoadN(d, src + i, num);
-            const auto vec_res = hn::Div(vec_src, vec_scalar);
-            hn::StoreN(vec_res, d, dst + i, num);
+        // Handle remaining elements
+        for (; i < static_cast<size_t>(len); i++) {
+            dst[i] = src[i] / scalar;
         }
     }
 
@@ -185,8 +181,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
         }
         *reinterpret_cast<T*>(iop1) = io1;
         return;
-    }
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
+    }
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
         *reinterpret_cast<T*>(args[1]) != 0)
     {
         bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]);
@@ -204,7 +200,7 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
         const T dividend = *reinterpret_cast<T*>(ip1);
         const T divisor = *reinterpret_cast<T*>(ip2);
         T* result = reinterpret_cast<T*>(op1);
-
+
         if (HWY_UNLIKELY(divisor == 0)) {
             npy_set_floatstatus_divbyzero();
             *result = 0;
@@ -233,7 +229,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
         *reinterpret_cast<T*>(iop1) = io1;
         return;
     }
-    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(T), NPY_SIMD_WIDTH) &&
         *reinterpret_cast<T*>(args[1]) != 0)
    {
         bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]);
@@ -245,7 +241,6 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
             return;
         }
     }
-
     // Fallback for non-blockable, in-place, or zero divisor cases
     BINARY_LOOP {
         const T in1 = *reinterpret_cast<T*>(ip1);
@@ -261,46 +256,48 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
 
 // Indexed division for signed integers
 template <typename T>
-int TYPE_divide_indexed(char * const *args, npy_intp const *dimensions,
+int TYPE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                        char * const *args, npy_intp const *dimensions,
                         npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) {
     char *ip1 = args[0];
     char *indxp = args[1];
     char *value = args[2];
     npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
     npy_intp shape = steps[3];
     npy_intp n = dimensions[0];
-
+
     for (npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) {
         npy_intp indx = *(npy_intp *)indxp;
         if (indx < 0) {
             indx += shape;
         }
-        T* indexed = reinterpret_cast<T*>(ip1 + is1 * indx);
-        T divisor = *reinterpret_cast<T*>(value);
+        T* indexed = (T*)(ip1 + is1 * indx);
+        T divisor = *(T*)value;
         *indexed = floor_div(*indexed, divisor);
     }
     return 0;
 }
 
 // Indexed division for unsigned integers
 template <typename T>
-int TYPE_divide_unsigned_indexed(char * const *args, npy_intp const *dimensions,
+int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                                 char * const *args, npy_intp const *dimensions,
                                  npy_intp const *steps, NpyAuxData *NPY_UNUSED(func)) {
     char *ip1 = args[0];
     char *indxp = args[1];
     char *value = args[2];
     npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
     npy_intp shape = steps[3];
     npy_intp n = dimensions[0];
-
+
     for (npy_intp i = 0; i < n; i++, indxp += isindex, value += isb) {
         npy_intp indx = *(npy_intp *)indxp;
         if (indx < 0) {
             indx += shape;
         }
-        T* indexed = reinterpret_cast<T*>(ip1 + is1 * indx);
-        T divisor = *reinterpret_cast<T*>(value);
-
+        T* indexed = (T*)(ip1 + is1 * indx);
+        T divisor = *(T*)value;
+
         if (HWY_UNLIKELY(divisor == 0)) {
             npy_set_floatstatus_divbyzero();
             *indexed = 0;
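The signed indexed loop above calls floor_div, whose definition is not part of this diff. Assuming it follows the same pattern as the scalar fallback paths visible elsewhere in this file, a plausible sketch looks like the following (illustrative only; the actual helper may differ):

    // Hypothetical outline of floor_div for signed T: guard division by zero
    // and the min()/-1 overflow, then do truncating division with a floor fix-up.
    template <typename T>
    T floor_div(T n, T d) {
        if (HWY_UNLIKELY(d == 0)) {
            npy_set_floatstatus_divbyzero();
            return 0;
        }
        if (HWY_UNLIKELY(n == std::numeric_limits<T>::min() && d == static_cast<T>(-1))) {
            npy_set_floatstatus_overflow();
            return std::numeric_limits<T>::min();
        }
        T r = n / d;
        if (((n > 0) != (d > 0)) && ((r * d) != n)) {
            r--;
        }
        return r;
    }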
@@ -317,17 +314,26 @@ int TYPE_divide_unsigned_indexed(char * const*args, npy_intp const *dimensions,
         TYPE_divide<SCALAR_TYPE>(args, dimensions, steps, func); \
     } \
     NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \
-        return TYPE_divide_indexed<SCALAR_TYPE>(args, dimensions, steps, func); \
+        return TYPE_divide_indexed<SCALAR_TYPE>(context, args, dimensions, steps, func); \
     } \
 } // extern "C"
 
 
 #ifdef NPY_CPU_DISPATCH_CURFX
-DEFINE_DIVIDE_FUNCTION(BYTE, int8_t)
-DEFINE_DIVIDE_FUNCTION(SHORT, int16_t)
-DEFINE_DIVIDE_FUNCTION(INT, int32_t)
-DEFINE_DIVIDE_FUNCTION(LONG, int64_t)
-DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
+// On Linux and macOS (LP64 model), long is 64 bits, but on 32-bit Windows (LLP64 model), long is 32 bits. Meanwhile, long long is guaranteed at least 64 bits.
+#if (defined(_WIN32) && !defined(_WIN64)) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__))))
+DEFINE_DIVIDE_FUNCTION(BYTE, int8_t)
+DEFINE_DIVIDE_FUNCTION(SHORT, int16_t)
+DEFINE_DIVIDE_FUNCTION(INT, int32_t)
+DEFINE_DIVIDE_FUNCTION(LONG, int32_t)  // LONG is 32-bit on 32-bit platforms
+DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
+#else
+DEFINE_DIVIDE_FUNCTION(BYTE, int8_t)
+DEFINE_DIVIDE_FUNCTION(SHORT, int16_t)
+DEFINE_DIVIDE_FUNCTION(INT, int32_t)
+DEFINE_DIVIDE_FUNCTION(LONG, int64_t)  // LONG is 64-bit on 64-bit platforms
+DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
+#endif
 #endif
 
 #define DEFINE_DIVIDE_FUNCTION_UNSIGNED(TYPE, SCALAR_TYPE) \
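The preprocessor block above pins LONG to int32_t on the listed 32-bit targets and to int64_t everywhere else. A compile-time cross-check of the assumptions stated in that comment could catch a wrong mapping on a new platform; the static_asserts below are only an illustrative sketch, not part of the patch:

    // Illustrative cross-check (not in the patch): verify the comment's claims,
    // i.e. long long is at least 64 bits and long is either 32 or 64 bits.
    #include <climits>
    static_assert(sizeof(long long) * CHAR_BIT >= 64, "long long must be at least 64 bits");
    static_assert(sizeof(long) * CHAR_BIT == 32 || sizeof(long) * CHAR_BIT == 64,
                  "unexpected width of long");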
@@ -336,16 +342,24 @@ DEFINE_DIVIDE_FUNCTION(LONGLONG, int64_t)
         TYPE_divide_unsigned<SCALAR_TYPE>(args, dimensions, steps, func); \
     } \
     NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(TYPE##_divide_indexed)(PyArrayMethod_Context *context, char * const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *func) { \
-        return TYPE_divide_unsigned_indexed<SCALAR_TYPE>(args, dimensions, steps, func); \
+        return TYPE_divide_unsigned_indexed<SCALAR_TYPE>(context, args, dimensions, steps, func); \
     } \
 }
 
 #ifdef NPY_CPU_DISPATCH_CURFX
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t)
-DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
+#if (defined(_WIN32) && !defined(_WIN64)) || defined(__EMSCRIPTEN__) || (defined(__arm__) && !defined(__aarch64__)) || (defined(__linux__) && ((defined(__i386__) || defined(__i686__))))
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint32_t)  // ULONG is 32-bit on 32-bit platforms
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
+#else
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UBYTE, uint8_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(USHORT, uint16_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(UINT, uint32_t)
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONG, uint64_t)  // ULONG is 64-bit on 64-bit platforms
+DEFINE_DIVIDE_FUNCTION_UNSIGNED(ULONGLONG, uint64_t)
+#endif
 #endif
 
 #undef DEFINE_DIVIDE_FUNCTION