Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 3d63e18

Browse files
committed
ENH, SIMD: move auto-vectorized inner functions to new dispatchable source
1 parent 866f41a commit 3d63e18

6 files changed

Lines changed: 182 additions & 195 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ numpy/core/src/umath/loops_hyperbolic.dispatch.c
229229
numpy/core/src/umath/loops_modulo.dispatch.c
230230
numpy/core/src/umath/loops_comparison.dispatch.c
231231
numpy/core/src/umath/loops_unary_complex.dispatch.c
232+
numpy/core/src/umath/loops_autovec.dispatch.c
232233
# multiarray module
233234
numpy/core/src/multiarray/argfunc.dispatch.c
234235
numpy/core/src/multiarray/arraytypes.h

numpy/core/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,7 @@ src_umath = [
744744
src_file.process('src/umath/loops_unary_fp.dispatch.c.src'),
745745
src_file.process('src/umath/loops_unary_fp_le.dispatch.c.src'),
746746
src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
747+
src_file.process('src/umath/loops_autovec_int.dispatch.c.src'),
747748
src_file.process('src/umath/matmul.c.src'),
748749
src_file.process('src/umath/matmul.h.src'),
749750
'src/umath/ufunc_type_resolution.c',

numpy/core/src/umath/fast_loop_macros.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,19 @@
1212

1313
#include <assert.h>
1414

15+
#include "simd/simd.h"
16+
17+
/*
18+
* largest simd vector size in bytes numpy supports
19+
* it is currently an extremely large value as it is only used for memory
20+
* overlap checks
21+
*/
22+
#if NPY_SIMD > 0
23+
// Enough for compiler unroll
24+
#define AUTOVEC_OVERLAP_SIZE NPY_SIMD_WIDTH*4
25+
#else
26+
#define AUTOVEC_OVERLAP_SIZE 1024
27+
#endif
1528
/*
1629
* MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
1730
* Very large step size can be as slow as processing it using scalar. The
@@ -219,11 +232,11 @@ abs_ptrdiff(char *a, char *b)
219232
/* condition allows compiler to optimize the generic macro */ \
220233
if (IS_BINARY_CONT(tin, tout)) { \
221234
if (abs_ptrdiff(args[2], args[0]) == 0 && \
222-
abs_ptrdiff(args[2], args[1]) >= NPY_MAX_SIMD_SIZE) { \
235+
abs_ptrdiff(args[2], args[1]) >= AUTOVEC_OVERLAP_SIZE) { \
223236
BASE_BINARY_LOOP_INP(tin, tout, op) \
224237
} \
225238
else if (abs_ptrdiff(args[2], args[1]) == 0 && \
226-
abs_ptrdiff(args[2], args[0]) >= NPY_MAX_SIMD_SIZE) { \
239+
abs_ptrdiff(args[2], args[0]) >= AUTOVEC_OVERLAP_SIZE) { \
227240
BASE_BINARY_LOOP_INP(tin, tout, op) \
228241
} \
229242
else { \

numpy/core/src/umath/loops.c.src

Lines changed: 5 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,6 @@
3232
*/
3333
#define PW_BLOCKSIZE 128
3434

35-
36-
/*
37-
* largest simd vector size in bytes numpy supports
38-
* it is currently an extremely large value as it is only used for memory
39-
* overlap checks
40-
*/
41-
#ifndef NPY_MAX_SIMD_SIZE
42-
#define NPY_MAX_SIMD_SIZE 1024
43-
#endif
44-
4535
/** Provides the various *_LOOP macros */
4636
#include "fast_loop_macros.h"
4737

@@ -474,74 +464,15 @@ NPY_NO_EXPORT void
474464
}
475465

476466
/**begin repeat1
477-
* #isa = , _avx2#
478-
* #CHK = 1, defined(HAVE_ATTRIBUTE_TARGET_AVX2)#
479-
* #ATTR = , NPY_GCC_TARGET_AVX2#
480-
*/
481-
482-
#if @CHK@
483-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
484-
@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
485-
{
486-
UNARY_LOOP_FAST(@type@, @type@, *out = in * in);
487-
}
488-
#endif
489-
490-
#if @CHK@
491-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
492-
@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
493-
{
494-
UNARY_LOOP_FAST(@type@, @type@, *out = 1.0 / in);
495-
}
496-
#endif
497-
498-
#if @CHK@
499-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
500-
@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
501-
{
502-
UNARY_LOOP_FAST(@type@, @type@, *out = in);
503-
}
504-
#endif
505-
506-
#if @CHK@
507-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
508-
@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
509-
{
510-
UNARY_LOOP_FAST(@type@, npy_bool, *out = !in);
511-
}
512-
#endif
513-
514-
#if @CHK@
515-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
516-
@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
517-
{
518-
UNARY_LOOP_FAST(@type@, @type@, *out = ~in);
519-
}
520-
#endif
521-
522-
/**begin repeat2
523467
* Arithmetic
524468
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor#
525469
* #OP = +, -, *, &, |, ^#
526470
*/
527471

528-
#if @CHK@
529-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
530-
@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions,
531-
npy_intp const *steps, void *NPY_UNUSED(func))
532-
{
533-
if (IS_BINARY_REDUCE) {
534-
BINARY_REDUCE_LOOP_FAST(@type@, io1 @OP@= in2);
535-
}
536-
else {
537-
BINARY_LOOP_FAST(@type@, @type@, *out = in1 @OP@ in2);
538-
}
539-
}
540-
541-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
542-
@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
543-
char **args, npy_intp const *dimensions, npy_intp const *steps,
544-
void *NPY_UNUSED(func))
472+
NPY_NO_EXPORT NPY_GCC_OPT_3 int
473+
@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
474+
char **args, npy_intp const *dimensions, npy_intp const *steps,
475+
void *NPY_UNUSED(func))
545476
{
546477
char *ip1 = args[0];
547478
char *indx = args[1];
@@ -556,86 +487,6 @@ NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ int
556487
}
557488
return 0;
558489
}
559-
560-
#endif
561-
562-
/**end repeat2**/
563-
564-
/*
565-
* Arithmetic bit shift operations.
566-
*
567-
* Intel hardware masks bit shift values, so large shifts wrap around
568-
* and can produce surprising results. The special handling ensures that
569-
* behavior is independent of compiler or hardware.
570-
* TODO: We could implement consistent behavior for negative shifts,
571-
* which is undefined in C.
572-
*/
573-
574-
#define INT_left_shift_needs_clear_floatstatus
575-
#define UINT_left_shift_needs_clear_floatstatus
576-
577-
#if @CHK@
578-
NPY_NO_EXPORT NPY_GCC_OPT_3 void
579-
@TYPE@_left_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
580-
void *NPY_UNUSED(func))
581-
{
582-
BINARY_LOOP_FAST(@type@, @type@, *out = npy_lshift@c@(in1, in2));
583-
584-
#ifdef @TYPE@_left_shift_needs_clear_floatstatus
585-
// For some reason, our macOS CI sets an "invalid" flag here, but only
586-
// for some types.
587-
npy_clear_floatstatus_barrier((char*)dimensions);
588-
#endif
589-
}
590-
#endif
591-
592-
#undef INT_left_shift_needs_clear_floatstatus
593-
#undef UINT_left_shift_needs_clear_floatstatus
594-
595-
#if @CHK@
596-
NPY_NO_EXPORT
597-
#ifndef NPY_DO_NOT_OPTIMIZE_@TYPE@_right_shift
598-
NPY_GCC_OPT_3
599-
#endif
600-
void
601-
@TYPE@_right_shift@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps,
602-
void *NPY_UNUSED(func))
603-
{
604-
BINARY_LOOP_FAST(@type@, @type@, *out = npy_rshift@c@(in1, in2));
605-
}
606-
#endif
607-
608-
/**begin repeat2
609-
* #kind = logical_and, logical_or#
610-
* #OP = &&, ||#
611-
*/
612-
613-
#if @CHK@
614-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
615-
@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
616-
{
617-
/*
618-
* gcc vectorization of this is not good (PR60575) but manual integer
619-
* vectorization is too tedious to be worthwhile
620-
*/
621-
BINARY_LOOP_FAST(@type@, npy_bool, *out = in1 @OP@ in2);
622-
}
623-
#endif
624-
625-
/**end repeat2**/
626-
627-
#if @CHK@
628-
NPY_NO_EXPORT NPY_GCC_OPT_3 @ATTR@ void
629-
@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
630-
{
631-
BINARY_LOOP {
632-
const int t1 = !!*(@type@ *)ip1;
633-
const int t2 = !!*(@type@ *)ip2;
634-
*((npy_bool *)op1) = (t1 != t2);
635-
}
636-
}
637-
#endif
638-
639490
/**end repeat1**/
640491

641492
NPY_NO_EXPORT void
@@ -1714,7 +1565,7 @@ HALF_@kind@_indexed(void *NPY_UNUSED(context),
17141565
const float v = npy_half_to_float(*(npy_half *)value);
17151566
*indexed = npy_float_to_half(npy_half_to_float(*indexed) @OP@ v);
17161567
}
1717-
return 0;
1568+
return 0;
17181569
}
17191570
/**end repeat**/
17201571

numpy/core/src/umath/loops.h.src

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,25 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
123123
/**end repeat1**/
124124
/**end repeat**/
125125

126+
127+
#ifndef NPY_DISABLE_OPTIMIZATION
128+
#include "loops_autovec_int.dispatch.h"
129+
#endif
130+
/**begin repeat
131+
* #TYPE = UBYTE, USHORT, UINT, ULONG, ULONGLONG,
132+
BYTE, SHORT, INT, LONG, LONGLONG#
133+
*/
134+
/**begin repeat1
135+
* #kind = invert, logical_not, conjugate, reciprocal, square, add,
136+
* subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
137+
* left_shift, right_shift, logical_and, logical_or,
138+
* logical_xor#
139+
*/
140+
NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
141+
(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
142+
/**end repeat1**/
143+
/**end repeat**/
144+
126145
/**begin repeat
127146
* #TYPE = BYTE, SHORT, INT, LONG, LONGLONG#
128147
*/
@@ -132,7 +151,6 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
132151
* #s = , u#
133152
* #S = , U#
134153
*/
135-
136154
#define @S@@TYPE@_floor_divide @S@@TYPE@_divide
137155
#define @S@@TYPE@_floor_divide_indexed @S@@TYPE@_divide_indexed
138156
#define @S@@TYPE@_fmax @S@@TYPE@_maximum
@@ -147,49 +165,15 @@ NPY_NO_EXPORT void
147165
@S@@TYPE@_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
148166

149167
/**begin repeat2
150-
* #isa = , _avx2#
151-
*/
152-
153-
NPY_NO_EXPORT void
154-
@S@@TYPE@_square@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
155-
156-
NPY_NO_EXPORT void
157-
@S@@TYPE@_reciprocal@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
158-
159-
NPY_NO_EXPORT void
160-
@S@@TYPE@_conjugate@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
161-
162-
NPY_NO_EXPORT void
163-
@S@@TYPE@_logical_not@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
164-
165-
NPY_NO_EXPORT void
166-
@S@@TYPE@_invert@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
167-
168-
/**begin repeat3
169168
* Arithmetic
170169
* #kind = add, subtract, multiply, bitwise_and, bitwise_or, bitwise_xor,
171170
* left_shift, right_shift#
172171
* #OP = +, -,*, &, |, ^, <<, >>#
173172
*/
174-
NPY_NO_EXPORT void
175-
@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
176-
177173
NPY_NO_EXPORT int
178-
@S@@TYPE@_@kind@@isa@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
174+
@S@@TYPE@_@kind@_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
175+
npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
179176

180-
/**end repeat3**/
181-
182-
/**begin repeat3
183-
* #kind = logical_and, logical_or#
184-
* #OP = &&, ||#
185-
*/
186-
NPY_NO_EXPORT void
187-
@S@@TYPE@_@kind@@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
188-
189-
/**end repeat3**/
190-
191-
NPY_NO_EXPORT void
192-
@S@@TYPE@_logical_xor@isa@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
193177
/**end repeat2**/
194178

195179
/**begin repeat2
@@ -217,16 +201,15 @@ NPY_NO_EXPORT void
217201

218202
NPY_NO_EXPORT void
219203
@S@@TYPE@_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
204+
/**end repeat2**/
220205

221206
/**begin repeat2
222207
* #kind = isnan, isinf, isfinite#
223208
**/
224209
NPY_NO_EXPORT void
225210
@S@@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
226211
/**end repeat2**/
227-
228212
/**end repeat1**/
229-
230213
/**end repeat**/
231214

232215

0 commit comments

Comments
 (0)