Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7c35c37

Browse files
committed
LoongArch: modify LSX optimization (PR 25215) for newest branch
1 parent 35b2c4a commit 7c35c37

14 files changed

Lines changed: 110 additions & 15 deletions

File tree

meson.options

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ option('test-simd', type: 'array',
3535
'VSX', 'VSX2', 'VSX3', 'VSX4',
3636
'NEON', 'ASIMD',
3737
'VX', 'VXE', 'VXE2',
38+
'LSX',
3839
],
3940
description: 'Specify a list of CPU features to be tested against NumPy SIMD interface')
4041
option('test-simd-args', type: 'string', value: '',

meson_cpu/loongarch64/meson.build

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
source_root = meson.project_source_root()
2+
mod_features = import('features')
3+
4+
LSX = mod_features.new(
5+
'LSX', 1, args: ['-mlsx'],
6+
test_code: files(source_root + '/numpy/distutils/checks/cpu_lsx.c')[0]
7+
)
8+
LOONGARCH64_FEATURES = {'LSX': LSX}

meson_cpu/main_config.h.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,4 +389,8 @@
389389
#ifdef @P@HAVE_RVV
390390
#include <riscv_vector.h>
391391
#endif
392+
393+
#ifdef @P@HAVE_LSX
394+
#include <lsxintrin.h>
395+
#endif
392396
#endif // @P@_CPU_DISPATCHER_CONF_H_

meson_cpu/meson.build

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,15 @@ subdir('ppc64')
7676
subdir('s390x')
7777
subdir('arm')
7878
subdir('riscv64')
79+
subdir('loongarch64')
7980

8081
CPU_FEATURES = {}
8182
CPU_FEATURES += ARM_FEATURES
8283
CPU_FEATURES += X86_FEATURES
8384
CPU_FEATURES += PPC64_FEATURES
8485
CPU_FEATURES += S390X_FEATURES
8586
CPU_FEATURES += RV64_FEATURES
87+
CPU_FEATURES += LOONGARCH64_FEATURES
8688

8789
# Parse the requested baseline (CPU_CONF_BASELINE) and dispatch features
8890
# (CPU_CONF_DISPATCH).
@@ -97,6 +99,7 @@ min_features = {
9799
'aarch64': [ASIMD],
98100
'riscv64': [],
99101
'wasm32': [],
102+
'loongarch64': [LSX],
100103
}.get(cpu_family, [])
101104
if host_machine.endian() == 'little' and cpu_family == 'ppc64'
102105
min_features = [VSX2]
@@ -112,6 +115,7 @@ max_features_dict = {
112115
'aarch64': ARM_FEATURES,
113116
'riscv64': RV64_FEATURES,
114117
'wasm32': {},
118+
'loongarch64': LOONGARCH64_FEATURES,
115119
}.get(cpu_family, {})
116120
max_features = []
117121
foreach fet_name, fet_obj : max_features_dict

numpy/_core/include/numpy/npy_cpu.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@
109109
#elif __riscv_xlen == 32
110110
#define NPY_CPU_RISCV32
111111
#endif
112-
#elif defined(__loongarch__)
113-
#define NPY_CPU_LOONGARCH
112+
#elif defined(__loongarch64)
113+
#define NPY_CPU_LOONGARCH64
114114
#elif defined(__EMSCRIPTEN__)
115115
/* __EMSCRIPTEN__ is defined by emscripten: an LLVM-to-Web compiler */
116116
#define NPY_CPU_WASM

numpy/_core/meson.build

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ if use_svml
9797
endif
9898
endif
9999

100+
if host_machine.cpu_family() == 'loongarch64'
101+
add_project_arguments(['-DHWY_COMPILE_ONLY_SCALAR'], language: ['cpp'])
102+
endif
103+
100104
use_highway = not get_option('disable-highway')
101105
if use_highway and not fs.exists('src/highway/README.md')
102106
error('Missing the `highway` git submodule! Run `git submodule update --init` to fix this.')
@@ -880,6 +884,7 @@ foreach gen_mtargets : [
880884
ASIMD, NEON,
881885
VSX3, VSX2,
882886
VXE, VX,
887+
LSX,
883888
]
884889
],
885890
[
@@ -890,6 +895,7 @@ foreach gen_mtargets : [
890895
NEON,
891896
VSX4, VSX2,
892897
VX,
898+
LSX,
893899
]
894900
],
895901
[
@@ -900,6 +906,7 @@ foreach gen_mtargets : [
900906
VSX3, VSX2,
901907
NEON,
902908
VXE, VX,
909+
LSX,
903910
]
904911
],
905912
[
@@ -916,7 +923,8 @@ foreach gen_mtargets : [
916923
AVX512_SKX, [AVX2, FMA3],
917924
VSX4, VSX2,
918925
NEON_VFPV4,
919-
VXE
926+
VXE,
927+
LSX,
920928
]
921929
],
922930
[
@@ -927,6 +935,7 @@ foreach gen_mtargets : [
927935
AVX512_SKX, AVX2, SSE2,
928936
VSX2,
929937
VX,
938+
LSX,
930939
]
931940
],
932941
[
@@ -937,6 +946,7 @@ foreach gen_mtargets : [
937946
AVX512_SKX, AVX2, SSE2,
938947
VSX2,
939948
VXE, VX,
949+
LSX,
940950
]
941951
],
942952
[
@@ -954,6 +964,7 @@ foreach gen_mtargets : [
954964
VSX4, VSX3, VSX2,
955965
NEON_VFPV4,
956966
VXE2, VXE,
967+
LSX,
957968
]
958969
],
959970
[
@@ -968,7 +979,8 @@ foreach gen_mtargets : [
968979
ASIMD, NEON,
969980
AVX512_SKX, AVX2, SSE2,
970981
VSX2,
971-
VXE, VX
982+
VXE, VX,
983+
LSX,
972984
]
973985
],
974986
[
@@ -978,7 +990,8 @@ foreach gen_mtargets : [
978990
SSE41, SSE2,
979991
VSX2,
980992
ASIMD, NEON,
981-
VXE, VX
993+
VXE, VX,
994+
LSX,
982995
]
983996
],
984997
[
@@ -988,6 +1001,7 @@ foreach gen_mtargets : [
9881001
SSE41, SSE2,
9891002
VSX2,
9901003
ASIMD, NEON,
1004+
LSX,
9911005
]
9921006
],
9931007
[
@@ -998,6 +1012,7 @@ foreach gen_mtargets : [
9981012
ASIMD, NEON,
9991013
VSX3, VSX2,
10001014
VXE, VX,
1015+
LSX,
10011016
]
10021017
],
10031018
[
@@ -1008,6 +1023,7 @@ foreach gen_mtargets : [
10081023
NEON,
10091024
VSX2,
10101025
VX,
1026+
LSX,
10111027
]
10121028
],
10131029
]

numpy/_core/src/common/npy_cpu_features.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,8 @@ static struct {
125125
{NPY_CPU_FEATURE_ASIMDDP, "ASIMDDP"},
126126
{NPY_CPU_FEATURE_ASIMDFHM, "ASIMDFHM"},
127127
{NPY_CPU_FEATURE_SVE, "SVE"},
128-
{NPY_CPU_FEATURE_RVV, "RVV"}};
128+
{NPY_CPU_FEATURE_RVV, "RVV"},
129+
{NPY_CPU_FEATURE_LSX, "LSX"}};
129130

130131

131132
NPY_VISIBILITY_HIDDEN PyObject *
@@ -665,6 +666,25 @@ npy__cpu_init_features(void)
665666
npy__cpu_have[NPY_CPU_FEATURE_VX] = 1;
666667
}
667668

669+
/***************** LoongArch ******************/
670+
671+
#elif defined(__loongarch64)
672+
673+
#include <sys/auxv.h>
674+
#include <asm/hwcap.h>
675+
676+
static void
677+
npy__cpu_init_features(void)
678+
{
679+
memset(npy__cpu_have, 0, sizeof(npy__cpu_have[0]) * NPY_CPU_FEATURE_MAX);
680+
unsigned int hwcap = getauxval(AT_HWCAP);
681+
682+
if ((hwcap & HWCAP_LOONGARCH_LSX)) {
683+
npy__cpu_have[NPY_CPU_FEATURE_LSX] = 1;
684+
return;
685+
}
686+
}
687+
668688

669689
/***************** ARM ******************/
670690

numpy/_core/src/common/npy_cpu_features.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ enum npy_cpu_features
9191

9292
// IBM/ZARCH
9393
NPY_CPU_FEATURE_VX = 350,
94-
94+
9595
// Vector-Enhancements Facility 1
9696
NPY_CPU_FEATURE_VXE = 351,
9797

@@ -101,6 +101,9 @@ enum npy_cpu_features
101101
// RISC-V
102102
NPY_CPU_FEATURE_RVV = 400,
103103

104+
// LOONGARCH
105+
NPY_CPU_FEATURE_LSX = 500,
106+
104107
NPY_CPU_FEATURE_MAX
105108
};
106109

@@ -113,7 +116,7 @@ enum npy_cpu_features
113116
* - uses 'NPY_DISABLE_CPU_FEATURES' to disable dispatchable features
114117
* - uses 'NPY_ENABLE_CPU_FEATURES' to enable dispatchable features
115118
*
116-
* It will set a RuntimeError when
119+
* It will set a RuntimeError when
117120
* - CPU baseline features from the build are not supported at runtime
118121
* - 'NPY_DISABLE_CPU_FEATURES' tries to disable a baseline feature
119122
* - 'NPY_DISABLE_CPU_FEATURES' and 'NPY_ENABLE_CPU_FEATURES' are
@@ -122,14 +125,14 @@ enum npy_cpu_features
122125
* by the machine or build
123126
* - 'NPY_ENABLE_CPU_FEATURES' tries to enable a feature when the project was
124127
* not built with any feature optimization support
125-
*
128+
*
126129
* It will set an ImportWarning when:
127130
* - 'NPY_DISABLE_CPU_FEATURES' tries to disable a feature that is not supported
128131
* by the machine or build
129132
* - 'NPY_DISABLE_CPU_FEATURES' or 'NPY_ENABLE_CPU_FEATURES' tries to
130133
* disable/enable a feature when the project was not built with any feature
131134
* optimization support
132-
*
135+
*
133136
* return 0 on success otherwise return -1
134137
*/
135138
NPY_VISIBILITY_HIDDEN int

numpy/_core/src/common/simd/intdiv.h

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
216216
divisor.val[0] = npyv_setall_u8(m);
217217
divisor.val[1] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh1));
218218
divisor.val[2] = npyv_reinterpret_u8_s8(npyv_setall_s8(-sh2));
219+
#elif defined(NPY_HAVE_LSX)
220+
divisor.val[0] = npyv_setall_u16(m);
221+
divisor.val[1] = npyv_setall_u8(sh1);
222+
divisor.val[2] = npyv_setall_u8(sh2);
219223
#else
220224
#error "please initialize the shifting operand for the new architecture"
221225
#endif
@@ -225,7 +229,7 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d)
225229
NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d);
226230
NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d)
227231
{
228-
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
232+
#if defined(NPY_HAVE_SSE2) || defined(NPY_HAVE_LSX) // SSE/AVX2/AVX512/LSX
229233
npyv_s16x3 p = npyv_divisor_s16(d);
230234
npyv_s8x3 r;
231235
r.val[0] = npyv_reinterpret_s8_s16(p.val[0]);
@@ -291,6 +295,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d)
291295
#elif defined(NPY_HAVE_NEON)
292296
divisor.val[1] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh1));
293297
divisor.val[2] = npyv_reinterpret_u16_s16(npyv_setall_s16(-sh2));
298+
#elif defined(NPY_HAVE_LSX)
299+
divisor.val[1] = npyv_setall_u16(sh1);
300+
divisor.val[2] = npyv_setall_u16(sh2);
294301
#else
295302
#error "please initialize the shifting operand for the new architecture"
296303
#endif
@@ -321,6 +328,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d)
321328
divisor.val[1] = npyv_setall_s16(sh);
322329
#elif defined(NPY_HAVE_NEON)
323330
divisor.val[1] = npyv_setall_s16(-sh);
331+
#elif defined(NPY_HAVE_LSX)
332+
divisor.val[1] = npyv_setall_s16(sh);
324333
#else
325334
#error "please initialize the shifting operand for the new architecture"
326335
#endif
@@ -358,6 +367,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d)
358367
#elif defined(NPY_HAVE_NEON)
359368
divisor.val[1] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh1));
360369
divisor.val[2] = npyv_reinterpret_u32_s32(npyv_setall_s32(-sh2));
370+
#elif defined(NPY_HAVE_LSX)
371+
divisor.val[1] = npyv_setall_u32(sh1);
372+
divisor.val[2] = npyv_setall_u32(sh2);
361373
#else
362374
#error "please initialize the shifting operand for the new architecture"
363375
#endif
@@ -393,6 +405,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d)
393405
divisor.val[1] = npyv_setall_s32(sh);
394406
#elif defined(NPY_HAVE_NEON)
395407
divisor.val[1] = npyv_setall_s32(-sh);
408+
#elif defined(NPY_HAVE_LSX)
409+
divisor.val[1] = npyv_setall_s32(sh);
396410
#else
397411
#error "please initialize the shifting operand for the new architecture"
398412
#endif
@@ -427,6 +441,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d)
427441
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
428442
divisor.val[1] = npyv_set_u64(sh1);
429443
divisor.val[2] = npyv_set_u64(sh2);
444+
#elif defined(NPY_HAVE_LSX)
445+
divisor.val[1] = npyv_setall_u64(sh1);
446+
divisor.val[2] = npyv_setall_u64(sh2);
430447
#else
431448
#error "please initialize the shifting operand for the new architecture"
432449
#endif
@@ -465,6 +482,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d)
465482
divisor.val[2] = npyv_setall_s64(d < 0 ? -1 : 0); // sign of divisor
466483
#ifdef NPY_HAVE_SSE2 // SSE/AVX2/AVX512
467484
divisor.val[1] = npyv_set_s64(sh);
485+
#elif defined(NPY_HAVE_LSX)
486+
divisor.val[1] = npyv_setall_s64(sh);
468487
#else
469488
#error "please initialize the shifting operand for the new architecture"
470489
#endif

numpy/_core/src/umath/loops_arithmetic.dispatch.c.src

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
* q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
3737
********************************************************************************/
3838

39-
#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
39+
#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)
4040
// Due to integer 128-bit multiplication emulation, SIMD 64-bit division
4141
// may not perform well on neon, LSX and up to VSX3 compared to scalar
4242
// division.
@@ -452,7 +452,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed)
452452
* Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
453453
* Power10(VSX4) is an exception here since it has native support for integer vector division.
454454
*/
455-
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
455+
#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX))
456456
#undef TO_SIMD_SFX
457457
#endif
458458
NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide)

0 commit comments

Comments
 (0)