Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 6a9644f

Browse files
committed
Address review feedback on CPU features and CI workflows
- Disable X86_V4 (AVX-512) for MSVC builds due to Highway incompatibility
- Add FIXME comment for future MSVC compatibility investigation
- Update SIMD CI workflow to reflect `x86-64-v2` baseline
- Remove redundant test configurations
- Add missing X86_V4 support to unary complex loops
1 parent f2ad1f1 commit 6a9644f

8 files changed

Lines changed: 41 additions & 36 deletions

File tree

.github/workflows/linux_simd.yml

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ name: Linux SIMD tests
77
#
88
# - baseline_only:
99
# Focuses on completing as quickly as possible and acts as a filter for other, more resource-intensive jobs.
10-
# Utilizes only the default baseline targets (e.g., SSE3 on X86_64) without enabling any runtime dispatched features.
10+
# Utilizes only the default baseline targets (e.g., X86_V2 on X86_64) without enabling any runtime dispatched features.
1111
#
1212
# - old_gcc:
1313
# Tests the oldest supported GCC version with default CPU/baseline/dispatch settings.
@@ -19,10 +19,6 @@ name: Linux SIMD tests
1919
# Tests against the host CPU features set as the baseline without enabling any runtime dispatched features.
2020
# Intended to assess the entire NumPy codebase against host flags, even for code sections lacking handwritten SIMD intrinsics.
2121
#
22-
# - without_avx512/avx2/fma3:
23-
# Uses runtime SIMD dispatching but disables AVX2, FMA3, and AVX512.
24-
# Intended to evaluate 128-bit SIMD extensions without FMA support.
25-
#
2622
# - without_avx512:
2723
# Uses runtime SIMD dispatching but disables AVX512.
2824
# Intended to evaluate 128-bit/256-bit SIMD extensions.
@@ -165,11 +161,6 @@ jobs:
165161
"-Dallow-noblas=true -Dcpu-dispatch=max-x86_v4",
166162
"3.11"
167163
]
168-
- [
169-
"without avx512/avx2/fma3",
170-
"-Dallow-noblas=true -Dcpu-dispatch=max-x86_v3",
171-
"3.11"
172-
]
173164

174165
env:
175166
MESON_ARGS: ${{ matrix.BUILD_PROP[1] }}

doc/source/reference/simd/build-options.rst

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ Enables all features supported by the host CPU.
261261

262262
Detects the features enabled by the compiler. This option is appended by default
263263
to ``cpu-baseline`` if ``-march``, ``-mcpu``, ``-xhost``, or ``/QxHost`` is set in
264-
the environment variable ``CFLAGS``.
264+
the environment variable ``CFLAGS`` unless ``cpu-baseline-detect`` is ``disabled``.
265265

266266
``MIN``
267267
~~~~~~~
@@ -303,13 +303,7 @@ Remove or add specific features, useful with ``MAX``, ``MIN``, and ``NATIVE``:
303303
- Adding a feature (``+``) includes all implied features
304304
- Removing a feature (``-``) excludes all successor features that imply the removed feature
305305

306-
<<<<<<< HEAD
307-
- ``cpu-baseline`` will be treated as "native" if compiler native flag
308-
``-march=native`` or ``-xHost`` or ``/QxHost`` is enabled through environment variable
309-
``CFLAGS`` and ``cpu-baseline-detect`` is not ``disabled``::
310-
=======
311306
Examples::
312-
>>>>>>> f68e178f88 (ENH: Modulate dispatched x86 CPU features)
313307

314308
python -m build --wheel -Csetup-args=-Dcpu-dispatch="max-X86_V4"
315309
python -m build --wheel -Csetup-args=-Dcpu-baseline="min+X86_V4"

meson.options

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,9 @@ option('disable-optimization', type: 'boolean', value: false,
2828
description: 'Disable CPU optimized code (dispatch,simd,unroll...)')
2929
option('cpu-baseline', type: 'string', value: 'min',
3030
description: 'Minimal set of required CPU features')
31-
<<<<<<< HEAD
3231
option('cpu-baseline-detect', type: 'feature', value: 'auto',
3332
description: 'Detect CPU baseline from the compiler flags')
34-
option('cpu-dispatch', type: 'string', value: 'max -xop -fma4',
35-
=======
3633
option('cpu-dispatch', type: 'string', value: 'max',
37-
>>>>>>> f68e178f88 (ENH: Modulate dispatched x86 CPU features)
3834
description: 'Dispatched set of additional CPU features')
3935
option('test-simd', type: 'array',
4036
value: [

meson_cpu/x86/meson.build

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ AVX512_ICL = mod_features.new(
4747
)
4848
AVX512_SPR = mod_features.new(
4949
'AVX512_SPR', 35, implies: AVX512_ICL,
50-
args: ['-mavx512fp16'],
51-
group: ['AVX512FP16'],
50+
args: ['-mavx512fp16', '-mavx512bf16'],
51+
group: ['AVX512FP16', 'AVX512BF16'],
5252
detect: 'AVX512_SPR',
5353
test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_spr.c')[0]
5454
)
@@ -77,13 +77,19 @@ if compiler_id == 'intel'
7777
endif
7878

7979
if compiler_id == 'msvc'
80+
cc_ver = cc.version()
8081
MSVC_SSE4 = cpu_family == 'x86' ? ['/arch:SSE2'] : []
81-
MSVC_SSE4 = cc.version().version_compare('>=19.40') ? ['/arch:SSE4.2'] : MSVC_SSE4
82-
X86_V2.update(args: MSVC_SSE4 + HWY_SSE4_FLAGS + ['/fp:contract'])
82+
MSVC_SSE4 = cc_ver.version_compare('>=19.40') ? ['/arch:SSE4.2'] : MSVC_SSE4
83+
MSVC_SSE4 = cc_ver.version_compare('>=19.30') ? MSVC_SSE4 + ['/fp:contract'] : MSVC_SSE4
84+
X86_V2.update(args: MSVC_SSE4)
8385
clear_arch = '/arch:.*'
8486
X86_V3.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
85-
X86_V4.update(args: {'val': '/arch:AVX512', 'match': clear_arch})
86-
AVX512_ICL.update(args: '')
87+
# FIXME: After completing transition from universal intrinsics to Highway,
88+
# investigate which MSVC versions are incompatible with Highway's AVX-512 implementation.
89+
X86_V4.update(disable: 'Considered broken by Highway on MSVC')
90+
# To force enable AVX-512, use:
91+
# X86_V4.update(args: [{'val': '/arch:AVX512', 'match': clear_arch}, '-DHWY_BROKEN_MSVC=0'])
92+
AVX512_ICL.update(disable: 'unsupported by Highway on MSVC')
8793
endif
8894

8995
# legacy CPU features

numpy/_core/meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1052,7 +1052,7 @@ foreach gen_mtargets : [
10521052
'loops_unary_complex.dispatch.h',
10531053
src_file.process('src/umath/loops_unary_complex.dispatch.c.src'),
10541054
[
1055-
X86_V3, X86_V2,
1055+
X86_V4, X86_V3, X86_V2,
10561056
ASIMD, NEON,
10571057
VSX3, VSX2,
10581058
VXE, VX,

numpy/_core/src/common/npy_cpu_features.c

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ static struct {
113113
{NPY_CPU_FEATURE_AVX512VBMI2, "AVX512VBMI2"},
114114
{NPY_CPU_FEATURE_AVX512BITALG, "AVX512BITALG"},
115115
{NPY_CPU_FEATURE_AVX512FP16 , "AVX512FP16"},
116+
{NPY_CPU_FEATURE_AVX512BF16 , "AVX512BF16"},
116117
{NPY_CPU_FEATURE_AVX512_KNL, "AVX512_KNL"},
117118
{NPY_CPU_FEATURE_AVX512_KNM, "AVX512_KNM"},
118119
{NPY_CPU_FEATURE_AVX512_SKX, "AVX512_SKX"},
@@ -410,12 +411,18 @@ npy__cpu_getxcr0(void)
410411
}
411412

412413
static void
413-
npy__cpu_cpuid(int reg[4], int func_id)
414+
npy__cpu_cpuid_count(int reg[4], int func_id, int count)
414415
{
415416
#if defined(_MSC_VER)
416-
__cpuidex(reg, func_id, 0);
417+
__cpuidex(reg, func_id, count);
417418
#elif defined(__INTEL_COMPILER)
418419
__cpuid(reg, func_id);
420+
// classic Intel compilers do not support count
421+
if (count != 0) {
422+
for (int i = 0; i < 4; i++) {
423+
reg[i] = 0;
424+
}
425+
}
419426
#elif defined(__GNUC__) || defined(__clang__)
420427
#if defined(NPY_CPU_X86) && defined(__PIC__)
421428
// %ebx may be the PIC register
@@ -424,20 +431,26 @@ npy__cpu_cpuid(int reg[4], int func_id)
424431
"xchg{l}\t{%%}ebx, %1\n\t"
425432
: "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]),
426433
"=d" (reg[3])
427-
: "a" (func_id), "c" (0)
434+
: "a" (func_id), "c" (count)
428435
);
429436
#else
430437
__asm__("cpuid\n\t"
431438
: "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]),
432439
"=d" (reg[3])
433-
: "a" (func_id), "c" (0)
440+
: "a" (func_id), "c" (count)
434441
);
435442
#endif
436443
#else
437444
reg[0] = 0;
438445
#endif
439446
}
440447

448+
static void
449+
npy__cpu_cpuid(int reg[4], int func_id)
450+
{
451+
return npy__cpu_cpuid_count(reg, func_id, 0);
452+
}
453+
441454
static void
442455
npy__cpu_init_features(void)
443456
{
@@ -552,6 +565,8 @@ npy__cpu_init_features(void)
552565
npy__cpu_have[NPY_CPU_FEATURE_AVX512BITALG] = (reg[2] & (1 << 12)) != 0;
553566
// Sapphire Rapids
554567
npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16] = (reg[3] & (1 << 23)) != 0;
568+
npy__cpu_cpuid_count(reg, 7, 1);
569+
npy__cpu_have[NPY_CPU_FEATURE_AVX512BF16] = (reg[0] & (1 << 5)) != 0;
555570
}
556571

557572
// Groups
@@ -598,7 +613,9 @@ npy__cpu_init_features(void)
598613
npy__cpu_have[NPY_CPU_FEATURE_VPCLMULQDQ];
599614

600615
npy__cpu_have[NPY_CPU_FEATURE_AVX512_SPR] = npy__cpu_have[NPY_CPU_FEATURE_AVX512_ICL] &&
601-
npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16];
616+
npy__cpu_have[NPY_CPU_FEATURE_AVX512FP16] &&
617+
npy__cpu_have[NPY_CPU_FEATURE_AVX512BF16];
618+
602619

603620

604621
// Legacy groups

numpy/_core/src/common/npy_cpu_features.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ enum npy_cpu_features
5151
NPY_CPU_FEATURE_AVX512VBMI2 = 43,
5252
NPY_CPU_FEATURE_AVX512BITALG = 44,
5353
NPY_CPU_FEATURE_AVX512FP16 = 45,
54+
NPY_CPU_FEATURE_AVX512BF16 = 46,
5455

5556

5657
// X86 CPU Groups
@@ -66,7 +67,7 @@ enum npy_cpu_features
6667
NPY_CPU_FEATURE_AVX512_CNL = 105,
6768
// Ice Lake (F,CD,BW,DQ,VL,IFMA,VBMI,VNNI,VBMI2,BITALG,VPOPCNTDQ,GFNI,VPCLMULDQ,VAES)
6869
NPY_CPU_FEATURE_AVX512_ICL = 106,
69-
// Sapphire Rapids (Ice Lake, AVX512FP16)
70+
// Sapphire Rapids (Ice Lake, AVX512FP16, AVX512BF16)
7071
NPY_CPU_FEATURE_AVX512_SPR = 107,
7172
// x86-64-v2 microarchitectures (SSE[1-4.*], POPCNT, LAHF, CX16)
7273
// On 32-bit, cx16 is not available so it is not included

numpy/_core/tests/test_cpu_features.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -358,15 +358,15 @@ class Test_X86_Features(AbstractTest):
358358
"AVX512VBMI2", "AVX512BITALG", "AVX512VPOPCNTDQ",
359359
"VAES", "VPCLMULQDQ", "GFNI"
360360
]
361-
features_groups["AVX512_SPR"] = features_groups["AVX512_ICL"] + ["AVX512FP16"]
361+
features_groups["AVX512_SPR"] = features_groups["AVX512_ICL"] + ["AVX512FP16", "AVX512BF16"]
362362

363363
features_map = {
364364
"SSE3": "PNI", "SSE41": "SSE4_1", "SSE42": "SSE4_2", "FMA3": "FMA",
365365
"BMI": "BMI1", "LZCNT": "ABM", "LAHF": "LAHF_LM",
366366
"AVX512VNNI": "AVX512_VNNI", "AVX512BITALG": "AVX512_BITALG",
367367
"AVX512VBMI2": "AVX512_VBMI2", "AVX5124FMAPS": "AVX512_4FMAPS",
368368
"AVX5124VNNIW": "AVX512_4VNNIW", "AVX512VPOPCNTDQ": "AVX512_VPOPCNTDQ",
369-
"AVX512FP16": "AVX512_FP16",
369+
"AVX512FP16": "AVX512_FP16", "AVX512BF16": "AVX512_BF16"
370370
}
371371

372372
def load_flags(self):

0 commit comments

Comments
 (0)