JIT: Avx512BW Compare Debug/Release difference #114978

BruceForstall · 2025-04-23T22:37:06Z

// Generated by Fuzzlyn v2.7 on 2025-04-23 21:44:41
// Run on X64 Windows
// Seed: 3309115426102150651-vectort,vector128,vector256,vector512,x86aes,x86avx,x86avx2,x86avx512bw,x86avx512bwvl,x86avx512cd,x86avx512cdvl,x86avx512dq,x86avx512dqvl,x86avx512f,x86avx512fvl,x86avx512fx64,x86avx512vbmi,x86avx512vbmivl,x86bmi1,x86bmi1x64,x86bmi2,x86bmi2x64,x86fma,x86lzcnt,x86lzcntx64,x86pclmulqdq,x86popcnt,x86popcntx64,x86sse,x86ssex64,x86sse2,x86sse2x64,x86sse3,x86sse41,x86sse41x64,x86sse42,x86sse42x64,x86ssse3,x86x86base
// Reduced from 179.3 KiB to 1.4 KiB in 00:01:49
// Debug: Outputs <255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255>
// Release: Outputs <255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>
using System;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// Empty placeholder type. Instances are only created in Main and passed as the
// arg2 parameter of Program.M10, which never reads it.
public class C0
{
}

// Holder for a 512-bit vector field. The repro never assigns F4, so it keeps its
// default value; Program.M10 only prints it when its TestZ branch is taken.
public class C1
{
    public Vector512<short> F4;
}

// Holder for a 128-bit vector field. Program.M10 reads F6 twice and compares the
// two loads against each other; the repro never assigns F6.
public class C2
{
    public Vector128<sbyte> F6;
}

// Fuzzlyn-reduced reproduction. M10 is called twice and the vector returned by the
// first call is printed by the second call. Per the header of this report, a Debug
// build prints 255 in all 64 lanes while a Release build prints 255 only in the
// first 16 lanes and 0 in the rest.
// NOTE: this is a minimized compiler repro; keep the statement shape and order intact.
public class Program
{
    // Output sink used by M10; assigned at the start of Main.
    public static IRuntime s_rt;
    public static void Main()
    {
        s_rt = new Runtime();
        var vr14 = new C1();
        var vr16 = new C1();
        var vr17 = (byte)0;
        var vr18 = Vector512.CreateScalar(vr17);
        var vr19 = new C2();
        var vr20 = new C0();
        // First call: the returned Avx512BW.CompareEqual(arg0, arg0) result becomes vr21.
        var vr21 = M10(vr14, vr18, vr19, vr20);
        var vr22 = new C2();
        var vr23 = new C0();
        // Second call: vr21 is passed as arg0, so this call prints the first call's result.
        Program.M10(vr16, vr21, vr22, vr23);
    }

    // Compares arg1.F6 with itself at 128-bit width, prints argThis.F4 if the TestZ
    // check succeeds, prints arg0, and returns the 512-bit self-comparison of arg0.
    // arg2 is not used.
    public static Vector512<byte> M10(C1 argThis, Vector512<byte> arg0, C2 arg1, C0 arg2)
    {
        // Both operands are loads of the same field, so this is a self-comparison.
        var vr3 = arg1.F6;
        var vr6 = arg1.F6;
        var vr0 = Avx512BW.VL.CompareLessThanOrEqual(vr3, vr6);
        var vr10 = (sbyte)1;
        var vr9 = Vector128.CreateScalar(vr10);
        // TestZ is true only when (vr0 & vr9) has no bits set.
        if (Sse41.TestZ(vr0, vr9))
        {
            s_rt.WriteLine(argThis.F4);
        }

        s_rt.WriteLine(arg0);
        // Second self-comparison, this time at 512-bit width; the report is about this
        // result differing between Debug and Release.
        return Avx512BW.CompareEqual(arg0, arg0);
    }
}

// Output abstraction used by the repro; Program calls it through the static s_rt field.
public interface IRuntime
{
    // Emits a single value of any type T.
    void WriteLine<T>(T value);
}

// IRuntime implementation that forwards every value to the console.
public class Runtime : IRuntime
{
    // Writes value to standard output via System.Console.WriteLine.
    public void WriteLine<T>(T value) => System.Console.WriteLine(value);
}

cc @dotnet/jit-contrib @dotnet/intel

The text was updated successfully, but these errors were encountered:

dotnet-policy-service · 2025-04-23T22:37:39Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

DeepakRajendrakumaran · 2025-04-24T00:45:00Z

Looking at the disassembly for the following method:

    // Variant of the repro's M10 used for the disassembly that follows: it does not
    // print arg0, and the CompareEqual result is stored in vr12 before being returned.
    public static Vector512<byte> M10(C1 argThis, Vector512<byte> arg0, C2 arg1, C0 arg2)
    {
        // Both operands are loads of the same field, so this is a 128-bit self-comparison.
        var vr3 = arg1.F6;
        var vr6 = arg1.F6;
        var vr0 = Avx512BW.VL.CompareLessThanOrEqual(vr3, vr6);
        var vr10 = (sbyte)1;
        var vr9 = Vector128.CreateScalar(vr10);
        // TestZ is true only when (vr0 & vr9) has no bits set.
        if (Sse41.TestZ(vr0, vr9))
        {
            s_rt.WriteLine(argThis.F4);
        }

        // 512-bit self-comparison of arg0.
        var vr12 = Avx512BW.CompareEqual(arg0, arg0);

        return vr12;
    }

*************** After end code gen, before unwindEmit()
G_M15199_IG01:        ; func=00, offs=0x000000, size=0x0010, bbWeight=1, PerfScore 3.75, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG

IN0016: 000000 push     rdi
IN0017: 000001 push     rsi
IN0018: 000002 push     rbx
IN0019: 000003 sub      rsp, 144
IN001a: 00000A mov      rbx, rcx
IN001b: 00000D mov      rsi, rdx

G_M15199_IG02:        ; offs=0x000010, size=0x0025, bbWeight=1, PerfScore 16.00, gcrefRegs=0240 {rsi r9}, byrefRegs=0000 {}, BB01 [0000], byref, isz

IN0001: 000010 vmovups  xmm0, xmmword ptr [r9+0x08]
IN0002: 000016 vpcmpb   k1, xmm0, xmm0, 2
IN0003: 00001D kmovq    qword ptr [V12 rsp+0x28], k1
IN0004: 000024 vpmovm2b xmm0, k1
IN0005: 00002A vptest   xmm0, xmmword ptr [reloc @RWD00]
IN0006: 000033 jne      SHORT G_M15199_IG04

G_M15199_IG03:        ; offs=0x000035, size=0x0049, bbWeight=0.50, PerfScore 7.38, gcrefRegs=0040 {rsi}, byrefRegs=0000 {}, BB02 [0001], byref

IN0007: 000035 mov      rcx, 0x18CC2400220      ; data for Program:s_rt
IN0008: 00003F mov      rdi, gword ptr [rcx]
IN0009: 000042 mov      rcx, rdi
IN000a: 000045 mov      rdx, 0x7FFD9D2B0450      ; IRuntime
IN000b: 00004F mov      r8, 0x7FFD9D2B6410      ; token handle
IN000c: 000059 call     [CORINFO_HELP_VIRTUAL_FUNC_PTR]
IN000d: 00005F vmovups  zmm0, zmmword ptr [rsi+0x08]
IN000e: 000069 vmovups  zmmword ptr [V11 rsp+0x30], zmm0
IN000f: 000074 lea      rdx, [V11 rsp+0x30]
IN0010: 000079 mov      rcx, rdi
IN0011: 00007C call     rax

G_M15199_IG04:        ; offs=0x00007E, size=0x0016, bbWeight=1, PerfScore 6.25, gcrefRegs=0000 {}, byrefRegs=0000 {}, BB03 [0002], byref

IN0012: 00007E kmovq    k1, qword ptr [V12 rsp+0x28]
IN0013: 000085 vpmovm2b zmm0, k1
IN0014: 00008B vmovups  zmmword ptr [rbx], zmm0
IN0015: 000091 mov      rax, rbx

G_M15199_IG05:        ; offs=0x000094, size=0x000E, bbWeight=1, PerfScore 3.75, epilog, nogc, extend

IN001c: 000094 vzeroupper 
IN001d: 000097 add      rsp, 144
IN001e: 00009E pop      rbx
IN001f: 00009F pop      rsi
IN0020: 0000A0 pop      rdi
IN0021: 0000A1 ret

var vr0 = Avx512BW.VL.CompareLessThanOrEqual(vr3, vr6); operates on XMM registers and generates:


IN0002: 000016 vpcmpb   k1, xmm0, xmm0, 2
IN0003: 00001D kmovq    qword ptr [V12 rsp+0x28], k1

vpcmpb k1, xmm0, xmm0, 2 is cmple

So this essentially creates a mask with 16 bits set

Looks like var vr12 = Avx512BW.CompareEqual(arg0, arg0); is reading the result of this


IN0012: 00007E kmovq    k1, qword ptr [V12 rsp+0x28]
IN0013: 000085 vpmovm2b zmm0, k1

But that doesn't work, since the ZMM-sized vpmovm2b needs all 64 mask bits to be set and only the low 16 were written.

Didn't really check in detail but looks like CSE doesn't account for vector size difference. Look for CSE #03 in attached log

DumpBad.log

tannergooding · 2025-04-28T18:16:04Z

> Didn't really check in detail but looks like CSE doesn't account for vector size difference. Look for CSE #3 in attached log

It's unclear why this would even CSE in the first place, as they aren't doing equivalent comparisons. Perhaps this is rather an issue with the immediate operand not being checked or similar?

It's definitely possible the base type or simd size is also getting missed for some scenario.

BruceForstall · 2025-04-29T21:31:48Z

In both comparison cases, the comparison is statically known to be true:

Avx512BW.VL.CompareLessThanOrEqual(arg1.F6, arg1.F6) => true

Avx512BW.CompareEqual(arg0, arg0) => true

value numbering sees the first one:

HWINTRINSIC mask   byte CompareLessThanOrEqualMask

and produces a TYP_MASK constant value with all bits set:

N009 [000010]   HWINTRINSIC => <l:$3c1 {norm=$340 {SimdMaskCns[0xffffffff, 0xffffffff]}, exc=$300 {NullPtrExc($c1)}}, c:$3c0 {norm=$380 {HWI_EVEX_CompareLessThanOrEqualMask($2c0, $2c1, $301)}, exc=$300 {NullPtrExc($c1)}}>
    simdTypeVN is $301 {SimdType(simd16, byte)}

when it gets to:

HWINTRINSIC mask   ubyte CompareEqualMask

it gives it the same value:

N005 [000034]   HWINTRINSIC => <l:$340 {SimdMaskCns[0xffffffff, 0xffffffff]}, c:$381 {HWI_EVEX_CompareEqualMask($542, $543, $30c)}>
    simdTypeVN is $30c {SimdType(simd64, ubyte)}

This leads to a perfectly reasonable CSE of the mask register.

However, when generating the mask register value using CompareLessThanOrEqualMask (instead of just loading a constant?), the JIT generates vpcmpb k1, xmm0, xmm0, 2, and because the registers are XMM, only the low 16 mask bits are set and the high 48 bits are zeroed. The subsequent use expects all 64 mask bits to be set since it uses ZMM registers.

So:

During value numbering, set the SimdMaskCns value based on the SIMD size of the arguments, since that's how the processor interprets the instruction. It seems that the instruction generated is only dependent on the node and simdBaseType of the node, but the number of mask bits set depends on the operand register size: ZMM/YMM/XMM, which depends on the argument types. Actually, maybe value numbering just needs to depend on the gtSimdSize, which in this case is 16 for the first case, and 64 for the second case.
Is value numbering / CSE of kmask valuable?

BruceForstall · 2025-04-29T22:25:16Z

There is some code in EvaluateBinaryMask<TBase> which even normalizes "all bits set" at the simdSize width to full width, which seems dangerous since it might confuse "all bits set" with "all bits set of the particular simd size".

    if (resultValue == bitMask)
    {
        // Output is equivalent to AllBitsSet, so normalize
        memset(&resultValue, 0xFF, sizeof(uint64_t));
    }

tannergooding · 2025-05-01T14:09:26Z

> There is some code in EvaluateBinaryMask which even normalizes "all bits set" at the simdSize width to full width, which seems dangerous since it might confuse "all bits set" with "all bits set of the particular simd size".

This should be generally fine and is intentional since we only have TYP_MASK and it is always 64-bits. The general premise is used in other places as well because AllBitsSet should represent exactly AllBitsSet which means it is usable for anything expecting such a mask regardless of size (as only the used bits are read, other bits are ignored).

The bug seems to be:

> However, when generating the mask register value using CompareLessThanOrEqualMask (instead of just loading a constant?), the JIT generates vpcmpb k1, xmm0, xmm0, 2, and because the registers are XMM, only the low 16 mask bits are set and the high 48 bits are zeroed.

Because this isn't actually producing such an AllBitsSet constant.

It's actually not even clear why or what is generating CompareLessThanOrEqualMask here because a GT_CNS_MSK should be hitting this path and generating kxnorq instead:

runtime/src/coreclr/jit/codegenxarch.cpp, line 573 in 6be6c5d:

    void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simdmask_t* val)

tannergooding · 2025-05-01T15:44:55Z

Ok, I see. We end up with the liberal VN being a constant but the conservative VN not being a constant.

CSE only looks at the conservative VN when checking for constants because AssertionProp only looks at the conservative VN when doing constant propagation. This means we CSE CompareLessThanOrEqual and it never gets updated.

Likewise for gtFoldExprHWIntrinsic we bail out because neither operand is a constant and we don't have any logic to look and see "are both inputs the same local". -- Although looking at the locals might also fail because these are rather indirections to the same local with a constant offset (IND(ADD(LCL_VAR, CNS_INT)))

So rather than producing a VN of AllBitsSet for TYP_MASK we need to ensure the right number of bits are set. If we wanted to have these share a general AllBitsSet TYP_MASK, we'd need to improve constant folding to support more scenarios.

BruceForstall added the area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI) and avx512 (Related to the AVX-512 architecture) labels Apr 23, 2025

BruceForstall added this to the 10.0.0 milestone Apr 23, 2025

BruceForstall self-assigned this Apr 23, 2025

BruceForstall added the blocking-clean-ci-optional Blocking optional rolling runs label Apr 29, 2025

tannergooding mentioned this issue May 1, 2025

Ensure VN produces correctly initialized simdmask_t #115227

Merged

BruceForstall assigned tannergooding and unassigned BruceForstall May 1, 2025

tannergooding closed this as completed in #115227 May 2, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

JIT: Avx512BW Compare Debug/Release difference #114978

JIT: Avx512BW Compare Debug/Release difference #114978

BruceForstall commented Apr 23, 2025

dotnet-policy-service bot commented Apr 23, 2025

DeepakRajendrakumaran commented Apr 24, 2025 •

edited

Loading

tannergooding commented Apr 28, 2025

BruceForstall commented Apr 29, 2025 •

edited

Loading

BruceForstall commented Apr 29, 2025

tannergooding commented May 1, 2025

tannergooding commented May 1, 2025 •

edited

Loading

JIT: Avx512BW Compare Debug/Release difference #114978

JIT: Avx512BW Compare Debug/Release difference #114978

Comments

BruceForstall commented Apr 23, 2025

dotnet-policy-service bot commented Apr 23, 2025

DeepakRajendrakumaran commented Apr 24, 2025 • edited Loading

tannergooding commented Apr 28, 2025

BruceForstall commented Apr 29, 2025 • edited Loading

BruceForstall commented Apr 29, 2025

tannergooding commented May 1, 2025

tannergooding commented May 1, 2025 • edited Loading

DeepakRajendrakumaran commented Apr 24, 2025 •

edited

Loading

BruceForstall commented Apr 29, 2025 •

edited

Loading

tannergooding commented May 1, 2025 •

edited

Loading