Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

JIT: Add support for strength reduction #100913

@jakobbotsch

Description

@jakobbotsch

Now that we have an SSA-based induction variable (IV) analysis (added in #97865), we should implement strength reduction based on it. Example loop:

[MethodImpl(MethodImplOptions.NoInlining)]
private static int StrengthReduce(Span<int> s)
{
    int sum = 0;
    foreach (int val in s)
        sum += val;

    return sum;
}

Codegen x64:

       xor      eax, eax
       mov      rdx, bword ptr [rcx]
       mov      ecx, dword ptr [rcx+0x08]
       xor      r8d, r8d
       test     ecx, ecx
       jle      SHORT G_M11380_IG04
       align    [0 bytes for IG03]
						;; size=15 bbWeight=1 PerfScore 5.75

G_M11380_IG03:  ;; offset=0x0013
       add      eax, dword ptr [rdx+4*r8]
       inc      r8d
       cmp      r8d, ecx
       jl       SHORT G_M11380_IG03
						;; size=12 bbWeight=4 PerfScore 18.00

Codegen arm64:

            mov     w0, wzr
            ldr     x1, [fp, #0x10]	// [V00 arg0]
            ldr     w2, [fp, #0x18]	// [V00 arg0+0x08]
            mov     w3, wzr
            cmp     w2, #0
            ble     G_M1017_IG04
            align   [0 bytes for IG03]
						;; size=24 bbWeight=1 PerfScore 6.50

G_M1017_IG03:  ;; offset=0x0024
            ldr     w4, [x1, w3, UXTW #2]
            add     w0, w4, w0
            add     w3, w3, #1
            cmp     w3, w2
            blt     G_M1017_IG03
						;; size=20 bbWeight=4 PerfScore 22.00

The point of strength reduction is to optimize the loop codegen as if it had been written as follows:

[MethodImpl(MethodImplOptions.NoInlining)]
private static int StrengthReduce(Span<int> s)
{
    int sum = 0;
    ref int p = ref MemoryMarshal.GetReference(s);
    ref int end = ref Unsafe.Add(ref p, s.Length);
    while (Unsafe.IsAddressLessThan(ref p, ref end))
    {
        sum += p;
        p = ref Unsafe.Add(ref p, 1);
    }

    return sum;
}

The codegen would look like:
x64:

       xor      eax, eax
       mov      rdx, bword ptr [rcx]
       mov      ecx, dword ptr [rcx+0x08]
       lea      rcx, bword ptr [rdx+4*rcx]
       cmp      rdx, rcx
       jae      SHORT G_M11380_IG04
       align    [0 bytes for IG03]
						;; size=17 bbWeight=1 PerfScore 6.00

G_M11380_IG03:  ;; offset=0x0011
       add      eax, dword ptr [rdx]
       add      rdx, 4
       cmp      rdx, rcx
       jb       SHORT G_M11380_IG03
						;; size=11 bbWeight=4 PerfScore 18.00

arm64:

            mov     w0, wzr
            ldr     x1, [fp, #0x10]	// [V00 arg0]
            ldr     w2, [fp, #0x18]	// [V00 arg0+0x08]
            ubfiz   x2, x2, #2, #32
            add     x2, x1, x2
            cmp     x1, x2
            bhs     G_M11380_IG04
            align   [0 bytes for IG03]
						;; size=28 bbWeight=1 PerfScore 7.50

G_M11380_IG03:  ;; offset=0x0028
            ldr     w3, [x1]
            add     w0, w0, w3
            add     x1, x1, #4
            cmp     x1, x2
            blo     G_M11380_IG03
						;; size=20 bbWeight=4 PerfScore 22.00

On arm64 there is the additional possibility of using a post-indexed (post-increment) addressing mode, by optimizing the placement of the IV increment once strength reduction has happened: the load and the pointer increment fold into a single instruction, and the loop body reduces to:

G_M11380_IG03:  ;; offset=0x0028
            ldr     w3, [x1], #4
            add     w0, w0, w3
            cmp     x1, x2
            blo     G_M11380_IG03

Metadata

Metadata

Assignees

Labels

Priority:2 — Work that is important, but not critical for the release
area-CodeGen-coreclr — CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions