-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Closed
Labels
Priority:2 (Work that is important, but not critical for the release); area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI)
Milestone
Description
Now that we have an SSA-based induction variable (IV) analysis (added in #97865), we should implement strength reduction based on it. Example loop:
[MethodImpl(MethodImplOptions.NoInlining)]
private static int StrengthReduce(Span<int> s)
{
int sum = 0;
foreach (int val in s)
sum += val;
return sum;
}
Codegen x64:
xor r8d, r8d
test ecx, ecx
jle SHORT G_M11380_IG04
align [0 bytes for IG03]
;; size=15 bbWeight=1 PerfScore 5.75
G_M11380_IG03: ;; offset=0x0013
add eax, dword ptr [rdx+4*r8]
inc r8d
cmp r8d, ecx
jl SHORT G_M11380_IG03
;; size=12 bbWeight=4 PerfScore 18.00
Codegen arm64:
mov w3, wzr
cmp w2, #0
ble G_M1017_IG04
align [0 bytes for IG03]
;; size=24 bbWeight=1 PerfScore 6.50
G_M1017_IG03: ;; offset=0x0024
ldr w4, [x1, w3, UXTW #2]
add w0, w4, w0
add w3, w3, #1
cmp w3, w2
blt G_M1017_IG03
;; size=20 bbWeight=4 PerfScore 22.00
The point of strength reduction is to optimize the loop codegen as if it had been written as follows:
[MethodImpl(MethodImplOptions.NoInlining)]
private static int StrengthReduce(Span<int> s)
{
int sum = 0;
ref int p = ref MemoryMarshal.GetReference(s);
ref int end = ref Unsafe.Add(ref p, s.Length);
while (Unsafe.IsAddressLessThan(ref p, ref end))
{
sum += p;
p = ref Unsafe.Add(ref p, 1);
}
return sum;
}
The codegen would look like:
x64:
xor eax, eax
mov rdx, bword ptr [rcx]
mov ecx, dword ptr [rcx+0x08]
lea rcx, bword ptr [rdx+4*rcx]
cmp rdx, rcx
jae SHORT G_M11380_IG04
align [0 bytes for IG03]
;; size=17 bbWeight=1 PerfScore 6.00
G_M11380_IG03: ;; offset=0x0011
add eax, dword ptr [rdx]
add rdx, 4
cmp rdx, rcx
jb SHORT G_M11380_IG03
;; size=11 bbWeight=4 PerfScore 18.00
arm64:
mov w0, wzr
ldr x1, [fp, #0x10] // [V00 arg0]
ldr w2, [fp, #0x18] // [V00 arg0+0x08]
ubfiz x2, x2, #2, #32
add x2, x1, x2
cmp x1, x2
bhs G_M11380_IG04
align [0 bytes for IG03]
;; size=28 bbWeight=1 PerfScore 7.50
G_M11380_IG03: ;; offset=0x0028
ldr w3, [x1]
add w0, w0, w3
add x1, x1, #4
cmp x1, x2
blo G_M11380_IG03
;; size=20 bbWeight=4 PerfScore 22.00
For arm64 there is the additional possibility of using the post-increment (post-indexed) addressing mode, by optimizing the placement of the IV increment once strength reduction has happened. The loop body can then be reduced to:
G_M11380_IG03: ;; offset=0x0028
ldr w3, [x1], #4
add w0, w0, w3
cmp x1, x2
blo G_M11380_IG03
EgorBo, PaulusParssinen and neon-sunset; Sergio0694 and omariom
Metadata
Metadata
Assignees
Labels
Priority:2 (Work that is important, but not critical for the release); area-CodeGen-coreclr (CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI)