-
Notifications
You must be signed in to change notification settings - Fork 5k
JIT: escape analysis for delegates #115172
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
JIT: escape analysis for delegates #115172
Conversation
Mark delegate invoke calls as non-escaping for `this`. Later, in the stack allocation code rewriting post-pass, if the `this` at a delegate invoke is definitely a stack pointing local, expand the delegate invoke so that physical promotion can see all the accesses. Contributes to dotnet#104936.
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch |
SPMI will not catch this. Local example:
public static int Main()
{
int y = 1;
int[] a = new int[100];
var f = (int x) => x + y;
int sum = 0;
foreach (int i in a)
{
sum += f(a[i]);
}
Console.WriteLine(sum);
return sum;
}
The delegate is stack allocated, promoted, and vanishes. Oddly, though, we do not optimize the array accesses very well; need to dig into that.
; Assembly listing for method X:Main():int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 1 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 loc0 [V00,T08] ( 3, 2.25) long -> rsi class-hnd exact single-def <int[]>
;* V01 loc1 [V01 ] ( 0, 0 ) long -> zero-ref class-hnd exact <System.Func`2[int,int]>
; V02 loc2 [V02,T04] ( 5, 10.75) int -> rdi
; V03 loc3 [V03,T06] ( 2, 4.75) long -> rbp class-hnd exact single-def <int[]>
;* V04 loc4 [V04,T09] ( 0, 0 ) int -> zero-ref
;* V05 loc5 [V05 ] ( 0, 0 ) int -> zero-ref
; V06 OutArgs [V06 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
; V07 tmp1 [V07,T07] ( 3, 4.50) ref -> rbx class-hnd exact single-def "NewObj constructor temp" <X+<>c__DisplayClass0_0>
;* V08 tmp2 [V08 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewArr temp" <int[]>
;* V09 tmp3 [V09 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <System.Func`2[int,int]>
; V10 tmp4 [V10 ] ( 3, 2.25) struct (416) [rsp+0x20] do-not-enreg[XS] must-init addr-exposed "stack allocated array temp" <int[]>
;* V11 tmp5 [V11 ] ( 0, 0 ) struct (56) zero-ref do-not-enreg[SF] "stack allocated System.Func`2" <System.Func`2[int,int]>
;* V12 tmp6 [V12 ] ( 0, 0 ) long -> zero-ref single-def "V11.[000..008)"
; V13 tmp7 [V13,T05] ( 2, 4.75) ref -> rbx single-def "V11.[008..016)"
;* V14 tmp8 [V14,T10] ( 0, 0 ) long -> zero-ref single-def "V11.[024..032)"
; V15 tmp9 [V15,T00] ( 3, 24 ) int -> rdx "index expr"
; V16 tmp10 [V16,T01] ( 2, 16 ) int -> rdx "argument with side effect"
; V17 rat0 [V17,T02] ( 4, 12.75) long -> rsi "Strength reduced derived IV"
; V18 rat1 [V18,T03] ( 4, 12.75) int -> r14 "Trip count IV"
;
; Lcl frame size = 448
G_M46779_IG01: ;; offset=0x0000
push r14
push rdi
push rsi
push rbp
push rbx
sub rsp, 448
vxorps xmm4, xmm4, xmm4
vmovdqa xmmword ptr [rsp+0x20], xmm4
vmovdqa xmmword ptr [rsp+0x30], xmm4
mov rax, -384
vmovdqa xmmword ptr [rsp+rax+0x1C0], xmm4
vmovdqa xmmword ptr [rsp+rax+0x1D0], xmm4
vmovdqa xmmword ptr [rsp+rax+0x1E0], xmm4
add rax, 48
jne SHORT -5 instr
;; size=72 bbWeight=0.75 PerfScore 12.81
G_M46779_IG02: ;; offset=0x0048
mov rcx, 0x7FF7C2314398 ; X+<>c__DisplayClass0_0
call CORINFO_HELP_NEWSFAST
mov rbx, rax
mov dword ptr [rbx+0x08], 1
mov rdx, 0x7FF7C19BFDD0 ; int[]
mov qword ptr [rsp+0x20], rdx
lea rdx, [rsp+0x20]
mov dword ptr [rdx+0x08], 100
lea rsi, [rsp+0x20]
xor edi, edi
mov rbp, rsi
add rsi, 16
mov r14d, 100
;; size=72 bbWeight=0.75 PerfScore 5.06
G_M46779_IG03: ;; offset=0x0090
mov edx, dword ptr [rsi]
cmp edx, 100
jae SHORT G_M46779_IG06
mov edx, dword ptr [rbp+4*rdx+0x10]
mov rcx, rbx
mov rax, 0x7FF7C2262670 ; function address
call rax
add edi, eax
add rsi, 4
dec r14d
jne SHORT G_M46779_IG03
;; size=37 bbWeight=4 PerfScore 42.00
G_M46779_IG04: ;; offset=0x00B5
mov ecx, edi
call [System.Console:WriteLine(int)]
mov eax, edi
;; size=10 bbWeight=1 PerfScore 3.50
G_M46779_IG05: ;; offset=0x00BF
add rsp, 448
pop rbx
pop rbp
pop rsi
pop rdi
pop r14
ret
;; size=14 bbWeight=1 PerfScore 3.75
G_M46779_IG06: ;; offset=0x00CD
call CORINFO_HELP_RNGCHKFAIL
int3
;; size=6 bbWeight=0 PerfScore 0.00 |
Ah, my example was too convoluted; here's a simpler one:
public static int Main()
{
int y = 1;
int[] a = new int[100];
var f = (int x) => x + y;
int sum = 0;
foreach (int i in a)
{
sum += f(i);
}
Console.WriteLine(sum);
return sum;
}
; Assembly listing for method X:Main():int (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Windows
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 1 inlinees with PGO data; 2 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 loc0 [V00,T05] ( 2, 1.50) long -> rsi class-hnd exact single-def <int[]>
;* V01 loc1 [V01 ] ( 0, 0 ) long -> zero-ref class-hnd exact <System.Func`2[int,int]>
; V02 loc2 [V02,T02] ( 5, 10.75) int -> rdi
; V03 loc3 [V03,T06] ( 2, 1.50) long -> rsi class-hnd exact single-def <int[]>
;* V04 loc4 [V04,T07] ( 0, 0 ) int -> zero-ref
;* V05 loc5 [V05 ] ( 0, 0 ) int -> zero-ref
; V06 OutArgs [V06 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
; V07 tmp1 [V07,T04] ( 3, 4.50) ref -> rbx class-hnd exact single-def "NewObj constructor temp" <X+<>c__DisplayClass0_0>
;* V08 tmp2 [V08 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewArr temp" <int[]>
;* V09 tmp3 [V09 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <System.Func`2[int,int]>
; V10 tmp4 [V10 ] ( 3, 2.25) struct (416) [rsp+0x28] do-not-enreg[XS] must-init addr-exposed "stack allocated array temp" <int[]>
;* V11 tmp5 [V11 ] ( 0, 0 ) struct (56) zero-ref do-not-enreg[SF] "stack allocated System.Func`2" <System.Func`2[int,int]>
;* V12 tmp6 [V12 ] ( 0, 0 ) long -> zero-ref single-def "V11.[000..008)"
; V13 tmp7 [V13,T03] ( 2, 4.75) ref -> rbx single-def "V11.[008..016)"
;* V14 tmp8 [V14,T08] ( 0, 0 ) long -> zero-ref single-def "V11.[024..032)"
; V15 rat0 [V15,T00] ( 4, 12.75) long -> rsi "Strength reduced derived IV"
; V16 rat1 [V16,T01] ( 4, 12.75) int -> rbp "Trip count IV"
;
; Lcl frame size = 456
G_M46779_IG01: ;; offset=0x0000
push rdi
push rsi
push rbp
push rbx
sub rsp, 456
xor eax, eax
mov qword ptr [rsp+0x28], rax
vxorps xmm4, xmm4, xmm4
vmovdqa xmmword ptr [rsp+0x30], xmm4
mov rax, -384
vmovdqa xmmword ptr [rsp+rax+0x1C0], xmm4
vmovdqa xmmword ptr [rsp+rax+0x1D0], xmm4
vmovdqa xmmword ptr [rsp+rax+0x1E0], xmm4
add rax, 48
jne SHORT -5 instr
mov qword ptr [rsp+0x1C0], rax
;; size=79 bbWeight=0.75 PerfScore 12.25
G_M46779_IG02: ;; offset=0x004F
mov rcx, 0x7FF7BA704398 ; X+<>c__DisplayClass0_0
call CORINFO_HELP_NEWSFAST
mov rbx, rax
mov dword ptr [rbx+0x08], 1
mov rdx, 0x7FF7B9DAFDD0 ; int[]
mov qword ptr [rsp+0x28], rdx
lea rdx, [rsp+0x28]
mov dword ptr [rdx+0x08], 100
lea rsi, [rsp+0x28]
xor edi, edi
add rsi, 16
mov ebp, 100
;; size=68 bbWeight=0.75 PerfScore 4.88
G_M46779_IG03: ;; offset=0x0093
mov edx, dword ptr [rsi]
mov rcx, rbx
mov rax, 0x7FF7BA652670 ; function address
call rax
add edi, eax
add rsi, 4
dec ebp
jne SHORT G_M46779_IG03
;; size=27 bbWeight=4 PerfScore 29.00
G_M46779_IG04: ;; offset=0x00AE
mov ecx, edi
call [System.Console:WriteLine(int)]
mov eax, edi
;; size=10 bbWeight=1 PerfScore 3.50
G_M46779_IG05: ;; offset=0x00B8
add rsp, 456
pop rbx
pop rbp
pop rsi
pop rdi
ret
;; size=12 bbWeight=1 PerfScore 3.25
; Total bytes of code 196, prolog size 79, PerfScore 52.88, instruction count 45, allocated bytes for code 196 (MethodHash=17874944) for method X:Main():int (FullOpts) |
Wondering what is still missing that prevents stack-allocating the closure (i.e. the `<>c__DisplayClass0_0` instance that is still heap-allocated via `CORINFO_HELP_NEWSFAST`)? |
Field-sensitive analysis for the fields of objects. |
The results are also better when combined with delegate GDV (guarded devirtualization):
; Assembly listing for method Program:Foo():int (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; fully interruptible
; with Synthesized PGO: fgCalledCount is 1000
; 1 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 loc0 [V00,T07] ( 2, 2 ) long -> rcx class-hnd exact single-def <int[]>
;* V01 loc1 [V01 ] ( 0, 0 ) long -> zero-ref class-hnd exact <System.Func`2[int,int]>
; V02 loc2 [V02,T02] ( 5,203.66) int -> rbx
; V03 loc3 [V03,T08] ( 2, 2 ) long -> rcx class-hnd exact single-def <int[]>
;* V04 loc4 [V04,T09] ( 0, 0 ) int -> zero-ref
; V05 loc5 [V05,T03] ( 2,200.66) int -> r8
; V06 OutArgs [V06 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <UNNAMED>
; V07 tmp1 [V07,T06] ( 3, 6 ) ref -> rax class-hnd exact single-def "NewObj constructor temp" <Program+<>c__DisplayClass1_0>
;* V08 tmp2 [V08 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewArr temp" <int[]>
;* V09 tmp3 [V09 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <System.Func`2[int,int]>
; V10 tmp4 [V10,T04] ( 2,200.66) int -> r8 "guarded devirt return temp"
;* V11 tmp5 [V11 ] ( 0, 0 ) ref -> zero-ref class-hnd "guarded devirt this exact temp" <Program+<>c__DisplayClass1_0>
; V12 tmp6 [V12 ] ( 3, 3 ) struct (416) [rsp+0x20] do-not-enreg[XS] must-init addr-exposed "stack allocated array temp" <int[]>
;* V13 tmp7 [V13 ] ( 0, 0 ) struct (56) zero-ref do-not-enreg[SF] "stack allocated System.Func`2" <System.Func`2[int,int]>
;* V14 tmp8 [V14 ] ( 0, 0 ) long -> zero-ref single-def "V13.[000..008)"
; V15 tmp9 [V15,T05] ( 2,101.33) ref -> rax single-def "V13.[008..016)"
;* V16 tmp10 [V16,T10] ( 0, 0 ) long -> zero-ref single-def "V13.[024..032)"
; V17 rat0 [V17,T00] ( 4,301.98) long -> rcx "Strength reduced derived IV"
; V18 rat1 [V18,T01] ( 4,301.98) int -> rdx "Trip count IV"
;
; Lcl frame size = 448
G_M53658_IG01: ;; offset=0x0000
push rbx
sub rsp, 448
vxorps xmm4, xmm4, xmm4
vmovdqa xmmword ptr [rsp+0x20], xmm4
vmovdqa xmmword ptr [rsp+0x30], xmm4
mov rax, -384
vmovdqa xmmword ptr [rsp+rax+0x1C0], xmm4
vmovdqa xmmword ptr [rsp+rax+0x1D0], xmm4
vmovdqa xmmword ptr [rsp+rax+0x1E0], xmm4
add rax, 48
jne SHORT -5 instr
;; size=67 bbWeight=1 PerfScore 13.08
G_M53658_IG02: ;; offset=0x0043
mov rcx, 0x7FFA621D2250 ; Program+<>c__DisplayClass1_0
call CORINFO_HELP_NEWSFAST
mov dword ptr [rax+0x08], 1
mov rcx, 0x7FFA61C6FF00 ; int[]
mov qword ptr [rsp+0x20], rcx
lea rcx, [rsp+0x20]
mov dword ptr [rcx+0x08], 100
lea rcx, [rsp+0x20]
xor ebx, ebx
add rcx, 16
mov edx, 100
align [0 bytes for IG03]
;; size=65 bbWeight=1 PerfScore 6.25
G_M53658_IG03: ;; offset=0x0084
mov r8d, dword ptr [rcx]
add r8d, dword ptr [rax+0x08]
add ebx, r8d
add rcx, 4
dec edx
jne SHORT G_M53658_IG03
;; size=18 bbWeight=100.33 PerfScore 677.21
G_M53658_IG04: ;; offset=0x0096
mov ecx, ebx
call [System.Console:WriteLine(int)]
mov eax, ebx
;; size=10 bbWeight=1.00 PerfScore 3.50
G_M53658_IG05: ;; offset=0x00A0
add rsp, 448
pop rbx
ret
;; size=9 bbWeight=1.00 PerfScore 1.75
; Total bytes of code 169, prolog size 67, PerfScore 701.80, instruction count 35, allocated bytes for code 169 (MethodHash=1fee2e65) for method Program:Foo():int (Tier1)
; ============================================================
We could probably remove the closure with some simple transformations later as well. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull Request Overview
This PR improves JIT escape analysis by marking delegate invoke calls as non-escaping for 'this' and expanding delegate invokes when 'this' is determined to be stack allocated.
- Mark "this" in delegate invokes as non-escaping in the escape check.
- Expand delegate invoke calls in the rewriting pass when the delegate's "this" points to a stack-allocated local.
- Introduce new tests to verify the correct allocation behavior.
Reviewed Changes
Copilot reviewed 2 out of 3 changed files in this pull request and generated 1 comment.
File | Description |
---|---|
src/tests/JIT/opt/ObjectStackAllocation/Delegates.cs | Added tests for verifying delegate allocation behavior |
src/coreclr/jit/objectalloc.cpp | Updated delegate invoke handling for escape analysis and early expansion |
Files not reviewed (1)
- src/tests/JIT/opt/ObjectStackAllocation/Delegates.csproj: Language not supported
@dotnet/jit-contrib PTAL. SPMI won't show any diffs here, but a rough guess (from missed contexts) is that there are about 1000 places where this kicks in. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice!
Mark delegate invoke calls as non-escaping for `this`. Later, in the stack allocation code rewriting post-pass, if the `this` at a delegate invoke is definitely a stack pointing local, expand the delegate invoke so that physical promotion can see all the accesses. Contributes to #104936.