Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 309185f

Browse files
authored
[wasm] Add limited constant propagation to the jiterpreter for ldc.i4 and ldloca (#99706)
Right now, if an interpreter opcode stores a constant (ldc.i4) or effectively-constant (ldloca) expression into an interpreter local, we have to read it back from memory before using it later in a trace. However, there are many scenarios where it would be profitable to skip that read and instead embed the constant into the trace where the load would otherwise happen. This also enables optimizing out null checks in some cases: if the address being null-checked is constant, we can determine statically whether it is null and omit the runtime check entirely.
1 parent cfe3d2d commit 309185f

File tree

3 files changed

+117
-16
lines changed

3 files changed

+117
-16
lines changed

src/mono/browser/runtime/jiterpreter-support.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,6 +1902,7 @@ export type JiterpreterOptions = {
19021902
enableWasmEh: boolean;
19031903
enableSimd: boolean;
19041904
zeroPageOptimization: boolean;
1905+
cprop: boolean;
19051906
// For locations where the jiterpreter heuristic says we will be unable to generate
19061907
// a trace, insert an entry point opcode anyway. This enables collecting accurate
19071908
// stats for options like estimateHeat, but raises overhead.
@@ -1947,6 +1948,7 @@ const optionNames: { [jsName: string]: string } = {
19471948
"enableWasmEh": "jiterpreter-wasm-eh-enabled",
19481949
"enableSimd": "jiterpreter-simd-enabled",
19491950
"zeroPageOptimization": "jiterpreter-zero-page-optimization",
1951+
"cprop": "jiterpreter-constant-propagation",
19501952
"enableStats": "jiterpreter-stats-enabled",
19511953
"disableHeuristic": "jiterpreter-disable-heuristic",
19521954
"estimateHeat": "jiterpreter-estimate-heat",

src/mono/browser/runtime/jiterpreter-trace-generator.ts

Lines changed: 111 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,44 @@ function is_backward_branch_target(
119119
return false;
120120
}
121121

122+
interface KnownConstantI32 {
123+
type: "i32";
124+
value: number;
125+
}
126+
127+
interface KnownConstantV128 {
128+
type: "v128";
129+
value: Uint8Array;
130+
}
131+
132+
interface KnownConstantLdloca {
133+
type: "ldloca";
134+
offset: number;
135+
}
136+
137+
type KnownConstant = KnownConstantI32 | KnownConstantV128 | KnownConstantLdloca;
122138
type KnownConstantValue = number | Uint8Array;
123-
const knownConstantValues = new Map<number, KnownConstantValue>();
139+
const knownConstants = new Map<number, KnownConstant>();
124140

125-
function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
141+
function get_known_constant(builder: WasmBuilder, localOffset: number): KnownConstant | undefined {
126142
if (isAddressTaken(builder, localOffset))
127143
return undefined;
128144

129-
return knownConstantValues.get(localOffset);
145+
return knownConstants.get(localOffset);
146+
}
147+
148+
function get_known_constant_value(builder: WasmBuilder, localOffset: number): KnownConstantValue | undefined {
149+
const kc = get_known_constant(builder, localOffset);
150+
if (kc === undefined)
151+
return undefined;
152+
153+
switch (kc.type) {
154+
case "i32":
155+
case "v128":
156+
return kc.value;
157+
}
158+
159+
return undefined;
130160
}
131161

132162
// Perform a quick scan through the opcodes potentially in this trace to build a table of
@@ -553,11 +583,20 @@ export function generateWasmBody(
553583
builder.local("pLocals");
554584
// locals[ip[1]] = &locals[ip[2]]
555585
const offset = getArgU16(ip, 2),
556-
flag = isAddressTaken(builder, offset);
586+
flag = isAddressTaken(builder, offset),
587+
destOffset = getArgU16(ip, 1);
557588
if (!flag)
558589
mono_log_error(`${traceName}: Expected local ${offset} to have address taken flag`);
559590
append_ldloca(builder, offset);
560-
append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
591+
append_stloc_tail(builder, destOffset, WasmOpcode.i32_store);
592+
// Record this ldloca as a known constant so that later uses of it turn into a lea,
593+
// and the wasm runtime can constant fold them with other constants. It's not uncommon
594+
// to have code that does '&x + c', which (if this optimization works) should
595+
// turn into '&locals + offsetof(x) + c' and get constant folded to have the same cost
596+
// as a regular ldloc
597+
knownConstants.set(destOffset, { type: "ldloca", offset: offset });
598+
// dreg invalidation would blow the known constant away, so disable it
599+
skipDregInvalidation = true;
561600
break;
562601
}
563602

@@ -1712,14 +1751,14 @@ let cknullOffset = -1;
17121751
function eraseInferredState() {
17131752
cknullOffset = -1;
17141753
notNullSince.clear();
1715-
knownConstantValues.clear();
1754+
knownConstants.clear();
17161755
}
17171756

17181757
function invalidate_local(offset: number) {
17191758
if (cknullOffset === offset)
17201759
cknullOffset = -1;
17211760
notNullSince.delete(offset);
1722-
knownConstantValues.delete(offset);
1761+
knownConstants.delete(offset);
17231762
}
17241763

17251764
function invalidate_local_range(start: number, bytes: number) {
@@ -1792,7 +1831,47 @@ function computeMemoryAlignment(offset: number, opcodeOrPrefix: WasmOpcode, simd
17921831
return alignment;
17931832
}
17941833

1834+
function try_append_ldloc_cprop(
1835+
builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode,
1836+
dryRun: boolean, requireNonzero?: boolean
1837+
) {
1838+
if (builder.options.cprop && (opcodeOrPrefix === WasmOpcode.i32_load)) {
1839+
// It's common to ldc.i4 or ldloca immediately before using the value
1840+
// in these cases the known constant analysis will work consistently, and we can skip the extra
1841+
// memory load to read the constant we just wrote to a local. the resulting traces should be
1842+
// both smaller and faster, while still correct since the ldc still writes to memory
1843+
// of course, if known constant analysis is broken, this will break too, but it's better to
1844+
// learn immediately whether known constant analysis has been broken this whole time
1845+
// at least on x86 this will enable much better native code generation for the trace, since
1846+
// operations like memory stores have forms that accept an immediate as rhs
1847+
const knownConstant = get_known_constant(builder, offset);
1848+
if (knownConstant) {
1849+
switch (knownConstant.type) {
1850+
case "i32":
1851+
if (requireNonzero && (knownConstant.value === 0))
1852+
return false;
1853+
if (!dryRun)
1854+
builder.i32_const(knownConstant.value);
1855+
return true;
1856+
case "ldloca":
1857+
// FIXME: Do we need to invalidate the local again? I don't think we do, we invalidated it
1858+
// when the ldloca operation originally happened, and we're just propagating that address
1859+
// constant forward to its point of use
1860+
// requireNonzero is a no-op since ldloca always produces a nonzero result
1861+
if (!dryRun)
1862+
append_ldloca(builder, knownConstant.offset, 0);
1863+
return true;
1864+
}
1865+
}
1866+
}
1867+
1868+
return false;
1869+
}
1870+
17951871
function append_ldloc(builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) {
1872+
if (try_append_ldloc_cprop(builder, offset, opcodeOrPrefix, false))
1873+
return;
1874+
17961875
builder.local("pLocals");
17971876
mono_assert(opcodeOrPrefix >= WasmOpcode.i32_load, () => `Expected load opcode but got ${opcodeOrPrefix}`);
17981877
builder.appendU8(opcodeOrPrefix);
@@ -1828,8 +1907,6 @@ function append_stloc_tail(builder: WasmBuilder, offset: number, opcodeOrPrefix:
18281907

18291908
// Pass bytesInvalidated=0 if you are reading from the local and the address will never be
18301909
// used for writes
1831-
// Pass transient=true if the address will not persist after use (so it can't be used to later
1832-
// modify the contents of this local)
18331910
function append_ldloca(builder: WasmBuilder, localOffset: number, bytesInvalidated?: number) {
18341911
if (typeof (bytesInvalidated) !== "number")
18351912
bytesInvalidated = 512;
@@ -1985,9 +2062,9 @@ function emit_ldc(builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintOpcode):
19852062
invalidate_local(localOffset);
19862063

19872064
if (typeof (value) === "number")
1988-
knownConstantValues.set(localOffset, value);
2065+
knownConstants.set(localOffset, { type: "i32", value: value });
19892066
else
1990-
knownConstantValues.delete(localOffset);
2067+
knownConstants.delete(localOffset);
19912068

19922069
return true;
19932070
}
@@ -2092,6 +2169,8 @@ function emit_fieldop(
20922169
notNullSince.has(objectOffset) &&
20932170
!isAddressTaken(builder, objectOffset);
20942171

2172+
// TODO: Figure out whether this is commonly used to access fields of structs that
2173+
// live on the stack, and if so, whether we want to do cprop of the ldloca
20952174
if (
20962175
(opcode !== MintOpcode.MINT_LDFLDA_UNSAFE) &&
20972176
(opcode !== MintOpcode.MINT_STFLD_O)
@@ -3088,13 +3167,21 @@ function emit_indirectop(builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintOp
30883167
return false;
30893168
}
30903169

3091-
append_ldloc_cknull(builder, addressVarIndex, ip, false);
3170+
// Check whether ldloc cprop is possible for the address var; if it is, skip doing the ldloc_cknull.
3171+
// We'll also skip loading cknull_ptr later.
3172+
const addressCprop = try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, true, true);
3173+
if (!addressCprop)
3174+
append_ldloc_cknull(builder, addressVarIndex, ip, false);
30923175

30933176
if (isLoad) {
30943177
// pre-load pLocals for the store operation
30953178
builder.local("pLocals");
30963179
// Load address
3097-
builder.local("cknull_ptr");
3180+
if (addressCprop)
3181+
mono_assert(try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, false, true), "Unknown jiterpreter cprop failure");
3182+
else
3183+
builder.local("cknull_ptr");
3184+
30983185
// For ldind_offset we need to load an offset from another local
30993186
// and then add it to the null checked address
31003187
if (isAddMul) {
@@ -3126,13 +3213,21 @@ function emit_indirectop(builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintOp
31263213
append_stloc_tail(builder, valueVarIndex, setter);
31273214
} else if (opcode === MintOpcode.MINT_STIND_REF) {
31283215
// Load destination address
3129-
builder.local("cknull_ptr");
3216+
if (addressCprop)
3217+
mono_assert(try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, false, true), "Unknown jiterpreter cprop failure");
3218+
else
3219+
builder.local("cknull_ptr");
3220+
31303221
// Load address of value so that copy_managed_pointer can grab it
31313222
append_ldloca(builder, valueVarIndex, 0);
31323223
builder.callImport("copy_ptr");
31333224
} else {
31343225
// Pre-load address for the store operation
3135-
builder.local("cknull_ptr");
3226+
if (addressCprop)
3227+
mono_assert(try_append_ldloc_cprop(builder, addressVarIndex, WasmOpcode.i32_load, false, true), "Unknown jiterpreter cprop failure");
3228+
else
3229+
builder.local("cknull_ptr");
3230+
31363231
// For ldind_offset we need to load an offset from another local
31373232
// and then add it to the null checked address
31383233
if (isOffset && offsetVarIndex >= 0) {
@@ -3429,7 +3524,7 @@ function emit_simd(
34293524
const view = localHeapViewU8().slice(<any>ip + 4, <any>ip + 4 + sizeOfV128);
34303525
builder.v128_const(view);
34313526
append_simd_store(builder, ip);
3432-
knownConstantValues.set(getArgU16(ip, 1), view);
3527+
knownConstants.set(getArgU16(ip, 1), { type: "v128", value: view });
34333528
} else {
34343529
// dest
34353530
append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);

src/mono/mono/utils/options-def.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ DEFINE_BOOL(jiterpreter_backward_branches_enabled, "jiterpreter-backward-branche
121121
DEFINE_BOOL(jiterpreter_enable_simd, "jiterpreter-simd-enabled", TRUE, "Attempt to use WebAssembly SIMD support")
122122
// Since the zero page is unallocated, loading array/string/span lengths from null ptrs will yield zero
123123
DEFINE_BOOL(jiterpreter_zero_page_optimization, "jiterpreter-zero-page-optimization", TRUE, "Exploit the zero page being unallocated to optimize out null checks")
124+
// We can produce higher quality code by embedding known constants directly into traces instead of loading
125+
// the constant from its storage location in the interpreter's locals in memory, even if we can't skip
126+
// the write of the constant into memory.
127+
DEFINE_BOOL(jiterpreter_constant_propagation, "jiterpreter-constant-propagation", TRUE, "Propagate ldc.i4 and ldloca expressions forward to locations where those constants are used")
124128
// When compiling a jit_call wrapper, bypass sharedvt wrappers if possible by inlining their
125129
// logic into the compiled wrapper and calling the target AOTed function with native call convention
126130
DEFINE_BOOL(jiterpreter_direct_jit_call, "jiterpreter-direct-jit-calls", TRUE, "Bypass gsharedvt wrappers when compiling JIT call wrappers")

0 commit comments

Comments
 (0)