-
-
Notifications
You must be signed in to change notification settings - Fork 32.1k
GH-115802: JIT "small" code for macOS and Linux #115826
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
52bb3da
Implement the small code model for x86_64 and aarch64 macOS and Linux
brandtbucher 81fe5ed
blacken
brandtbucher 36de1cd
Remove references to IMAGE_REL_AMD64_REL32 and clean up comments
brandtbucher 74860c1
Add comment
brandtbucher aa53fab
Move assert back
brandtbucher 462095c
fixup
brandtbucher c28b7de
Catch up with main
brandtbucher File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -47,18 +47,18 @@ jit_error(const char *message) | |||||
PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); | ||||||
} | ||||||
|
||||||
static char * | ||||||
static unsigned char * | ||||||
jit_alloc(size_t size) | ||||||
{ | ||||||
assert(size); | ||||||
assert(size % get_page_size() == 0); | ||||||
#ifdef MS_WINDOWS | ||||||
int flags = MEM_COMMIT | MEM_RESERVE; | ||||||
char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); | ||||||
unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE); | ||||||
int failed = memory == NULL; | ||||||
#else | ||||||
int flags = MAP_ANONYMOUS | MAP_PRIVATE; | ||||||
char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); | ||||||
unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); | ||||||
int failed = memory == MAP_FAILED; | ||||||
#endif | ||||||
if (failed) { | ||||||
|
@@ -69,7 +69,7 @@ jit_alloc(size_t size) | |||||
} | ||||||
|
||||||
static int | ||||||
jit_free(char *memory, size_t size) | ||||||
jit_free(unsigned char *memory, size_t size) | ||||||
{ | ||||||
assert(size); | ||||||
assert(size % get_page_size() == 0); | ||||||
|
@@ -86,7 +86,7 @@ jit_free(char *memory, size_t size) | |||||
} | ||||||
|
||||||
static int | ||||||
mark_executable(char *memory, size_t size) | ||||||
mark_executable(unsigned char *memory, size_t size) | ||||||
{ | ||||||
if (size == 0) { | ||||||
return 0; | ||||||
|
@@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size) | |||||
} | ||||||
|
||||||
static int | ||||||
mark_readable(char *memory, size_t size) | ||||||
mark_readable(unsigned char *memory, size_t size) | ||||||
{ | ||||||
if (size == 0) { | ||||||
return 0; | ||||||
|
@@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start, | |||||
// Fill all of stencil's holes in the memory pointed to by base, using the | ||||||
// values in patches. | ||||||
static void | ||||||
patch(char *base, const Stencil *stencil, uint64_t *patches) | ||||||
patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) | ||||||
{ | ||||||
for (uint64_t i = 0; i < stencil->holes_size; i++) { | ||||||
const Hole *hole = &stencil->holes[i]; | ||||||
void *location = base + hole->offset; | ||||||
unsigned char *location = base + hole->offset; | ||||||
uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend; | ||||||
uint8_t *loc8 = (uint8_t *)location; | ||||||
uint32_t *loc32 = (uint32_t *)location; | ||||||
uint64_t *loc64 = (uint64_t *)location; | ||||||
// LLD is a great reference for performing relocations... just keep in | ||||||
// mind that Tools/jit/build.py does filtering and preprocessing for us! | ||||||
// Here's a good place to start for each platform: | ||||||
// - aarch64-apple-darwin: | ||||||
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp | ||||||
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp | ||||||
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h | ||||||
// - aarch64-unknown-linux-gnu: | ||||||
|
@@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) | |||||
// 64-bit absolute address. | ||||||
*loc64 = value; | ||||||
continue; | ||||||
case HoleKind_R_X86_64_GOTPCRELX: | ||||||
case HoleKind_R_X86_64_REX_GOTPCRELX: | ||||||
case HoleKind_X86_64_RELOC_GOT: | ||||||
case HoleKind_X86_64_RELOC_GOT_LOAD: { | ||||||
// 32-bit relative address. | ||||||
// Try to relax the GOT load into an immediate value: | ||||||
uint64_t relaxed = *(uint64_t *)(value + 4) - 4; | ||||||
if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) && | ||||||
(int64_t)relaxed - (int64_t)location + 1 < (1LL << 31)) | ||||||
{ | ||||||
if (loc8[-2] == 0x8B) { | ||||||
// mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX] | ||||||
loc8[-2] = 0x8D; | ||||||
value = relaxed; | ||||||
} | ||||||
else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) { | ||||||
// call qword ptr [rip + AAA] -> nop; call XXX | ||||||
loc8[-2] = 0x90; | ||||||
loc8[-1] = 0xE8; | ||||||
value = relaxed; | ||||||
} | ||||||
else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) { | ||||||
// jmp qword ptr [rip + AAA] -> nop; jmp XXX | ||||||
loc8[-2] = 0x90; | ||||||
loc8[-1] = 0xE9; | ||||||
value = relaxed; | ||||||
} | ||||||
} | ||||||
} | ||||||
// Fall through... | ||||||
case HoleKind_R_X86_64_GOTPCREL: | ||||||
case HoleKind_R_X86_64_PC32: | ||||||
case HoleKind_X86_64_RELOC_SIGNED: | ||||||
case HoleKind_X86_64_RELOC_BRANCH: | ||||||
// 32-bit relative address. | ||||||
value -= (uint64_t)location; | ||||||
// Check that we're not out of range of 32 signed bits: | ||||||
assert((int64_t)value >= -(1LL << 31)); | ||||||
assert((int64_t)value < (1LL << 31)); | ||||||
loc32[0] = (uint32_t)value; | ||||||
continue; | ||||||
case HoleKind_R_AARCH64_CALL26: | ||||||
case HoleKind_R_AARCH64_JUMP26: | ||||||
// 28-bit relative branch. | ||||||
|
@@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) | |||||
set_bits(loc32, 5, value, 48, 16); | ||||||
continue; | ||||||
case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21: | ||||||
case HoleKind_R_AARCH64_ADR_GOT_PAGE: | ||||||
// 21-bit count of pages between this page and an absolute address's | ||||||
// page... I know, I know, it's weird. Pairs nicely with | ||||||
// ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below). | ||||||
assert(IS_AARCH64_ADRP(*loc32)); | ||||||
// Try to relax the pair of GOT loads into an immediate value: | ||||||
const Hole *next_hole = &stencil->holes[i + 1]; | ||||||
if (i + 1 < stencil->holes_size && | ||||||
(next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 || | ||||||
next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) && | ||||||
next_hole->offset == hole->offset + 4 && | ||||||
next_hole->symbol == hole->symbol && | ||||||
next_hole->addend == hole->addend && | ||||||
next_hole->value == hole->value) | ||||||
{ | ||||||
unsigned char rd = get_bits(loc32[0], 0, 5); | ||||||
assert(IS_AARCH64_LDR_OR_STR(loc32[1])); | ||||||
unsigned char rt = get_bits(loc32[1], 0, 5); | ||||||
unsigned char rn = get_bits(loc32[1], 5, 5); | ||||||
assert(rd == rn && rn == rt); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
uint64_t relaxed = *(uint64_t *)value; | ||||||
if (relaxed < (1UL << 16)) { | ||||||
// adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop | ||||||
loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; | ||||||
loc32[1] = 0xD503201F; | ||||||
i++; | ||||||
continue; | ||||||
} | ||||||
if (relaxed < (1ULL << 32)) { | ||||||
// adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY | ||||||
loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd; | ||||||
loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd; | ||||||
i++; | ||||||
continue; | ||||||
} | ||||||
relaxed = (uint64_t)value - (uint64_t)location; | ||||||
if ((relaxed & 0x3) == 0 && | ||||||
(int64_t)relaxed >= -(1L << 19) && | ||||||
(int64_t)relaxed < (1L << 19)) | ||||||
{ | ||||||
// adrp reg, AAA; ldr reg, [reg + BBB] -> ldr x0, XXX; nop | ||||||
loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd; | ||||||
loc32[1] = 0xD503201F; | ||||||
i++; | ||||||
continue; | ||||||
} | ||||||
} | ||||||
// Number of pages between this page and the value's page: | ||||||
value = (value >> 12) - ((uint64_t)location >> 12); | ||||||
// Check that we're not out of range of 21 signed bits: | ||||||
|
@@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) | |||||
set_bits(loc32, 5, value, 2, 19); | ||||||
continue; | ||||||
case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12: | ||||||
case HoleKind_R_AARCH64_LD64_GOT_LO12_NC: | ||||||
// 12-bit low part of an absolute address. Pairs nicely with | ||||||
// ARM64_RELOC_GOT_LOAD_PAGE21 (above). | ||||||
assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32)); | ||||||
|
@@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches) | |||||
} | ||||||
|
||||||
static void | ||||||
copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches) | ||||||
copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches) | ||||||
{ | ||||||
memcpy(base, stencil->body, stencil->body_size); | ||||||
patch(base, stencil, patches); | ||||||
|
@@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches) | |||||
static void | ||||||
emit(const StencilGroup *group, uint64_t patches[]) | ||||||
{ | ||||||
copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches); | ||||||
copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches); | ||||||
copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches); | ||||||
copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches); | ||||||
} | ||||||
|
||||||
// Compiles executor in-place. Don't forget to call _PyJIT_Free later! | ||||||
|
@@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size | |||||
assert((page_size & (page_size - 1)) == 0); | ||||||
code_size += page_size - (code_size & (page_size - 1)); | ||||||
data_size += page_size - (data_size & (page_size - 1)); | ||||||
char *memory = jit_alloc(code_size + data_size); | ||||||
unsigned char *memory = jit_alloc(code_size + data_size); | ||||||
if (memory == NULL) { | ||||||
return -1; | ||||||
} | ||||||
// Loop again to emit the code: | ||||||
char *code = memory; | ||||||
char *data = memory + code_size; | ||||||
char *top = code; | ||||||
unsigned char *code = memory; | ||||||
unsigned char *data = memory + code_size; | ||||||
unsigned char *top = code; | ||||||
if (trace[0].opcode == _START_EXECUTOR) { | ||||||
// Don't want to execute this more than once: | ||||||
top += stencil_groups[_START_EXECUTOR].code.body_size; | ||||||
|
@@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size | |||||
void | ||||||
_PyJIT_Free(_PyExecutorObject *executor) | ||||||
{ | ||||||
char *memory = (char *)executor->jit_code; | ||||||
unsigned char *memory = (unsigned char *)executor->jit_code; | ||||||
size_t size = executor->jit_size; | ||||||
if (memory) { | ||||||
executor->jit_code = NULL; | ||||||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ class _Target(typing.Generic[_S, _R]): | |
triple: str | ||
_: dataclasses.KW_ONLY | ||
alignment: int = 1 | ||
args: typing.Sequence[str] = () | ||
prefix: str = "" | ||
debug: bool = False | ||
force: bool = False | ||
|
@@ -121,21 +122,14 @@ async def _compile( | |
"-fno-builtin", | ||
# SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds: | ||
"-fno-jump-tables", | ||
# Position-independent code adds indirection to every load and jump: | ||
"-fno-pic", | ||
"-fno-plt", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you make sure each option has a comment saying what it does? |
||
# Don't make calls to weird stack-smashing canaries: | ||
"-fno-stack-protector", | ||
# We have three options for code model: | ||
# - "small": the default, assumes that code and data reside in the | ||
# lowest 2GB of memory (128MB on aarch64) | ||
# - "medium": assumes that code resides in the lowest 2GB of memory, | ||
# and makes no assumptions about data (not available on aarch64) | ||
# - "large": makes no assumptions about either code or data | ||
"-mcmodel=large", | ||
"-o", | ||
f"{o}", | ||
"-std=c11", | ||
f"{c}", | ||
*self.args, | ||
] | ||
await _llvm.run("clang", args, echo=self.verbose) | ||
return await self._parse(o) | ||
|
@@ -284,7 +278,23 @@ def _handle_section( | |
def _handle_relocation( | ||
self, base: int, relocation: _schema.ELFRelocation, raw: bytes | ||
) -> _stencils.Hole: | ||
symbol: str | None | ||
match relocation: | ||
case { | ||
"Addend": addend, | ||
"Offset": offset, | ||
"Symbol": {"Value": s}, | ||
"Type": { | ||
"Value": "R_AARCH64_ADR_GOT_PAGE" | ||
| "R_AARCH64_LD64_GOT_LO12_NC" | ||
| "R_X86_64_GOTPCREL" | ||
| "R_X86_64_GOTPCRELX" | ||
| "R_X86_64_REX_GOTPCRELX" as kind | ||
}, | ||
}: | ||
offset += base | ||
s = s.removeprefix(self.prefix) | ||
value, symbol = _stencils.HoleValue.GOT, s | ||
case { | ||
"Addend": addend, | ||
"Offset": offset, | ||
|
@@ -358,6 +368,34 @@ def _handle_relocation( | |
s = s.removeprefix(self.prefix) | ||
value, symbol = _stencils.HoleValue.GOT, s | ||
addend = 0 | ||
case { | ||
"Offset": offset, | ||
"Symbol": {"Value": s}, | ||
"Type": {"Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind}, | ||
}: | ||
offset += base | ||
s = s.removeprefix(self.prefix) | ||
value, symbol = _stencils.HoleValue.GOT, s | ||
addend = ( | ||
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 | ||
) | ||
case { | ||
"Offset": offset, | ||
"Section": {"Value": s}, | ||
"Type": {"Value": "X86_64_RELOC_SIGNED" as kind}, | ||
} | { | ||
"Offset": offset, | ||
"Symbol": {"Value": s}, | ||
"Type": { | ||
"Value": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind | ||
}, | ||
}: | ||
offset += base | ||
s = s.removeprefix(self.prefix) | ||
value, symbol = _stencils.symbol_to_value(s) | ||
addend = ( | ||
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4 | ||
) | ||
case { | ||
"Offset": offset, | ||
"Section": {"Value": s}, | ||
|
@@ -379,15 +417,19 @@ def _handle_relocation( | |
def get_target(host: str) -> _COFF | _ELF | _MachO: | ||
"""Build a _Target for the given host "triple" and options.""" | ||
if re.fullmatch(r"aarch64-apple-darwin.*", host): | ||
return _MachO(host, alignment=8, prefix="_") | ||
args = ["-mcmodel=large"] | ||
return _MachO(host, alignment=8, args=args, prefix="_") | ||
if re.fullmatch(r"aarch64-.*-linux-gnu", host): | ||
return _ELF(host, alignment=8) | ||
args = ["-mcmodel=large"] | ||
return _ELF(host, alignment=8, args=args) | ||
if re.fullmatch(r"i686-pc-windows-msvc", host): | ||
return _COFF(host, prefix="_") | ||
args = ["-mcmodel=large"] | ||
return _COFF(host, args=args, prefix="_") | ||
if re.fullmatch(r"x86_64-apple-darwin.*", host): | ||
return _MachO(host, prefix="_") | ||
if re.fullmatch(r"x86_64-pc-windows-msvc", host): | ||
return _COFF(host) | ||
args = ["-mcmodel=large"] | ||
return _COFF(host, args=args) | ||
if re.fullmatch(r"x86_64-.*-linux-gnu", host): | ||
return _ELF(host) | ||
raise ValueError(host) |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`rt` and `rn` are only used in the assert, so I'm getting an unused variable warning.