Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

GH-115802: JIT "small" code for macOS and Linux #115826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 103 additions & 16 deletions Python/jit.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,18 @@ jit_error(const char *message)
PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
}

static char *
static unsigned char *
jit_alloc(size_t size)
{
assert(size);
assert(size % get_page_size() == 0);
#ifdef MS_WINDOWS
int flags = MEM_COMMIT | MEM_RESERVE;
char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
int failed = memory == NULL;
#else
int flags = MAP_ANONYMOUS | MAP_PRIVATE;
char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
int failed = memory == MAP_FAILED;
#endif
if (failed) {
Expand All @@ -69,7 +69,7 @@ jit_alloc(size_t size)
}

static int
jit_free(char *memory, size_t size)
jit_free(unsigned char *memory, size_t size)
{
assert(size);
assert(size % get_page_size() == 0);
Expand All @@ -86,7 +86,7 @@ jit_free(char *memory, size_t size)
}

static int
mark_executable(char *memory, size_t size)
mark_executable(unsigned char *memory, size_t size)
{
if (size == 0) {
return 0;
Expand All @@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size)
}

static int
mark_readable(char *memory, size_t size)
mark_readable(unsigned char *memory, size_t size)
{
if (size == 0) {
return 0;
Expand Down Expand Up @@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
// Fill all of stencil's holes in the memory pointed to by base, using the
// values in patches.
static void
patch(char *base, const Stencil *stencil, uint64_t *patches)
patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
{
for (uint64_t i = 0; i < stencil->holes_size; i++) {
const Hole *hole = &stencil->holes[i];
void *location = base + hole->offset;
unsigned char *location = base + hole->offset;
uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend;
uint8_t *loc8 = (uint8_t *)location;
uint32_t *loc32 = (uint32_t *)location;
uint64_t *loc64 = (uint64_t *)location;
// LLD is a great reference for performing relocations... just keep in
// mind that Tools/jit/build.py does filtering and preprocessing for us!
// Here's a good place to start for each platform:
// - aarch64-apple-darwin:
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
// - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
// - aarch64-unknown-linux-gnu:
Expand Down Expand Up @@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
// 64-bit absolute address.
*loc64 = value;
continue;
case HoleKind_R_X86_64_GOTPCRELX:
case HoleKind_R_X86_64_REX_GOTPCRELX:
case HoleKind_X86_64_RELOC_GOT:
case HoleKind_X86_64_RELOC_GOT_LOAD: {
// 32-bit relative address.
// Try to relax the GOT load into an immediate value:
uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
(int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
{
if (loc8[-2] == 0x8B) {
// mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
loc8[-2] = 0x8D;
value = relaxed;
}
else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
// call qword ptr [rip + AAA] -> nop; call XXX
loc8[-2] = 0x90;
loc8[-1] = 0xE8;
value = relaxed;
}
else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
// jmp qword ptr [rip + AAA] -> nop; jmp XXX
loc8[-2] = 0x90;
loc8[-1] = 0xE9;
value = relaxed;
}
}
}
// Fall through...
case HoleKind_R_X86_64_GOTPCREL:
case HoleKind_R_X86_64_PC32:
case HoleKind_X86_64_RELOC_SIGNED:
case HoleKind_X86_64_RELOC_BRANCH:
// 32-bit relative address.
value -= (uint64_t)location;
// Check that we're not out of range of 32 signed bits:
assert((int64_t)value >= -(1LL << 31));
assert((int64_t)value < (1LL << 31));
loc32[0] = (uint32_t)value;
continue;
case HoleKind_R_AARCH64_CALL26:
case HoleKind_R_AARCH64_JUMP26:
// 28-bit relative branch.
Expand Down Expand Up @@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
set_bits(loc32, 5, value, 48, 16);
continue;
case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
case HoleKind_R_AARCH64_ADR_GOT_PAGE:
// 21-bit count of pages between this page and an absolute address's
// page... I know, I know, it's weird. Pairs nicely with
// ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
assert(IS_AARCH64_ADRP(*loc32));
// Try to relax the pair of GOT loads into an immediate value:
const Hole *next_hole = &stencil->holes[i + 1];
if (i + 1 < stencil->holes_size &&
(next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) &&
next_hole->offset == hole->offset + 4 &&
next_hole->symbol == hole->symbol &&
next_hole->addend == hole->addend &&
next_hole->value == hole->value)
{
unsigned char rd = get_bits(loc32[0], 0, 5);
assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
unsigned char rt = get_bits(loc32[1], 0, 5);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rt and rn are only used in the assert, so I'm getting an unused variable warning.

unsigned char rn = get_bits(loc32[1], 5, 5);
assert(rd == rn && rn == rt);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
assert(rd == rn && rn == rt);
assert(rd == get_bits(loc32[1], 5, 5) && rd == get_bits(loc32[1], 0, 5));

uint64_t relaxed = *(uint64_t *)value;
if (relaxed < (1UL << 16)) {
// adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
loc32[1] = 0xD503201F;
i++;
continue;
}
if (relaxed < (1ULL << 32)) {
// adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd;
i++;
continue;
}
relaxed = (uint64_t)value - (uint64_t)location;
if ((relaxed & 0x3) == 0 &&
(int64_t)relaxed >= -(1L << 19) &&
(int64_t)relaxed < (1L << 19))
{
// adrp reg, AAA; ldr reg, [reg + BBB] -> ldr x0, XXX; nop
loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd;
loc32[1] = 0xD503201F;
i++;
continue;
}
}
// Number of pages between this page and the value's page:
value = (value >> 12) - ((uint64_t)location >> 12);
// Check that we're not out of range of 21 signed bits:
Expand All @@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
set_bits(loc32, 5, value, 2, 19);
continue;
case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
case HoleKind_R_AARCH64_LD64_GOT_LO12_NC:
// 12-bit low part of an absolute address. Pairs nicely with
// ARM64_RELOC_GOT_LOAD_PAGE21 (above).
assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
Expand All @@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
}

static void
copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
{
memcpy(base, stencil->body, stencil->body_size);
patch(base, stencil, patches);
Expand All @@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
static void
emit(const StencilGroup *group, uint64_t patches[])
{
copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches);
copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches);
copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
}

// Compiles executor in-place. Don't forget to call _PyJIT_Free later!
Expand All @@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
assert((page_size & (page_size - 1)) == 0);
code_size += page_size - (code_size & (page_size - 1));
data_size += page_size - (data_size & (page_size - 1));
char *memory = jit_alloc(code_size + data_size);
unsigned char *memory = jit_alloc(code_size + data_size);
if (memory == NULL) {
return -1;
}
// Loop again to emit the code:
char *code = memory;
char *data = memory + code_size;
char *top = code;
unsigned char *code = memory;
unsigned char *data = memory + code_size;
unsigned char *top = code;
if (trace[0].opcode == _START_EXECUTOR) {
// Don't want to execute this more than once:
top += stencil_groups[_START_EXECUTOR].code.body_size;
Expand Down Expand Up @@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
void
_PyJIT_Free(_PyExecutorObject *executor)
{
char *memory = (char *)executor->jit_code;
unsigned char *memory = (unsigned char *)executor->jit_code;
size_t size = executor->jit_size;
if (memory) {
executor->jit_code = NULL;
Expand Down
10 changes: 10 additions & 0 deletions Tools/jit/_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,23 @@
"IMAGE_REL_AMD64_ADDR64",
"IMAGE_REL_I386_DIR32",
"R_AARCH64_ABS64",
"R_AARCH64_ADR_GOT_PAGE",
"R_AARCH64_CALL26",
"R_AARCH64_JUMP26",
"R_AARCH64_LD64_GOT_LO12_NC",
"R_AARCH64_MOVW_UABS_G0_NC",
"R_AARCH64_MOVW_UABS_G1_NC",
"R_AARCH64_MOVW_UABS_G2_NC",
"R_AARCH64_MOVW_UABS_G3",
"R_X86_64_64",
"R_X86_64_GOTPCREL",
"R_X86_64_GOTPCRELX",
"R_X86_64_PC32",
"R_X86_64_REX_GOTPCRELX",
"X86_64_RELOC_BRANCH",
"X86_64_RELOC_GOT",
"X86_64_RELOC_GOT_LOAD",
"X86_64_RELOC_SIGNED",
"X86_64_RELOC_UNSIGNED",
]

Expand Down
68 changes: 55 additions & 13 deletions Tools/jit/_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class _Target(typing.Generic[_S, _R]):
triple: str
_: dataclasses.KW_ONLY
alignment: int = 1
args: typing.Sequence[str] = ()
prefix: str = ""
debug: bool = False
force: bool = False
Expand Down Expand Up @@ -121,21 +122,14 @@ async def _compile(
"-fno-builtin",
# SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds:
"-fno-jump-tables",
# Position-independent code adds indirection to every load and jump:
"-fno-pic",
"-fno-plt",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you make sure each option has a comment saying what it does.

# Don't make calls to weird stack-smashing canaries:
"-fno-stack-protector",
# We have three options for code model:
# - "small": the default, assumes that code and data reside in the
# lowest 2GB of memory (128MB on aarch64)
# - "medium": assumes that code resides in the lowest 2GB of memory,
# and makes no assumptions about data (not available on aarch64)
# - "large": makes no assumptions about either code or data
"-mcmodel=large",
"-o",
f"{o}",
"-std=c11",
f"{c}",
*self.args,
]
await _llvm.run("clang", args, echo=self.verbose)
return await self._parse(o)
Expand Down Expand Up @@ -284,7 +278,23 @@ def _handle_section(
def _handle_relocation(
self, base: int, relocation: _schema.ELFRelocation, raw: bytes
) -> _stencils.Hole:
symbol: str | None
match relocation:
case {
"Addend": addend,
"Offset": offset,
"Symbol": {"Value": s},
"Type": {
"Value": "R_AARCH64_ADR_GOT_PAGE"
| "R_AARCH64_LD64_GOT_LO12_NC"
| "R_X86_64_GOTPCREL"
| "R_X86_64_GOTPCRELX"
| "R_X86_64_REX_GOTPCRELX" as kind
},
}:
offset += base
s = s.removeprefix(self.prefix)
value, symbol = _stencils.HoleValue.GOT, s
case {
"Addend": addend,
"Offset": offset,
Expand Down Expand Up @@ -358,6 +368,34 @@ def _handle_relocation(
s = s.removeprefix(self.prefix)
value, symbol = _stencils.HoleValue.GOT, s
addend = 0
case {
"Offset": offset,
"Symbol": {"Value": s},
"Type": {"Value": "X86_64_RELOC_GOT" | "X86_64_RELOC_GOT_LOAD" as kind},
}:
offset += base
s = s.removeprefix(self.prefix)
value, symbol = _stencils.HoleValue.GOT, s
addend = (
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
)
case {
"Offset": offset,
"Section": {"Value": s},
"Type": {"Value": "X86_64_RELOC_SIGNED" as kind},
} | {
"Offset": offset,
"Symbol": {"Value": s},
"Type": {
"Value": "X86_64_RELOC_BRANCH" | "X86_64_RELOC_SIGNED" as kind
},
}:
offset += base
s = s.removeprefix(self.prefix)
value, symbol = _stencils.symbol_to_value(s)
addend = (
int.from_bytes(raw[offset : offset + 4], "little", signed=True) - 4
)
case {
"Offset": offset,
"Section": {"Value": s},
Expand All @@ -379,15 +417,19 @@ def _handle_relocation(
def get_target(host: str) -> _COFF | _ELF | _MachO:
"""Build a _Target for the given host "triple" and options."""
if re.fullmatch(r"aarch64-apple-darwin.*", host):
return _MachO(host, alignment=8, prefix="_")
args = ["-mcmodel=large"]
return _MachO(host, alignment=8, args=args, prefix="_")
if re.fullmatch(r"aarch64-.*-linux-gnu", host):
return _ELF(host, alignment=8)
args = ["-mcmodel=large"]
return _ELF(host, alignment=8, args=args)
if re.fullmatch(r"i686-pc-windows-msvc", host):
return _COFF(host, prefix="_")
args = ["-mcmodel=large"]
return _COFF(host, args=args, prefix="_")
if re.fullmatch(r"x86_64-apple-darwin.*", host):
return _MachO(host, prefix="_")
if re.fullmatch(r"x86_64-pc-windows-msvc", host):
return _COFF(host)
args = ["-mcmodel=large"]
return _COFF(host, args=args)
if re.fullmatch(r"x86_64-.*-linux-gnu", host):
return _ELF(host)
raise ValueError(host)
Loading