@@ -47,18 +47,18 @@ jit_error(const char *message)
     PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
 }
 
-static char *
+static unsigned char *
 jit_alloc(size_t size)
 {
     assert(size);
     assert(size % get_page_size() == 0);
 #ifdef MS_WINDOWS
     int flags = MEM_COMMIT | MEM_RESERVE;
-    char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
+    unsigned char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
     int failed = memory == NULL;
 #else
     int flags = MAP_ANONYMOUS | MAP_PRIVATE;
-    char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+    unsigned char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
     int failed = memory == MAP_FAILED;
 #endif
     if (failed) {
@@ -69,7 +69,7 @@ jit_alloc(size_t size)
 }
 
 static int
-jit_free(char *memory, size_t size)
+jit_free(unsigned char *memory, size_t size)
 {
     assert(size);
     assert(size % get_page_size() == 0);
@@ -86,7 +86,7 @@ jit_free(char *memory, size_t size)
 }
 
 static int
-mark_executable(char *memory, size_t size)
+mark_executable(unsigned char *memory, size_t size)
 {
     if (size == 0) {
         return 0;
@@ -113,7 +113,7 @@ mark_executable(char *memory, size_t size)
 }
 
 static int
-mark_readable(char *memory, size_t size)
+mark_readable(unsigned char *memory, size_t size)
 {
     if (size == 0) {
         return 0;
@@ -169,18 +169,20 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
 // Fill all of stencil's holes in the memory pointed to by base, using the
 // values in patches.
 static void
-patch(char *base, const Stencil *stencil, uint64_t *patches)
+patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
 {
     for (uint64_t i = 0; i < stencil->holes_size; i++) {
         const Hole *hole = &stencil->holes[i];
-        void *location = base + hole->offset;
+        unsigned char *location = base + hole->offset;
         uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend;
+        uint8_t *loc8 = (uint8_t *)location;
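+        // (A byte-level view of the hole, for inspecting and rewriting x86
+        // opcodes during the GOT-load relaxations below.)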
         uint32_t *loc32 = (uint32_t *)location;
         uint64_t *loc64 = (uint64_t *)location;
         // LLD is a great reference for performing relocations... just keep in
         // mind that Tools/jit/build.py does filtering and preprocessing for us!
         // Here's a good place to start for each platform:
         // - aarch64-apple-darwin:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64.cpp
         //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
         //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
         // - aarch64-unknown-linux-gnu:
@@ -208,6 +210,47 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 // 64-bit absolute address.
                 *loc64 = value;
                 continue;
+            case HoleKind_R_X86_64_GOTPCRELX:
+            case HoleKind_R_X86_64_REX_GOTPCRELX:
+            case HoleKind_X86_64_RELOC_GOT:
+            case HoleKind_X86_64_RELOC_GOT_LOAD: {
+                // 32-bit relative address.
+                // Try to relax the GOT load into an immediate value:
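+                // (Here, value is the address of the symbol's GOT slot with
+                // the usual -4 rel32 bias folded in, so value + 4 is the slot
+                // itself; data is patched before code, so the load below sees
+                // the symbol's final address.)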
+                uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
+                if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
+                    (int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
+                {
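+                    // Only opcode bytes are rewritten here: mov (0x8B) and
+                    // lea (0x8D) share the same ModRM layout, and the nop'd
+                    // call/jmp forms reuse the original disp32 field as their
+                    // rel32 operand.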
+                    if (loc8[-2] == 0x8B) {
+                        // mov reg, dword ptr [rip + AAA] -> lea reg, [rip + XXX]
+                        loc8[-2] = 0x8D;
+                        value = relaxed;
+                    }
+                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x15) {
+                        // call qword ptr [rip + AAA] -> nop; call XXX
+                        loc8[-2] = 0x90;
+                        loc8[-1] = 0xE8;
+                        value = relaxed;
+                    }
+                    else if (loc8[-2] == 0xFF && loc8[-1] == 0x25) {
+                        // jmp qword ptr [rip + AAA] -> nop; jmp XXX
+                        loc8[-2] = 0x90;
+                        loc8[-1] = 0xE9;
+                        value = relaxed;
+                    }
+                }
+            }
+            // Fall through...
+            case HoleKind_R_X86_64_GOTPCREL:
+            case HoleKind_R_X86_64_PC32:
+            case HoleKind_X86_64_RELOC_SIGNED:
+            case HoleKind_X86_64_RELOC_BRANCH:
+                // 32-bit relative address.
+                value -= (uint64_t)location;
+                // Check that we're not out of range of 32 signed bits:
+                assert((int64_t)value >= -(1LL << 31));
+                assert((int64_t)value < (1LL << 31));
+                loc32[0] = (uint32_t)value;
+                continue;
             case HoleKind_R_AARCH64_CALL26:
             case HoleKind_R_AARCH64_JUMP26:
                 // 28-bit relative branch.
@@ -249,10 +292,53 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 set_bits(loc32, 5, value, 48, 16);
                 continue;
             case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
+            case HoleKind_R_AARCH64_ADR_GOT_PAGE:
                 // 21-bit count of pages between this page and an absolute address's
                 // page... I know, I know, it's weird. Pairs nicely with
                 // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
                 assert(IS_AARCH64_ADRP(*loc32));
+                // Try to relax the pair of GOT loads into an immediate value:
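+                // (Again, the GOT entry's final value is readable here because
+                // data is patched first. Encodings used below: 0xD2800000 is
+                // MOVZ, 0xF2A00000 is MOVK #imm, lsl 16, 0x58000000 is 64-bit
+                // LDR (literal), and 0xD503201F is NOP.)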
+                const Hole *next_hole = &stencil->holes[i + 1];
+                if (i + 1 < stencil->holes_size &&
+                    (next_hole->kind == HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12 ||
+                     next_hole->kind == HoleKind_R_AARCH64_LD64_GOT_LO12_NC) &&
+                    next_hole->offset == hole->offset + 4 &&
+                    next_hole->symbol == hole->symbol &&
+                    next_hole->addend == hole->addend &&
+                    next_hole->value == hole->value)
+                {
+                    unsigned char rd = get_bits(loc32[0], 0, 5);
+                    assert(IS_AARCH64_LDR_OR_STR(loc32[1]));
+                    unsigned char rt = get_bits(loc32[1], 0, 5);
+                    unsigned char rn = get_bits(loc32[1], 5, 5);
+                    assert(rd == rn && rn == rt);
+                    uint64_t relaxed = *(uint64_t *)value;
+                    if (relaxed < (1UL << 16)) {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; nop
+                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
+                        loc32[1] = 0xD503201F;
+                        i++;
+                        continue;
+                    }
+                    if (relaxed < (1ULL << 32)) {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> movz reg, XXX; movk reg, YYY
+                        loc32[0] = 0xD2800000 | (get_bits(relaxed, 0, 16) << 5) | rd;
+                        loc32[1] = 0xF2A00000 | (get_bits(relaxed, 16, 16) << 5) | rd;
+                        i++;
+                        continue;
+                    }
+                    relaxed = (uint64_t)value - (uint64_t)location;
+                    if ((relaxed & 0x3) == 0 &&
+                        (int64_t)relaxed >= -(1L << 19) &&
+                        (int64_t)relaxed < (1L << 19))
+                    {
+                        // adrp reg, AAA; ldr reg, [reg + BBB] -> ldr reg, XXX; nop
+                        loc32[0] = 0x58000000 | (get_bits(relaxed, 2, 19) << 5) | rd;
+                        loc32[1] = 0xD503201F;
+                        i++;
+                        continue;
+                    }
+                }
                 // Number of pages between this page and the value's page:
                 value = (value >> 12) - ((uint64_t)location >> 12);
                 // Check that we're not out of range of 21 signed bits:
@@ -264,6 +350,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
                 set_bits(loc32, 5, value, 2, 19);
                 continue;
             case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+            case HoleKind_R_AARCH64_LD64_GOT_LO12_NC:
                 // 12-bit low part of an absolute address. Pairs nicely with
                 // ARM64_RELOC_GOT_LOAD_PAGE21 (above).
                 assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
@@ -285,7 +372,7 @@ patch(char *base, const Stencil *stencil, uint64_t *patches)
 }
 
 static void
-copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
+copy_and_patch(unsigned char *base, const Stencil *stencil, uint64_t *patches)
 {
     memcpy(base, stencil->body, stencil->body_size);
     patch(base, stencil, patches);
@@ -294,8 +381,8 @@ copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
 static void
 emit(const StencilGroup *group, uint64_t patches[])
 {
-    copy_and_patch((char *)patches[HoleValue_CODE], &group->code, patches);
-    copy_and_patch((char *)patches[HoleValue_DATA], &group->data, patches);
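+    // Data before code: the GOT-load relaxations in patch() read the final
+    // 64-bit values already written into the data section's GOT entries.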
+    copy_and_patch((unsigned char *)patches[HoleValue_DATA], &group->data, patches);
+    copy_and_patch((unsigned char *)patches[HoleValue_CODE], &group->code, patches);
 }
 
 // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
@@ -316,14 +403,14 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
     assert((page_size & (page_size - 1)) == 0);
     code_size += page_size - (code_size & (page_size - 1));
     data_size += page_size - (data_size & (page_size - 1));
-    char *memory = jit_alloc(code_size + data_size);
+    unsigned char *memory = jit_alloc(code_size + data_size);
     if (memory == NULL) {
         return -1;
     }
     // Loop again to emit the code:
-    char *code = memory;
-    char *data = memory + code_size;
-    char *top = code;
+    unsigned char *code = memory;
+    unsigned char *data = memory + code_size;
+    unsigned char *top = code;
     if (trace[0].opcode == _START_EXECUTOR) {
         // Don't want to execute this more than once:
         top += stencil_groups[_START_EXECUTOR].code.body_size;
@@ -360,7 +447,7 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction *trace, size
 void
 _PyJIT_Free(_PyExecutorObject *executor)
 {
-    char *memory = (char *)executor->jit_code;
+    unsigned char *memory = (unsigned char *)executor->jit_code;
     size_t size = executor->jit_size;
     if (memory) {
         executor->jit_code = NULL;