-
-
Notifications
You must be signed in to change notification settings - Fork 32.1k
gh-115999: Implement thread-local bytecode and enable specialization for BINARY_OP
#123926
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
776a1e1
2b40870
344d7ad
f203d00
82b456a
b021704
aea69c5
552277d
50a6089
3f1d941
7d2eb27
d5476b9
e3b367a
b2375bf
2707f8e
3fdcb28
4a55ce5
8b3ff60
862afa1
0b4d952
7795e99
693a4cc
b43531e
9025f43
c44c7d9
e2a6656
e6513d1
a18396f
81fe1a2
837645e
f13e132
942f628
66cb24d
ad12bd4
1bbbbbc
e63e403
8b97771
d34adeb
6d4fe73
c2d8693
b104782
deb5216
2f11cc7
04f1ac3
aa330b1
7dfd1ca
7c9da24
dd144d0
ad180d1
95d2264
b6380de
adb59ef
39c947d
2cc5830
96ec126
5ecebd9
815b2fe
fb90d23
4e42414
814e4ca
ba3930a
cb8a774
0f8a55b
70ce0fe
f512353
4be2b1f
61c7aa9
ab6222c
6bbb220
4580e3c
b992f44
4c040d3
5b7658c
bec5bce
c9054b7
1a48ab2
b16ae5f
176b24e
c107495
07f9140
4cbe237
38ff315
338f7e5
bcd1bb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,6 +68,10 @@ typedef struct _PyInterpreterFrame { | |
PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */ | ||
PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */ | ||
_Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ | ||
#ifdef Py_GIL_DISABLED | ||
Yhg1s marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/* Index of thread-local bytecode containing instr_ptr. */ | ||
int32_t tlbc_index; | ||
#endif | ||
_PyStackRef *stackpointer; | ||
uint16_t return_offset; /* Only relevant during a function call */ | ||
char owner; | ||
|
@@ -76,14 +80,27 @@ typedef struct _PyInterpreterFrame { | |
} _PyInterpreterFrame; | ||
|
||
#define _PyInterpreterFrame_LASTI(IF) \ | ||
((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF)))) | ||
((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF)))) | ||
|
||
// Return the frame's executable (f->f_executable) as a PyCodeObject *. | ||
// The value is read through PyStackRef_AsPyObjectBorrow and then cast; the | ||
// only type check is the assert below, so a build compiled without | ||
// assertions performs the cast unchecked. | ||
static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { | ||
PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable); | ||
assert(PyCode_Check(executable)); | ||
return (PyCodeObject *)executable; | ||
} | ||
|
||
static inline _Py_CODEUNIT * | ||
_PyFrame_GetBytecode(_PyInterpreterFrame *f) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You were storing the bytecode in the frame directly before, IIRC. Does this make things faster overall, or is it just more compact? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Yep. You suggested storing
|
||
{ | ||
#ifdef Py_GIL_DISABLED | ||
PyCodeObject *co = _PyFrame_GetCode(f); | ||
_PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc); | ||
assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size); | ||
return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index]; | ||
#else | ||
return _PyCode_CODE(_PyFrame_GetCode(f)); | ||
#endif | ||
} | ||
|
||
static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) { | ||
PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj); | ||
assert(PyFunction_Check(func)); | ||
|
@@ -144,13 +161,33 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame * | |
#endif | ||
} | ||
|
||
#ifdef Py_GIL_DISABLED | ||
// Set up the bytecode-related fields of `frame` for executing `code` on the | ||
// thread described by `tstate`. Exactly two fields are written here: | ||
// frame->instr_ptr and frame->tlbc_index. Only compiled when | ||
// Py_GIL_DISABLED is defined. | ||
static inline void | ||
_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame, | ||
PyCodeObject *code) | ||
{ | ||
// A NULL result is not treated as an error; it selects the fallback below. | ||
_Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code); | ||
if (tlbc == NULL) { | ||
// No thread-local bytecode exists for this thread yet; use the main | ||
// thread's copy, deferring thread-local bytecode creation to the | ||
// execution of RESUME. | ||
frame->instr_ptr = _PyCode_CODE(code); | ||
frame->tlbc_index = 0; | ||
} | ||
else { | ||
// Start at the bytecode returned above and record the index stored in | ||
// this thread's _PyThreadStateImpl. | ||
frame->instr_ptr = tlbc; | ||
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; | ||
} | ||
} | ||
#endif | ||
|
||
/* Consumes reference to func and locals. | ||
Does not initialize frame->previous, which happens | ||
when frame is linked into the frame stack. | ||
*/ | ||
static inline void | ||
_PyFrame_Initialize( | ||
_PyInterpreterFrame *frame, _PyStackRef func, | ||
PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The only purpose of passing the thread state is to initialize the tlbc index, IIUC. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The tlbc index is only present in free-threaded builds. To do this I think we'd need to either have separate versions of |
||
PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous) | ||
{ | ||
frame->previous = previous; | ||
|
@@ -162,7 +199,12 @@ _PyFrame_Initialize( | |
frame->f_locals = locals; | ||
frame->stackpointer = frame->localsplus + code->co_nlocalsplus; | ||
frame->frame_obj = NULL; | ||
#ifdef Py_GIL_DISABLED | ||
_PyFrame_InitializeTLBC(tstate, frame, code); | ||
#else | ||
(void)tstate; | ||
frame->instr_ptr = _PyCode_CODE(code); | ||
#endif | ||
frame->return_offset = 0; | ||
frame->owner = FRAME_OWNED_BY_THREAD; | ||
|
||
|
@@ -224,7 +266,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame) | |
return true; | ||
} | ||
return frame->owner != FRAME_OWNED_BY_GENERATOR && | ||
frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable; | ||
frame->instr_ptr < _PyFrame_GetBytecode(frame) + | ||
_PyFrame_GetCode(frame)->_co_firsttraceable; | ||
} | ||
|
||
static inline _PyInterpreterFrame * | ||
|
@@ -315,7 +358,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_ | |
_PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; | ||
tstate->datastack_top += code->co_framesize; | ||
assert(tstate->datastack_top < tstate->datastack_limit); | ||
_PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous); | ||
_PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from, | ||
previous); | ||
return new_frame; | ||
} | ||
|
||
|
@@ -339,7 +383,11 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int | |
assert(stackdepth <= code->co_stacksize); | ||
frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth; | ||
frame->frame_obj = NULL; | ||
#ifdef Py_GIL_DISABLED | ||
_PyFrame_InitializeTLBC(tstate, frame, code); | ||
#else | ||
frame->instr_ptr = _PyCode_CODE(code); | ||
#endif | ||
frame->owner = FRAME_OWNED_BY_THREAD; | ||
frame->return_offset = 0; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
#ifndef Py_INTERNAL_INDEX_POOL_H | ||
#define Py_INTERNAL_INDEX_POOL_H | ||
|
||
#include "Python.h" | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
#ifndef Py_BUILD_CORE | ||
# error "this header requires Py_BUILD_CORE define" | ||
#endif | ||
|
||
#ifdef Py_GIL_DISABLED | ||
|
||
// This contains code for allocating unique indices in an array. It is used by | ||
// the free-threaded build to assign each thread a globally unique index into | ||
// each code object's thread-local bytecode array. | ||
|
||
// A min-heap of indices | ||
typedef struct _PyIndexHeap { | ||
// Storage for the heap's items; `size` of them are in use and there is | ||
// room for `capacity`. | ||
int32_t *values; | ||
|
||
// Number of items stored in values | ||
Py_ssize_t size; | ||
|
||
// Maximum number of items that can be stored in values | ||
Py_ssize_t capacity; | ||
} _PyIndexHeap; | ||
|
||
// An unbounded pool of indices. Indices are allocated starting from 0. They | ||
// may be released back to the pool once they are no longer in use. | ||
typedef struct _PyIndexPool { | ||
// NOTE(review): presumably protects free_indices and next_index against | ||
// concurrent access; the locking code is not shown here - confirm against | ||
// the implementation. | ||
PyMutex mutex; | ||
|
||
// Min heap of indices available for allocation | ||
_PyIndexHeap free_indices; | ||
|
||
// Next index to allocate if no free indices are available | ||
int32_t next_index; | ||
} _PyIndexPool; | ||
|
||
// Allocate the smallest available index. Returns -1 on error. | ||
extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices); | ||
|
||
// Release `index` back to the pool | ||
extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index); | ||
|
||
// NOTE(review): presumably tears down the pool and releases any storage it | ||
// owns; the definition is not shown here - confirm against the implementation. | ||
extern void _PyIndexPool_Fini(_PyIndexPool *indices); | ||
|
||
#endif // Py_GIL_DISABLED | ||
|
||
#ifdef __cplusplus | ||
} | ||
#endif | ||
#endif // !Py_INTERNAL_INDEX_POOL_H |
Uh oh!
There was an error while loading. Please reload this page.