From 70e80991e4f90888eb828d33d4ef54d7c7ba8c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Tue, 8 Jul 2025 19:21:56 +0200 Subject: [PATCH 1/9] Add perf trampoline support for macOS --- Python/asm_trampoline.S | 13 +++++++++++++ Python/perf_jit_trampoline.c | 26 ++++++++++++++++++++++---- configure | 2 ++ configure.ac | 3 ++- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S index 616752459ba4d9..643d584b28e4de 100644 --- a/Python/asm_trampoline.S +++ b/Python/asm_trampoline.S @@ -1,5 +1,9 @@ .text +#if defined(__APPLE__) + .globl __Py_trampoline_func_start +#else .globl _Py_trampoline_func_start +#endif # The following assembly is equivalent to: # PyObject * # trampoline(PyThreadState *ts, _PyInterpreterFrame *f, @@ -7,7 +11,11 @@ # { # return evaluator(ts, f, throwflag); # } +#if defined(__APPLE__) +__Py_trampoline_func_start: +#else _Py_trampoline_func_start: +#endif #ifdef __x86_64__ #if defined(__CET__) && (__CET__ & 1) endbr64 @@ -34,9 +42,14 @@ _Py_trampoline_func_start: addi sp,sp,16 jr ra #endif +#if defined(__APPLE__) + .globl __Py_trampoline_func_end +__Py_trampoline_func_end: +#else .globl _Py_trampoline_func_end _Py_trampoline_func_end: .section .note.GNU-stack,"",@progbits +#endif # Note for indicating the assembly code supports CET #if defined(__x86_64__) && defined(__CET__) && (__CET__ & 1) .section .note.gnu.property,"a" diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 2ca18c23593547..f05779f8b84adb 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -66,7 +66,9 @@ #ifdef PY_HAVE_PERF_TRAMPOLINE /* Standard library includes for perf jitdump implementation */ -#include // ELF architecture constants +#if defined(__linux__) +# include // ELF architecture constants +#endif #include // File control operations #include // Standard I/O operations #include // Standard library functions @@ -74,7 +76,9 
@@ #include // System data types #include // System calls (sysconf, getpid) #include // Time functions (gettimeofday) -#include // System call interface +#if defined(__linux__) +# include // System call interface +#endif // ============================================================================= // CONSTANTS AND CONFIGURATION @@ -102,6 +106,16 @@ */ #define PERF_JIT_CODE_PADDING 0x100 + +/* These constants are defined inside , which we can't use outside of linux. */ +#if !defined(__linux__) +# define EM_386 3 +# define EM_X86_64 62 +# define EM_ARM 40 +# define EM_AARCH64 183 +# define EM_RISCV 243 +#endif + /* Convenient access to the global trampoline API state */ #define trampoline_api _PyRuntime.ceval.perf.trampoline_api @@ -195,7 +209,7 @@ struct BaseEvent { typedef struct { struct BaseEvent base; // Common event header uint32_t process_id; // Process ID where code was generated - uint32_t thread_id; // Thread ID where code was generated + uint64_t thread_id; // Thread ID where code was generated uint64_t vma; // Virtual memory address where code is loaded uint64_t code_address; // Address of the actual machine code uint64_t code_size; // Size of the machine code in bytes @@ -1166,7 +1180,11 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); +#if defined(__APPLE__) + pthread_threadid_np(NULL, &ev.thread_id); +#else ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call +#endif ev.vma = base; // Virtual memory address ev.code_address = base; // Same as VMA for our use case ev.code_size = size; @@ -1262,4 +1280,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { &perf_map_jit_fini, // Cleanup function }; -#endif /* PY_HAVE_PERF_TRAMPOLINE */ \ No newline at end of file +#endif /* PY_HAVE_PERF_TRAMPOLINE */ diff --git a/configure b/configure index 94a0b810333ce9..376860c8d10056 100755 --- 
a/configure +++ b/configure @@ -13815,6 +13815,8 @@ case $PLATFORM_TRIPLET in #( perf_trampoline=yes ;; #( aarch64-linux-gnu) : perf_trampoline=yes ;; #( + darwin) : + perf_trampoline=yes ;; #( *) : perf_trampoline=no ;; diff --git a/configure.ac b/configure.ac index ade71bc011eb87..05b5e5b4ce35d9 100644 --- a/configure.ac +++ b/configure.ac @@ -3690,12 +3690,13 @@ case "$ac_sys_system" in esac AC_MSG_RESULT([$SHLIBS]) -dnl perf trampoline is Linux specific and requires an arch-specific +dnl perf trampoline is Linux and macOS specific and requires an arch-specific dnl trampoline in assembly. AC_MSG_CHECKING([perf trampoline]) AS_CASE([$PLATFORM_TRIPLET], [x86_64-linux-gnu], [perf_trampoline=yes], [aarch64-linux-gnu], [perf_trampoline=yes], + [darwin], [perf_trampoline=yes], [perf_trampoline=no] ) AC_MSG_RESULT([$perf_trampoline]) From c496c67e4dd943079fe5d32dd1c451952cf5b140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 9 Jul 2025 00:11:37 +0200 Subject: [PATCH 2/9] Make sure that test_perfmaps.py test is not skipped on macOS --- Lib/test/test_perfmaps.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_perfmaps.py b/Lib/test/test_perfmaps.py index d4c6fe0124af18..647c32656abd6d 100644 --- a/Lib/test/test_perfmaps.py +++ b/Lib/test/test_perfmaps.py @@ -1,5 +1,5 @@ import os -import sys +import sysconfig import unittest try: @@ -7,10 +7,14 @@ except ImportError: raise unittest.SkipTest("requires _testinternalcapi") +def supports_trampoline_profiling(): + perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE") + if not perf_trampoline: + return False + return int(perf_trampoline) == 1 -if sys.platform != 'linux': - raise unittest.SkipTest('Linux only') - +if not supports_trampoline_profiling(): + raise unittest.SkipTest("perf trampoline profiling not supported") class TestPerfMapWriting(unittest.TestCase): def test_write_perf_map_entry(self): From 
dcd692871a900b63f5ef68c715ab7af2bcac2060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 9 Jul 2025 00:13:22 +0200 Subject: [PATCH 3/9] Update the docs for perfmaps to mention that macOS is supported --- Doc/c-api/perfmaps.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Doc/c-api/perfmaps.rst b/Doc/c-api/perfmaps.rst index 77b5e3c0876bbb..81fb5673f008aa 100644 --- a/Doc/c-api/perfmaps.rst +++ b/Doc/c-api/perfmaps.rst @@ -5,11 +5,12 @@ Support for Perf Maps ---------------------- -On supported platforms (as of this writing, only Linux), the runtime can take +On supported platforms (as of this writing, Linux and macOS), the runtime can take advantage of *perf map files* to make Python functions visible to an external -profiling tool (such as `perf <https://perf.wiki.kernel.org/index.php/Main_Page>`_). -A running process may create a file in the ``/tmp`` directory, which contains entries -that can map a section of executable code to a name. This interface is described in the +profiling tool (such as `perf <https://perf.wiki.kernel.org/index.php/Main_Page>`_ or +`samply <https://github.com/mstange/samply>`_). A running process may create a +file in the ``/tmp`` directory, which contains entries that can map a section +of executable code to a name. This interface is described in the `documentation of the Linux Perf tool <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jit-interface.txt>`_. From 9e1f9407ebc209a59b8a1af21326079b0875aefd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 9 Jul 2025 01:06:03 +0200 Subject: [PATCH 4/9] Add myself to Misc/ACKS --- Misc/ACKS | 1 + 1 file changed, 1 insertion(+) diff --git a/Misc/ACKS b/Misc/ACKS index d1490e1e46ccfd..0f72aeac1670d9 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -43,6 +43,7 @@ Ray Allen Billy G. 
Allie Jamiel Almeida Kevin Altis +Nazım Can Altınova Samy Lahfa Skyler Leigh Amador Joe Amenta From f6636273ed97755508685b7202904a300fab2dff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 9 Jul 2025 11:15:46 +0200 Subject: [PATCH 5/9] Add a Misc/NEWS.d entry --- .../2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst new file mode 100644 index 00000000000000..b74ff6b3b3347f --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst @@ -0,0 +1,3 @@ +Add support for perf trampoline on macOS, to allow profilers with JIT map +support to read Python calls. While profiling, ``PYTHONPERFSUPPORT=1`` can +be set to enable the trampoline. From a444cd3e3730d710a3e4853b3dc23532562c60b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 9 Jul 2025 23:03:55 +0200 Subject: [PATCH 6/9] Define constants per-platform --- Python/perf_jit_trampoline.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index f05779f8b84adb..2ff5a18f9c0647 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -109,11 +109,17 @@ /* These constants are defined inside , which we can't use outside of linux. 
*/ #if !defined(__linux__) -# define EM_386 3 -# define EM_X86_64 62 -# define EM_ARM 40 -# define EM_AARCH64 183 -# define EM_RISCV 243 +# if defined(__i386__) || defined(_M_IX86) +# define EM_386 3 +# elif defined(__arm__) || defined(_M_ARM) +# define EM_ARM 40 +# elif defined(__x86_64__) || defined(_M_X64) +# define EM_X86_64 62 +# elif defined(__aarch64__) +# define EM_AARCH64 183 +# elif defined(__riscv) +# define EM_RISCV 243 +# endif #endif /* Convenient access to the global trampoline API state */ From 7d84315e62ceadbf6c1f7b5755417a03068cc8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Wed, 9 Jul 2025 23:25:15 +0200 Subject: [PATCH 7/9] Do not mmap the jitdump file on macOS On macOS, we don't need to call mmap because samply has already detected the file path during the call to `open` before (it interposes `open` with a preloaded library), and because the mmap call can be slow. --- Python/perf_jit_trampoline.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 2ff5a18f9c0647..9312381d9bfee4 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -962,6 +962,10 @@ static void* perf_map_jit_init(void) { return NULL; // Failed to get page size } +#if defined(__APPLE__) + // On macOS, samply uses a preload to find jitdumps and this mmap can be slow. 
+ perf_jit_map_state.mapped_buffer = NULL; +#else /* * Map the first page of the jitdump file * @@ -984,6 +988,7 @@ static void* perf_map_jit_init(void) { close(fd); return NULL; // Memory mapping failed } +#endif perf_jit_map_state.mapped_size = page_size; From 057388de5ea50189ea7fb52ee05ea565cc6e3db4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Thu, 10 Jul 2025 15:11:47 +0200 Subject: [PATCH 8/9] Update the perf profiling doc to include samply --- Doc/howto/perf_profiling.rst | 51 +++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst index 96d757ac452b5e..62f6699612ebeb 100644 --- a/Doc/howto/perf_profiling.rst +++ b/Doc/howto/perf_profiling.rst @@ -2,34 +2,35 @@ .. _perf_profiling: -============================================== -Python support for the Linux ``perf`` profiler -============================================== +======================================================== +Python support for the ``perf map`` compatible profilers +======================================================== :author: Pablo Galindo -`The Linux perf profiler <https://perf.wiki.kernel.org>`_ -is a very powerful tool that allows you to profile and obtain -information about the performance of your application. -``perf`` also has a very vibrant ecosystem of tools -that aid with the analysis of the data that it produces. +`The Linux perf profiler <https://perf.wiki.kernel.org>`_ and +`samply <https://github.com/mstange/samply>`_ are powerful tools that allow you to +profile and obtain information about the performance of your application. +Both tools have vibrant ecosystems that aid with the analysis of the data they produce. 
-The main problem with using the ``perf`` profiler with Python applications is that -``perf`` only gets information about native symbols, that is, the names of +The main problem with using these profilers with Python applications is that +they only get information about native symbols, that is, the names of functions and procedures written in C. This means that the names and file names -of Python functions in your code will not appear in the output of ``perf``. +of Python functions in your code will not appear in the profiler output. Since Python 3.12, the interpreter can run in a special mode that allows Python -functions to appear in the output of the ``perf`` profiler. When this mode is +functions to appear in the output of compatible profilers. When this mode is enabled, the interpreter will interpose a small piece of code compiled on the -fly before the execution of every Python function and it will teach ``perf`` the +fly before the execution of every Python function and it will teach the profiler the relationship between this piece of code and the associated Python function using :doc:`perf map files <../c-api/perfmaps>`. .. note:: - Support for the ``perf`` profiler is currently only available for Linux on - select architectures. Check the output of the ``configure`` build step or + Support for profiling is available on Linux and macOS on select architectures. + ``perf`` is available on Linux, while ``samply`` can be used on both Linux and macOS. + ``samply`` support on macOS is available starting from Python 3.14. + Check the output of the ``configure`` build step or check the output of ``python -m sysconfig | grep HAVE_PERF_TRAMPOLINE`` to see if your system is supported. @@ -148,6 +149,26 @@ Instead, if we run the same experiment with ``perf`` support enabled we get: +Using ``samply`` profiler +------------------------- + +``samply`` is a modern profiler that can be used as an alternative to ``perf``. 
+It uses the same perf map files that Python generates, making it compatible +with Python's profiling support. ``samply`` is particularly useful on macOS +where ``perf`` is not available. + +To use ``samply`` with Python, first install it following the instructions at +https://github.com/mstange/samply, then run:: + + $ samply record PYTHONPERFSUPPORT=1 python my_script.py + +This will open a web interface where you can analyze the profiling data +interactively. The advantage of ``samply`` is that it provides a modern +web-based interface for analyzing profiling data and works on both Linux +and macOS. + +On macOS, ``samply`` support requires Python 3.14 or later. + How to enable ``perf`` profiling support ---------------------------------------- From 8b03dc13edf32bd81bf26d287de20a85a14b1f26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Naz=C4=B1m=20Can=20Alt=C4=B1nova?= Date: Thu, 10 Jul 2025 15:57:14 +0200 Subject: [PATCH 9/9] Add some tests for samply profiling --- Lib/test/test_samply_profiler.py | 244 +++++++++++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 Lib/test/test_samply_profiler.py diff --git a/Lib/test/test_samply_profiler.py b/Lib/test/test_samply_profiler.py new file mode 100644 index 00000000000000..ec0ed37ffd047b --- /dev/null +++ b/Lib/test/test_samply_profiler.py @@ -0,0 +1,244 @@ +import unittest +import subprocess +import sys +import sysconfig +import os +import pathlib +from test import support +from test.support.script_helper import ( + make_script, +) +from test.support.os_helper import temp_dir + + +if not support.has_subprocess_support: + raise unittest.SkipTest("test module requires subprocess") + +if support.check_sanitizer(address=True, memory=True, ub=True, function=True): + # gh-109580: Skip the test because it does crash randomly if Python is + # built with ASAN. 
+ raise unittest.SkipTest("test crash randomly on ASAN/MSAN/UBSAN build") + + +def supports_trampoline_profiling(): + perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE") + if not perf_trampoline: + return False + return int(perf_trampoline) == 1 + + +if not supports_trampoline_profiling(): + raise unittest.SkipTest("perf trampoline profiling not supported") + + +def samply_command_works(): + try: + cmd = ["samply", "--help"] + except (subprocess.SubprocessError, OSError): + return False + + # Check that we can run a simple samply run + with temp_dir() as script_dir: + try: + output_file = script_dir + "/profile.json.gz" + cmd = ( + "samply", + "record", + "--save-only", + "--output", + output_file, + sys.executable, + "-c", + 'print("hello")', + ) + env = {**os.environ, "PYTHON_JIT": "0"} + stdout = subprocess.check_output( + cmd, cwd=script_dir, text=True, stderr=subprocess.STDOUT, env=env + ) + except (subprocess.SubprocessError, OSError): + return False + + if "hello" not in stdout: + return False + + return True + + +def run_samply(cwd, *args, **env_vars): + env = os.environ.copy() + if env_vars: + env.update(env_vars) + env["PYTHON_JIT"] = "0" + output_file = cwd + "/profile.json.gz" + base_cmd = ( + "samply", + "record", + "--save-only", + "-o", output_file, + ) + proc = subprocess.run( + base_cmd + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) + if proc.returncode: + print(proc.stderr, file=sys.stderr) + raise ValueError(f"Samply failed with return code {proc.returncode}") + + import gzip + with gzip.open(output_file, mode="rt", encoding="utf-8") as f: + return f.read() + + +@unittest.skipUnless(samply_command_works(), "samply command doesn't work") +class TestSamplyProfilerMixin: + def run_samply(self, script_dir, perf_mode, script): + raise NotImplementedError() + + def test_python_calls_appear_in_the_stack_if_perf_activated(self): + with temp_dir() as script_dir: + code = """if 1: + def foo(n): + x = 0 + for i 
in range(n): + x += i + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + """ + script = make_script(script_dir, "perftest", code) + output = self.run_samply(script_dir, script) + + self.assertIn(f"py::foo:{script}", output) + self.assertIn(f"py::bar:{script}", output) + self.assertIn(f"py::baz:{script}", output) + + def test_python_calls_do_not_appear_in_the_stack_if_perf_deactivated(self): + with temp_dir() as script_dir: + code = """if 1: + def foo(n): + x = 0 + for i in range(n): + x += i + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + """ + script = make_script(script_dir, "perftest", code) + output = self.run_samply( + script_dir, script, activate_trampoline=False + ) + + self.assertNotIn(f"py::foo:{script}", output) + self.assertNotIn(f"py::bar:{script}", output) + self.assertNotIn(f"py::baz:{script}", output) + + +@unittest.skipUnless(samply_command_works(), "samply command doesn't work") +class TestSamplyProfiler(unittest.TestCase, TestSamplyProfilerMixin): + def run_samply(self, script_dir, script, activate_trampoline=True): + if activate_trampoline: + return run_samply(script_dir, sys.executable, "-Xperf", script) + return run_samply(script_dir, sys.executable, script) + + def setUp(self): + super().setUp() + self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map")) + + def tearDown(self) -> None: + super().tearDown() + files_to_delete = ( + set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files + ) + for file in files_to_delete: + file.unlink() + + def test_pre_fork_compile(self): + code = """if 1: + import sys + import os + import sysconfig + from _testinternalcapi import ( + compile_perf_trampoline_entry, + perf_trampoline_set_persist_after_fork, + ) + + def foo_fork(): + pass + + def bar_fork(): + foo_fork() + + def foo(): + import time; time.sleep(1) + + def bar(): + foo() + + def compile_trampolines_for_all_functions(): + perf_trampoline_set_persist_after_fork(1) + for _, obj in 
globals().items(): + if callable(obj) and hasattr(obj, '__code__'): + compile_perf_trampoline_entry(obj.__code__) + + if __name__ == "__main__": + compile_trampolines_for_all_functions() + pid = os.fork() + if pid == 0: + print(os.getpid()) + bar_fork() + else: + bar() + """ + + with temp_dir() as script_dir: + script = make_script(script_dir, "perftest", code) + env = {**os.environ, "PYTHON_JIT": "0"} + with subprocess.Popen( + [sys.executable, "-Xperf", script], + universal_newlines=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + env=env, + ) as process: + stdout, stderr = process.communicate() + + self.assertEqual(process.returncode, 0) + self.assertNotIn("Error:", stderr) + child_pid = int(stdout.strip()) + perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map") + perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map") + self.assertTrue(perf_file.exists()) + self.assertTrue(perf_child_file.exists()) + + perf_file_contents = perf_file.read_text() + self.assertIn(f"py::foo:{script}", perf_file_contents) + self.assertIn(f"py::bar:{script}", perf_file_contents) + self.assertIn(f"py::foo_fork:{script}", perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", perf_file_contents) + + child_perf_file_contents = perf_child_file.read_text() + self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents) + + # Pre-compiled perf-map entries of a forked process must be + # identical in both the parent and child perf-map files. + perf_file_lines = perf_file_contents.split("\n") + for line in perf_file_lines: + if f"py::foo_fork:{script}" in line or f"py::bar_fork:{script}" in line: + self.assertIn(line, child_perf_file_contents) + + +if __name__ == "__main__": + unittest.main()