diff --git a/Doc/c-api/perfmaps.rst b/Doc/c-api/perfmaps.rst index 77b5e3c0876bbb..81fb5673f008aa 100644 --- a/Doc/c-api/perfmaps.rst +++ b/Doc/c-api/perfmaps.rst @@ -5,11 +5,12 @@ Support for Perf Maps ---------------------- -On supported platforms (as of this writing, only Linux), the runtime can take +On supported platforms (as of this writing, Linux and macOS), the runtime can take advantage of *perf map files* to make Python functions visible to an external -profiling tool (such as `perf `_). -A running process may create a file in the ``/tmp`` directory, which contains entries -that can map a section of executable code to a name. This interface is described in the +profiling tool (such as `perf `_ or +`samply `_). A running process may create a +file in the ``/tmp`` directory, which contains entries that can map a section +of executable code to a name. This interface is described in the `documentation of the Linux Perf tool `_. diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst index 96d757ac452b5e..62f6699612ebeb 100644 --- a/Doc/howto/perf_profiling.rst +++ b/Doc/howto/perf_profiling.rst @@ -2,34 +2,35 @@ .. _perf_profiling: -============================================== -Python support for the Linux ``perf`` profiler -============================================== +======================================================== +Python support for the ``perf map`` compatible profilers +======================================================== :author: Pablo Galindo -`The Linux perf profiler `_ -is a very powerful tool that allows you to profile and obtain -information about the performance of your application. -``perf`` also has a very vibrant ecosystem of tools -that aid with the analysis of the data that it produces. +`The Linux perf profiler `_ and +`samply `_ are powerful tools that allow you to +profile and obtain information about the performance of your application. 
+Both tools have vibrant ecosystems that aid with the analysis of the data they produce. -The main problem with using the ``perf`` profiler with Python applications is that -``perf`` only gets information about native symbols, that is, the names of +The main problem with using these profilers with Python applications is that +they only get information about native symbols, that is, the names of functions and procedures written in C. This means that the names and file names -of Python functions in your code will not appear in the output of ``perf``. +of Python functions in your code will not appear in the profiler output. Since Python 3.12, the interpreter can run in a special mode that allows Python -functions to appear in the output of the ``perf`` profiler. When this mode is +functions to appear in the output of compatible profilers. When this mode is enabled, the interpreter will interpose a small piece of code compiled on the -fly before the execution of every Python function and it will teach ``perf`` the +fly before the execution of every Python function and it will teach the profiler the relationship between this piece of code and the associated Python function using :doc:`perf map files <../c-api/perfmaps>`. .. note:: - Support for the ``perf`` profiler is currently only available for Linux on - select architectures. Check the output of the ``configure`` build step or + Support for profiling is available on Linux and macOS on select architectures. + ``perf`` is available on Linux, while ``samply`` can be used on both Linux and macOS. + ``samply`` support on macOS is available starting from Python 3.14. + Check the output of the ``configure`` build step or check the output of ``python -m sysconfig | grep HAVE_PERF_TRAMPOLINE`` to see if your system is supported. 
@@ -148,6 +149,26 @@ Instead, if we run the same experiment with ``perf`` support enabled we get: +Using ``samply`` profiler +------------------------- + +``samply`` is a modern profiler that can be used as an alternative to ``perf``. +It uses the same perf map files that Python generates, making it compatible +with Python's profiling support. ``samply`` is particularly useful on macOS +where ``perf`` is not available. + +To use ``samply`` with Python, first install it following the instructions at +https://github.com/mstange/samply, then run:: + + $ samply record PYTHONPERFSUPPORT=1 python my_script.py + +This will open a web interface where you can analyze the profiling data +interactively. The advantage of ``samply`` is that it provides a modern +web-based interface for analyzing profiling data and works on both Linux +and macOS. + +On macOS, ``samply`` support requires Python 3.14 or later. + How to enable ``perf`` profiling support ---------------------------------------- diff --git a/Lib/test/test_perfmaps.py b/Lib/test/test_perfmaps.py index d4c6fe0124af18..647c32656abd6d 100644 --- a/Lib/test/test_perfmaps.py +++ b/Lib/test/test_perfmaps.py @@ -1,5 +1,5 @@ import os -import sys +import sysconfig import unittest try: @@ -7,10 +7,14 @@ except ImportError: raise unittest.SkipTest("requires _testinternalcapi") +def supports_trampoline_profiling(): + perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE") + if not perf_trampoline: + return False + return int(perf_trampoline) == 1 -if sys.platform != 'linux': - raise unittest.SkipTest('Linux only') - +if not supports_trampoline_profiling(): + raise unittest.SkipTest("perf trampoline profiling not supported") class TestPerfMapWriting(unittest.TestCase): def test_write_perf_map_entry(self): diff --git a/Lib/test/test_samply_profiler.py b/Lib/test/test_samply_profiler.py new file mode 100644 index 00000000000000..ec0ed37ffd047b --- /dev/null +++ b/Lib/test/test_samply_profiler.py @@ -0,0 +1,244 @@ 
+import unittest +import subprocess +import sys +import sysconfig +import os +import pathlib +from test import support +from test.support.script_helper import ( + make_script, +) +from test.support.os_helper import temp_dir + + +if not support.has_subprocess_support: + raise unittest.SkipTest("test module requires subprocess") + +if support.check_sanitizer(address=True, memory=True, ub=True, function=True): + # gh-109580: Skip the test because it does crash randomly if Python is + # built with ASAN. + raise unittest.SkipTest("test crash randomly on ASAN/MSAN/UBSAN build") + + +def supports_trampoline_profiling(): + perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE") + if not perf_trampoline: + return False + return int(perf_trampoline) == 1 + + +if not supports_trampoline_profiling(): + raise unittest.SkipTest("perf trampoline profiling not supported") + + +def samply_command_works(): + try: + subprocess.check_output(["samply", "--help"], stderr=subprocess.STDOUT) + except (subprocess.SubprocessError, OSError): + return False + + # Check that we can run a simple samply run + with temp_dir() as script_dir: + try: + output_file = script_dir + "/profile.json.gz" + cmd = ( + "samply", + "record", + "--save-only", + "--output", + output_file, + sys.executable, + "-c", + 'print("hello")', + ) + env = {**os.environ, "PYTHON_JIT": "0"} + stdout = subprocess.check_output( + cmd, cwd=script_dir, text=True, stderr=subprocess.STDOUT, env=env + ) + except (subprocess.SubprocessError, OSError): + return False + + if "hello" not in stdout: + return False + + return True + + +def run_samply(cwd, *args, **env_vars): + env = os.environ.copy() + if env_vars: + env.update(env_vars) + env["PYTHON_JIT"] = "0" + output_file = cwd + "/profile.json.gz" + base_cmd = ( + "samply", + "record", + "--save-only", + "-o", output_file, + ) + proc = subprocess.run( + base_cmd + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) + if proc.returncode: + print(proc.stderr, file=sys.stderr) + raise
ValueError(f"Samply failed with return code {proc.returncode}") + + import gzip + with gzip.open(output_file, mode="rt", encoding="utf-8") as f: + return f.read() + + +@unittest.skipUnless(samply_command_works(), "samply command doesn't work") +class TestSamplyProfilerMixin: + def run_samply(self, script_dir, script, activate_trampoline=True): + raise NotImplementedError() + + def test_python_calls_appear_in_the_stack_if_perf_activated(self): + with temp_dir() as script_dir: + code = """if 1: + def foo(n): + x = 0 + for i in range(n): + x += i + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + """ + script = make_script(script_dir, "perftest", code) + output = self.run_samply(script_dir, script) + + self.assertIn(f"py::foo:{script}", output) + self.assertIn(f"py::bar:{script}", output) + self.assertIn(f"py::baz:{script}", output) + + def test_python_calls_do_not_appear_in_the_stack_if_perf_deactivated(self): + with temp_dir() as script_dir: + code = """if 1: + def foo(n): + x = 0 + for i in range(n): + x += i + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + """ + script = make_script(script_dir, "perftest", code) + output = self.run_samply( + script_dir, script, activate_trampoline=False + ) + + self.assertNotIn(f"py::foo:{script}", output) + self.assertNotIn(f"py::bar:{script}", output) + self.assertNotIn(f"py::baz:{script}", output) + + +@unittest.skipUnless(samply_command_works(), "samply command doesn't work") +class TestSamplyProfiler(unittest.TestCase, TestSamplyProfilerMixin): + def run_samply(self, script_dir, script, activate_trampoline=True): + if activate_trampoline: + return run_samply(script_dir, sys.executable, "-Xperf", script) + return run_samply(script_dir, sys.executable, script) + + def setUp(self): + super().setUp() + self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map")) + + def tearDown(self) -> None: + super().tearDown() + files_to_delete = ( + set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files +
) + for file in files_to_delete: + file.unlink() + + def test_pre_fork_compile(self): + code = """if 1: + import sys + import os + import sysconfig + from _testinternalcapi import ( + compile_perf_trampoline_entry, + perf_trampoline_set_persist_after_fork, + ) + + def foo_fork(): + pass + + def bar_fork(): + foo_fork() + + def foo(): + import time; time.sleep(1) + + def bar(): + foo() + + def compile_trampolines_for_all_functions(): + perf_trampoline_set_persist_after_fork(1) + for _, obj in globals().items(): + if callable(obj) and hasattr(obj, '__code__'): + compile_perf_trampoline_entry(obj.__code__) + + if __name__ == "__main__": + compile_trampolines_for_all_functions() + pid = os.fork() + if pid == 0: + print(os.getpid()) + bar_fork() + else: + bar() + """ + + with temp_dir() as script_dir: + script = make_script(script_dir, "perftest", code) + env = {**os.environ, "PYTHON_JIT": "0"} + with subprocess.Popen( + [sys.executable, "-Xperf", script], + universal_newlines=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + env=env, + ) as process: + stdout, stderr = process.communicate() + + self.assertEqual(process.returncode, 0) + self.assertNotIn("Error:", stderr) + child_pid = int(stdout.strip()) + perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map") + perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map") + self.assertTrue(perf_file.exists()) + self.assertTrue(perf_child_file.exists()) + + perf_file_contents = perf_file.read_text() + self.assertIn(f"py::foo:{script}", perf_file_contents) + self.assertIn(f"py::bar:{script}", perf_file_contents) + self.assertIn(f"py::foo_fork:{script}", perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", perf_file_contents) + + child_perf_file_contents = perf_child_file.read_text() + self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents) + + # Pre-compiled perf-map entries of a forked process must be + # identical in both 
the parent and child perf-map files. + perf_file_lines = perf_file_contents.split("\n") + for line in perf_file_lines: + if f"py::foo_fork:{script}" in line or f"py::bar_fork:{script}" in line: + self.assertIn(line, child_perf_file_contents) + + +if __name__ == "__main__": + unittest.main() diff --git a/Misc/ACKS b/Misc/ACKS index d1490e1e46ccfd..0f72aeac1670d9 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -43,6 +43,7 @@ Ray Allen Billy G. Allie Jamiel Almeida Kevin Altis +Nazım Can Altınova Samy Lahfa Skyler Leigh Amador Joe Amenta diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst new file mode 100644 index 00000000000000..b74ff6b3b3347f --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst @@ -0,0 +1,3 @@ +Add support for perf trampoline on macOS, to allow profilers with JIT map +support to read Python calls. While profiling, ``PYTHONPERFSUPPORT=1`` can +be appended to enable the trampoline.
diff --git a/Python/asm_trampoline.S b/Python/asm_trampoline.S index 616752459ba4d9..643d584b28e4de 100644 --- a/Python/asm_trampoline.S +++ b/Python/asm_trampoline.S @@ -1,5 +1,9 @@ .text +#if defined(__APPLE__) + .globl __Py_trampoline_func_start +#else .globl _Py_trampoline_func_start +#endif # The following assembly is equivalent to: # PyObject * # trampoline(PyThreadState *ts, _PyInterpreterFrame *f, @@ -7,7 +11,11 @@ # { # return evaluator(ts, f, throwflag); # } +#if defined(__APPLE__) +__Py_trampoline_func_start: +#else _Py_trampoline_func_start: +#endif #ifdef __x86_64__ #if defined(__CET__) && (__CET__ & 1) endbr64 @@ -34,9 +42,14 @@ _Py_trampoline_func_start: addi sp,sp,16 jr ra #endif +#if defined(__APPLE__) + .globl __Py_trampoline_func_end +__Py_trampoline_func_end: +#else .globl _Py_trampoline_func_end _Py_trampoline_func_end: .section .note.GNU-stack,"",@progbits +#endif # Note for indicating the assembly code supports CET #if defined(__x86_64__) && defined(__CET__) && (__CET__ & 1) .section .note.gnu.property,"a" diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 2ca18c23593547..9312381d9bfee4 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -66,7 +66,9 @@ #ifdef PY_HAVE_PERF_TRAMPOLINE /* Standard library includes for perf jitdump implementation */ -#include // ELF architecture constants +#if defined(__linux__) +# include // ELF architecture constants +#endif #include // File control operations #include // Standard I/O operations #include // Standard library functions @@ -74,7 +76,9 @@ #include // System data types #include // System calls (sysconf, getpid) #include // Time functions (gettimeofday) -#include // System call interface +#if defined(__linux__) +# include // System call interface +#endif // ============================================================================= // CONSTANTS AND CONFIGURATION @@ -102,6 +106,22 @@ */ #define PERF_JIT_CODE_PADDING 0x100 + +/* These constants 
are defined inside , which we can't use outside of linux. */ +#if !defined(__linux__) +# if defined(__i386__) || defined(_M_IX86) +# define EM_386 3 +# elif defined(__arm__) || defined(_M_ARM) +# define EM_ARM 40 +# elif defined(__x86_64__) || defined(_M_X64) +# define EM_X86_64 62 +# elif defined(__aarch64__) +# define EM_AARCH64 183 +# elif defined(__riscv) +# define EM_RISCV 243 +# endif +#endif + /* Convenient access to the global trampoline API state */ #define trampoline_api _PyRuntime.ceval.perf.trampoline_api @@ -195,7 +215,12 @@ struct BaseEvent { typedef struct { struct BaseEvent base; // Common event header uint32_t process_id; // Process ID where code was generated + // jitdump records carry a 32-bit tid; only macOS needs a 64-bit thread id. +#if defined(__APPLE__) + uint64_t thread_id; // Thread ID where code was generated +#else uint32_t thread_id; // Thread ID where code was generated +#endif uint64_t vma; // Virtual memory address where code is loaded uint64_t code_address; // Address of the actual machine code uint64_t code_size; // Size of the machine code in bytes @@ -942,6 +967,10 @@ static void* perf_map_jit_init(void) { return NULL; // Failed to get page size } +#if defined(__APPLE__) + // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
+ perf_jit_map_state.mapped_buffer = NULL; +#else /* * Map the first page of the jitdump file * @@ -964,6 +988,7 @@ static void* perf_map_jit_init(void) { close(fd); return NULL; // Memory mapping failed } +#endif perf_jit_map_state.mapped_size = page_size; @@ -1166,7 +1191,11 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, ev.base.size = sizeof(ev) + (name_length+1) + size; ev.base.time_stamp = get_current_monotonic_ticks(); ev.process_id = getpid(); +#if defined(__APPLE__) + pthread_threadid_np(NULL, &ev.thread_id); +#else ev.thread_id = syscall(SYS_gettid); // Get thread ID via system call +#endif ev.vma = base; // Virtual memory address ev.code_address = base; // Same as VMA for our use case ev.code_size = size; @@ -1262,4 +1291,4 @@ _PyPerf_Callbacks _Py_perfmap_jit_callbacks = { &perf_map_jit_fini, // Cleanup function }; -#endif /* PY_HAVE_PERF_TRAMPOLINE */ \ No newline at end of file +#endif /* PY_HAVE_PERF_TRAMPOLINE */ diff --git a/configure b/configure index 94a0b810333ce9..376860c8d10056 100755 --- a/configure +++ b/configure @@ -13815,6 +13815,8 @@ case $PLATFORM_TRIPLET in #( perf_trampoline=yes ;; #( aarch64-linux-gnu) : perf_trampoline=yes ;; #( + darwin) : + perf_trampoline=yes ;; #( *) : perf_trampoline=no ;; diff --git a/configure.ac b/configure.ac index ade71bc011eb87..05b5e5b4ce35d9 100644 --- a/configure.ac +++ b/configure.ac @@ -3690,12 +3690,13 @@ case "$ac_sys_system" in esac AC_MSG_RESULT([$SHLIBS]) -dnl perf trampoline is Linux specific and requires an arch-specific +dnl perf trampoline is Linux and macOS specific and requires an arch-specific dnl trampoline in assembly. AC_MSG_CHECKING([perf trampoline]) AS_CASE([$PLATFORM_TRIPLET], [x86_64-linux-gnu], [perf_trampoline=yes], [aarch64-linux-gnu], [perf_trampoline=yes], + [darwin], [perf_trampoline=yes], [perf_trampoline=no] ) AC_MSG_RESULT([$perf_trampoline])