Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mypy_self_check.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ pretty = True
always_false = MYPYC
plugins = mypy.plugins.proper_plugin
python_version = 3.9
exclude = mypy/typeshed/|mypyc/test-data/|mypyc/lib-rt/
exclude = mypy/typeshed/|mypyc/test-data/
enable_error_code = ignore-without-code,redundant-expr
enable_incomplete_feature = PreciseTupleTypes
show_error_code_links = True
Expand Down
17 changes: 9 additions & 8 deletions mypyc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, NamedTuple, NoReturn, Union, cast

import mypyc.build_setup # noqa: F401
from mypy.build import BuildSource
from mypy.errors import CompileError
from mypy.fscache import FileSystemCache
Expand All @@ -36,7 +37,7 @@
from mypy.util import write_junit_xml
from mypyc.annotate import generate_annotated_html
from mypyc.codegen import emitmodule
from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, X86_64, shared_lib_name
from mypyc.common import IS_FREE_THREADED, RUNTIME_C_FILES, shared_lib_name
from mypyc.errors import Errors
from mypyc.ir.pprint import format_modules
from mypyc.namegen import exported_name
Expand Down Expand Up @@ -70,6 +71,13 @@ class ModDesc(NamedTuple):
"base64/arch/neon64/codec.c",
],
[
"base64/arch/avx/enc_loop_asm.c",
"base64/arch/avx2/enc_loop.c",
"base64/arch/avx2/enc_loop_asm.c",
"base64/arch/avx2/enc_reshuffle.c",
"base64/arch/avx2/enc_translate.c",
"base64/arch/avx2/dec_loop.c",
"base64/arch/avx2/dec_reshuffle.c",
"base64/arch/generic/32/enc_loop.c",
"base64/arch/generic/64/enc_loop.c",
"base64/arch/generic/32/dec_loop.c",
Expand Down Expand Up @@ -661,9 +669,6 @@ def mypycify(
# See https://github.com/mypyc/mypyc/issues/956
"-Wno-cpp",
]
if X86_64:
# Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2.
cflags.append("-msse4.2")
if log_trace:
cflags.append("-DMYPYC_LOG_TRACE")
if experimental_features:
Expand Down Expand Up @@ -692,10 +697,6 @@ def mypycify(
# that we actually get the compilation speed and memory
# use wins that multi-file mode is intended for.
cflags += ["/GL-", "/wd9025"] # warning about overriding /GL
if X86_64:
# Enable SIMD extensions. All CPUs released since ~2010 support SSE4.2.
# Also Windows 11 requires SSE4.2 since 24H2.
cflags.append("/arch:SSE4.2")
if log_trace:
cflags.append("/DMYPYC_LOG_TRACE")
if experimental_features:
Expand Down
63 changes: 63 additions & 0 deletions mypyc/build_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import platform
import sys

try:
# Import setuptools so that it monkey-patch overrides distutils
import setuptools # noqa: F401
except ImportError:
pass

if sys.version_info >= (3, 12):
# From setuptools' monkeypatch
from distutils import ccompiler # type: ignore[import-not-found]
else:
from distutils import ccompiler

EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT = {
"unix": {
"base64/arch/ssse3": ["-mssse3", "-DBASE64_WITH_SSSE3"],
"base64/arch/sse41": ["-msse4.1", "-DBASE64_WITH_SSE41"],
"base64/arch/sse42": ["-msse4.2", "-DBASE64_WITH_SSE42"],
"base64/arch/avx2": ["-mavx2", "-DBASE64_WITH_AVX2"],
"base64/arch/avx": ["-mavx", "-DBASE64_WITH_AVX"],
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these BASE64_WITH... #defines need to be enabled in all files. Otherwise the codec choosing code doesn't get triggered (which happens in codec_choose.c). With these changes we compile the SIMD versions, but I don't think they will be used at runtime. I ran a microbenchmark and performance was slower on an AMD system with this PR.

Here's the benchmark I used (added it to run-base64.test temporarily):

[case testXXX_librt_experimental]
import time
from librt.base64 import b64encode

a = b"foo"
b = a * 10000

def bench1(b: bytes, n: int) -> None:
    for i in range(n):
        b64encode(b)

bench1(b, 1000000)  # Warmup

t0 = time.time()
n = 1000 * 200
bench1(b, n)
td = time.time() - t0
print(len(b) * n / td / 1024 / 1024, "MB/s")

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah you need to force optimization level to 3 to get meaningful benchmark results (e.g. patch mypyc.build.mypycify and force opt_level to be 3).

Copy link
Copy Markdown
Contributor Author

@mr-c mr-c Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JukkaL Thanks for looking into this. My laptop died this morning, so feel free to push additional fixes to my branch

Copy link
Copy Markdown
Contributor Author

@mr-c mr-c Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: In the lib-rt setup.py the 3rd optimization level is enabled

mypy/mypyc/lib-rt/setup.py

Lines 130 to 131 in 379cd1e

if compiler.compiler_type == "unix": # type: ignore[attr-defined]
cflags += ["-O3"]

I'm surprised you had an issue with mypycify, as 3 is the default level

opt_level: str = "3",

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these BASE64_WITH... #defines need to be enabled in all files. Otherwise the codec choosing code doesn't get triggered (which happens in codec_choose.c).

Okay, I agree that for X86-64, all the HAVE_* definitions should always be enabled.

I guess the easiest way is to edit mypyc/lib-rt/base64/config.h to set those flags inside a #if defined(__x86_64__) && defined(__LP64__) check and trim the above flags to just setting -mavx2 and similar.

Copy link
Copy Markdown
Contributor Author

@mr-c mr-c Nov 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm back; I got a new laptop charger :-D

Thank you for the micro benchmark, it helps a lot!

FYI: In the lib-rt setup.py the 3rd optimization level is enabled

mypy/mypyc/lib-rt/setup.py

Lines 130 to 131 in 379cd1e

if compiler.compiler_type == "unix": # type: ignore[attr-defined]
cflags += ["-O3"]

I'm surprised you had an issue with mypycify, as 3 is the default level

opt_level: str = "3",

Ah, it got overridden by this line, so setting MYPYC_OPT_LEVEL=3 pytest -n0 -vvv -s mypyc -k testXXX_librt_experimental is easier than patching

opt_level = int(os.environ.get("MYPYC_OPT_LEVEL", 0))

I've added a commit to set the HAVE_{SSSE3,SSE41,SSE42,AVX,AVX2} flags automatically for amd64/x86-64 systems, removing the need for the BASE64_WITH_* definitions at compile time.

The baseline speed on my system using your benchmark was 9,089 MB/s before my changes; now it is 14,461 MB/s. It also showed that all the -mavx2 -mavx flags were also being added to the final linking stage, which is obviously not appropriate:

INFO root:spawn.py:77 gcc -shared -L/home/mi/crusoe/.pyenv/versions/3.13.2/lib -Wl,-rpath,/home/mi/crusoe/.pyenv/versions/3.13.2/lib -L/home/mi/crusoe/.pyenv/versions/3.13.2/lib -Wl,-rpath,/home/mi/crusoe/.pyenv/versions/3.13.2/lib build/temp.linux-x86_64-cpython-313/build/base64/arch/avx/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/avx2/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/avx512/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/generic/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/neon32/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/neon64/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/sse41/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/sse42/codec.o build/temp.linux-x86_64-cpython-313/build/base64/arch/ssse3/codec.o build/temp.linux-x86_64-cpython-313/build/base64/codec_choose.o build/temp.linux-x86_64-cpython-313/build/base64/lib.o build/temp.linux-x86_64-cpython-313/build/base64/tables/tables.o build/temp.linux-x86_64-cpython-313/build/bytes_ops.o build/temp.linux-x86_64-cpython-313/build/dict_ops.o build/temp.linux-x86_64-cpython-313/build/exc_ops.o build/temp.linux-x86_64-cpython-313/build/float_ops.o build/temp.linux-x86_64-cpython-313/build/generic_ops.o build/temp.linux-x86_64-cpython-313/build/getargs.o build/temp.linux-x86_64-cpython-313/build/getargsfast.o build/temp.linux-x86_64-cpython-313/build/init.o build/temp.linux-x86_64-cpython-313/build/int_ops.o build/temp.linux-x86_64-cpython-313/build/librt_base64.o build/temp.linux-x86_64-cpython-313/build/list_ops.o build/temp.linux-x86_64-cpython-313/build/misc_ops.o build/temp.linux-x86_64-cpython-313/build/pythonsupport.o build/temp.linux-x86_64-cpython-313/build/set_ops.o build/temp.linux-x86_64-cpython-313/build/str_ops.o build/temp.linux-x86_64-cpython-313/build/tuple_ops.o -L/home/mi/crusoe/.pyenv/versions/3.13.2/lib -o 
build/lib.linux-x86_64-cpython-313/librt/base64.cpython-313-x86_64-linux-gnu.so -mssse3 -msse4.2 -msse4.1 -mavx -mavx2 -mavx

So the next commit limits the matches to when the term ends in .c. The new speed is 14,242 MB/s, a 57% improvement from the baseline (before this PR).

},
"msvc": {
"base64/arch/sse42": ["/arch:SSE4.2", "/DBASE64_WITH_SSE42"],
"base64/arch/avx2": ["/arch:AVX2", "/DBASE64_WITH_AVX2"],
"base64/arch/avx": ["/arch:AVX", "/DBASE64_WITH_AVX"],
},
}

ccompiler.CCompiler.__spawn = ccompiler.CCompiler.spawn # type: ignore[attr-defined]
X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64")


def spawn(self, cmd, **kwargs) -> None: # type: ignore[no-untyped-def]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reason why not to annotate this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I tried annotating this before, but the signature varies too much between setuptools/distutils versions and Python versions.

compiler_type: str = self.compiler_type
extra_options = EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT[compiler_type]
new_cmd = list(cmd)
if X86_64 and extra_options is not None:
# filenames are closer to the end of command line
for argument in reversed(new_cmd):
# Check if argument contains a filename. We must check for all
# possible extensions; checking for target extension is faster.
if self.obj_extension and not str(argument).endswith(self.obj_extension):
continue

for path in extra_options.keys():
if path in str(argument):
if compiler_type == "bcpp":
compiler = new_cmd.pop()
# Borland accepts a source file name at the end,
# insert the options before it
new_cmd.extend(extra_options[path])
new_cmd.append(compiler)
else:
new_cmd.extend(extra_options[path])

# path component is found, no need to search any further
break
self.__spawn(new_cmd, **kwargs)


ccompiler.CCompiler.spawn = spawn # type: ignore[method-assign]
3 changes: 0 additions & 3 deletions mypyc/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import platform
import sys
import sysconfig
from typing import Any, Final
Expand Down Expand Up @@ -45,8 +44,6 @@

IS_32_BIT_PLATFORM: Final = int(SIZEOF_SIZE_T) == 4

X86_64: Final = platform.machine() in ("x86_64", "AMD64", "amd64")

PLATFORM_SIZE = 4 if IS_32_BIT_PLATFORM else 8

# Maximum value for a short tagged integer.
Expand Down
2 changes: 1 addition & 1 deletion mypyc/lib-rt/base64/arch/avx/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include "../ssse3/dec_loop.c"

#if BASE64_AVX_USE_ASM
# include "enc_loop_asm.c"
# include "./enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
Expand Down
12 changes: 6 additions & 6 deletions mypyc/lib-rt/base64/arch/avx2/codec.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
# endif
#endif

#include "dec_reshuffle.c"
#include "dec_loop.c"
#include "./dec_reshuffle.c"
#include "./dec_loop.c"

#if BASE64_AVX2_USE_ASM
# include "enc_loop_asm.c"
# include "./enc_loop_asm.c"
#else
# include "enc_translate.c"
# include "enc_reshuffle.c"
# include "enc_loop.c"
# include "./enc_translate.c"
# include "./enc_reshuffle.c"
# include "./enc_loop.c"
#endif

#endif // HAVE_AVX2
Expand Down
11 changes: 0 additions & 11 deletions mypyc/lib-rt/base64/config.h
Original file line number Diff line number Diff line change
@@ -1,27 +1,16 @@
#ifndef BASE64_CONFIG_H
#define BASE64_CONFIG_H

#define BASE64_WITH_SSSE3 0
#define HAVE_SSSE3 BASE64_WITH_SSSE3

#define BASE64_WITH_SSE41 0
#define HAVE_SSE41 BASE64_WITH_SSE41

#if defined(__x86_64__) || defined(_M_X64)
#define BASE64_WITH_SSE42 1
#else
#define BASE64_WITH_SSE42 0
#endif

#define HAVE_SSE42 BASE64_WITH_SSE42

#define BASE64_WITH_AVX 0
#define HAVE_AVX BASE64_WITH_AVX

#define BASE64_WITH_AVX2 0
#define HAVE_AVX2 BASE64_WITH_AVX2

#define BASE64_WITH_AVX512 0
#define HAVE_AVX512 BASE64_WITH_AVX512

#define BASE64_WITH_NEON32 0
Expand Down
55 changes: 49 additions & 6 deletions mypyc/lib-rt/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,56 @@
"pythonsupport.c",
]

# Extra compiler flags to append to a compiler command line, keyed first by
# the compiler's ``compiler_type`` and then by a path fragment. The patched
# spawn() defined later in this file appends the flag list of the first
# fragment that occurs as a substring of a matching command-line argument.
#
# Key order matters: spawn() walks the fragments in insertion order and stops
# at the first substring match, and "base64/arch/avx" is itself a substring of
# "base64/arch/avx2", so the "avx2" entry must stay ahead of the "avx" entry.
#
# NOTE(review): spawn() looks this table up with ``[compiler_type]``, so a
# compiler type without an entry here would raise KeyError instead of
# producing None -- confirm whether other compiler types can reach spawn().
EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT = {
# Flags in GCC/Clang-style "-m..." / "-D..." spelling.
"unix": {
"base64/arch/ssse3": ["-mssse3", "-DBASE64_WITH_SSSE3"],
"base64/arch/sse41": ["-msse4.1", "-DBASE64_WITH_SSE41"],
"base64/arch/sse42": ["-msse4.2", "-DBASE64_WITH_SSE42"],
"base64/arch/avx2": ["-mavx2", "-DBASE64_WITH_AVX2"],
"base64/arch/avx": ["-mavx", "-DBASE64_WITH_AVX"],
},
# Flags in MSVC-style "/arch:..." / "/D..." spelling; only the sse42, avx2
# and avx directories have entries for this compiler type.
"msvc": {
"base64/arch/sse42": ["/arch:SSE4.2", "/DBASE64_WITH_SSE42"],
"base64/arch/avx2": ["/arch:AVX2", "/DBASE64_WITH_AVX2"],
"base64/arch/avx": ["/arch:AVX", "/DBASE64_WITH_AVX"],
},
}

# Keep a reference to the original CCompiler.spawn so that the replacement
# spawn() can delegate to it via ``self.__spawn`` after rewriting the command.
ccompiler.CCompiler.__spawn = ccompiler.CCompiler.spawn # type: ignore[attr-defined]
# True when platform.machine() reports one of the listed x86-64 names; the
# replacement spawn() only adds the extra flags when this is set.
X86_64 = platform.machine() in ("x86_64", "AMD64", "amd64")


def spawn(self, cmd, **kwargs) -> None: # type: ignore[no-untyped-def]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks similar if not the same, any particular reason why not have it in a shared location, or is this because of the fact these are setup files?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the latter; the existing code also has duplication issues, as already noted

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that there is more duplication, we can try to share more of the code, but that can happen outside this PR.

compiler_type: str = self.compiler_type
extra_options = EXTRA_FLAGS_PER_COMPILER_TYPE_PER_PATH_COMPONENT[compiler_type]
new_cmd = list(cmd)
if X86_64 and extra_options is not None:
# filenames are closer to the end of command line
for argument in reversed(new_cmd):
# Check if argument contains a filename. We must check for all
# possible extensions; checking for target extension is faster.
if self.obj_extension and not str(argument).endswith(self.obj_extension):
continue

for path in extra_options.keys():
if path in str(argument):
if compiler_type == "bcpp":
compiler = new_cmd.pop()
# Borland accepts a source file name at the end,
# insert the options before it
new_cmd.extend(extra_options[path])
new_cmd.append(compiler)
else:
new_cmd.extend(extra_options[path])

# path component is found, no need to search any further
break
self.__spawn(new_cmd, **kwargs)


ccompiler.CCompiler.spawn = spawn # type: ignore[method-assign]


class BuildExtGtest(build_ext):
def get_library_names(self) -> list[str]:
return ["gtest"]
Expand Down Expand Up @@ -80,14 +127,10 @@ def run(self) -> None:
compiler = ccompiler.new_compiler()
sysconfig.customize_compiler(compiler)
cflags: list[str] = []
if compiler.compiler_type == "unix":
if compiler.compiler_type == "unix": # type: ignore[attr-defined]
cflags += ["-O3"]
if X86_64:
cflags.append("-msse4.2") # Enable SIMD (see also mypyc/build.py)
elif compiler.compiler_type == "msvc":
elif compiler.compiler_type == "msvc": # type: ignore[attr-defined]
cflags += ["/O2"]
if X86_64:
cflags.append("/arch:SSE4.2") # Enable SIMD (see also mypyc/build.py)

setup(
ext_modules=[
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def run(self) -> None:
os.path.join("mypyc", "lib-rt", "setup.py"),
# Uses __file__ at top level https://github.com/mypyc/mypyc/issues/700
os.path.join("mypyc", "__main__.py"),
os.path.join("mypyc", "build_setup.py"), # for monkeypatching
)

everything = [os.path.join("mypy", x) for x in find_package_data("mypy", ["*.py"])] + [
Expand Down