MNT: Reorganize non-constant global statics into structs #26607

ngoldbaum · 2024-06-03T22:06:33Z

This reorganizes most of the mutable static globals in numpy into four structs exposed to the internal API via multiarraymodule.h

npy_interned_str for interned strings
npy_static_pydata for immutable PyObjects that are initialized during module initialization
npy_static_cdata for immutable C data that is initialized during module initialization
npy_ma_thread_unsafe_state for state stored in a thread-unsafe manner

The goal is to refactor the items in npy_ma_thread_unsafe_state to be thread-safe in follow-up PRs. See also the tracking issue for items that still need to be fixed.

ngoldbaum · 2024-06-04T20:14:15Z

On my system I see that import numpy._core._multiarray_umath spends a negligible fraction of the total import time importing the _multiarray_umath symbol itself, spending much more time importing pure python code:

○  python -X importtime -c "import numpy._core._multiarray_umath"
import time: self [us] | cumulative | imported package
import time:       131 |        131 |   _io
import time:        17 |         17 |   marshal
import time:       121 |        121 |   posix
import time:       151 |        419 | _frozen_importlib_external
import time:       160 |        160 |   time
import time:        62 |        221 | zipimport
import time:        62 |         62 |     _codecs
import time:       111 |        172 |   codecs
import time:       210 |        210 |   encodings.aliases
import time:       313 |        694 | encodings
import time:        98 |         98 | encodings.utf_8
import time:        55 |         55 | _signal
import time:        14 |         14 |     _abc
import time:        62 |         75 |   abc
import time:        81 |        156 | io
import time:        21 |         21 |       _stat
import time:        24 |         44 |     stat
import time:       409 |        409 |     _collections_abc
import time:        16 |         16 |       genericpath
import time:        29 |         44 |     posixpath
import time:       138 |        635 |   os
import time:        35 |         35 |   _sitebuiltins
import time:       332 |        332 |   _distutils_hack
import time:        70 |         70 |     __future__
import time:       118 |        118 |           types
import time:        69 |         69 |             _operator
import time:       129 |        198 |           operator
import time:        91 |         91 |               itertools
import time:        56 |         56 |               keyword
import time:        82 |         82 |               reprlib
import time:        30 |         30 |               _collections
import time:       376 |        633 |             collections
import time:        24 |         24 |             _functools
import time:       307 |        963 |           functools
import time:       633 |       1910 |         enum
import time:        31 |         31 |           _sre
import time:       123 |        123 |             re._constants
import time:       255 |        377 |           re._parser
import time:        55 |         55 |           re._casefix
import time:       180 |        640 |         re._compiler
import time:        73 |         73 |         copyreg
import time:       276 |       2897 |       re
import time:       890 |        890 |       _ast
import time:       300 |        300 |       contextlib
import time:      1044 |       5130 |     ast
import time:       148 |        148 |         warnings
import time:        83 |        230 |       importlib
import time:        27 |         27 |       importlib.machinery
import time:        69 |         69 |       importlib._abc
import time:        67 |         67 |             fnmatch
import time:        31 |         31 |               _winapi
import time:        26 |         26 |               nt
import time:        25 |         25 |               nt
import time:        23 |         23 |               nt
import time:        26 |         26 |               nt
import time:        23 |         23 |               nt
import time:        22 |         22 |               nt
import time:        43 |        217 |             ntpath
import time:        30 |         30 |             errno
import time:        52 |         52 |               urllib
import time:       335 |        335 |               math
import time:       634 |        634 |               ipaddress
import time:       638 |       1658 |             urllib.parse
import time:       403 |       2372 |           pathlib
import time:       191 |        191 |               zlib
import time:        97 |         97 |                 _compression
import time:       143 |        143 |                 _bz2
import time:       180 |        419 |               bz2
import time:       296 |        296 |                 _lzma
import time:       111 |        407 |               lzma
import time:       312 |       1327 |             shutil
import time:       126 |        126 |                 _bisect
import time:        62 |        188 |               bisect
import time:       139 |        139 |               _random
import time:       185 |        185 |               _sha2
import time:       193 |        703 |             random
import time:        91 |         91 |               _weakrefset
import time:       211 |        302 |             weakref
import time:       223 |       2554 |           tempfile
import time:       120 |        120 |                 _opcode
import time:       205 |        325 |               opcode
import time:       341 |        666 |             dis
import time:        90 |         90 |             collections.abc
import time:        72 |         72 |                 token
import time:        19 |         19 |                 _tokenize
import time:       431 |        522 |               tokenize
import time:        67 |        588 |             linecache
import time:       991 |       2333 |           inspect
import time:        16 |         16 |             _typing
import time:      1148 |       1163 |           typing
import time:       177 |        177 |           importlib.resources.abc
import time:       145 |        145 |           importlib.resources._adapters
import time:       199 |       8940 |         importlib.resources._common
import time:        95 |         95 |         importlib.resources._legacy
import time:        70 |       9104 |       importlib.resources
import time:       251 |       9680 |     importlib.abc
import time:       325 |        325 |       threading
import time:        61 |        386 |     importlib.util
import time:       153 |        153 |           _json
import time:       178 |        331 |         json.scanner
import time:       201 |        532 |       json.decoder
import time:       176 |        176 |       json.encoder
import time:       138 |        844 |     json
import time:        33 |         33 |         _locale
import time:       700 |        733 |       locale
import time:       259 |        259 |       signal
import time:       173 |        173 |       fcntl
import time:        28 |         28 |       msvcrt
import time:       199 |        199 |       _posixsubprocess
import time:       212 |        212 |       select
import time:       309 |        309 |       selectors
import time:       282 |       2192 |     subprocess
import time:       284 |      18583 |   _pandas_editable_loader
import time:        35 |         35 |   sitecustomize
import time:        27 |         27 |   usercustomize
import time:       369 |      20013 | site
import time:        78 |         78 |           numpy._utils._convertions
import time:        71 |        148 |         numpy._utils
import time:       184 |        332 |       numpy._globals
import time:        79 |         79 |       numpy._expired_attrs_2_0
import time:        49 |         49 |       numpy.version
import time:        13 |         13 |         numpy._distributor_init_local
import time:        56 |         69 |       numpy._distributor_init
import time:        68 |         68 |                 numpy._utils._inspect
import time:       178 |        178 |                   numpy.exceptions
import time:       157 |        157 |                   numpy._core._exceptions
import time:       230 |        230 |                     _datetime
import time:        72 |        301 |                   datetime
import time:        86 |         86 |                   numpy.dtypes
import time:      2317 |       3038 |                 numpy._core._multiarray_umath
import time:       133 |       3238 |               numpy._core.overrides
import time:       277 |       3514 |             numpy._core.multiarray
import time:       104 |        104 |             numpy._core.umath
import time:       178 |        178 |               numbers
import time:        65 |         65 |               numpy._core._string_helpers
import time:       146 |        146 |               numpy._core._type_aliases
import time:       124 |        124 |               numpy._core._dtype
import time:       161 |        673 |             numpy._core.numerictypes
import time:       224 |        224 |                         _struct
import time:        93 |        317 |                       struct
import time:       129 |        129 |                       _compat_pickle
import time:       250 |        250 |                       _pickle
import time:       426 |       1120 |                     pickle
import time:       143 |        143 |                         _contextvars
import time:        59 |        202 |                       contextvars
import time:       118 |        319 |                     numpy._core._ufunc_config
import time:       114 |       1553 |                   numpy._core._methods
import time:       406 |       1958 |                 numpy._core.fromnumeric
import time:       224 |       2182 |               numpy._core.shape_base
import time:       322 |        322 |               numpy._core.arrayprint
import time:        67 |         67 |               numpy._core._asarray
import time:       399 |       2968 |             numpy._core.numeric
import time:       151 |        151 |             numpy._core.records
import time:        91 |         91 |             numpy._core.memmap
import time:       103 |        103 |             numpy._core.function_base
import time:        77 |         77 |             numpy._core._machar
import time:       137 |        137 |             numpy._core.getlimits
import time:       124 |        124 |             numpy._core.einsumfunc
import time:       493 |        493 |             numpy._core._add_newdocs
import time:       178 |        178 |             numpy._core._add_newdocs_scalars
import time:        66 |         66 |             numpy._core._dtype_ctypes
import time:       339 |        339 |                 _ctypes
import time:       157 |        157 |                 ctypes._endian
import time:       489 |        984 |               ctypes
import time:       354 |       1338 |             numpy._core._internal
import time:        80 |         80 |             numpy._pytesttester
import time:       244 |      10334 |           numpy._core
import time:        10 |      10344 |         numpy._core._multiarray_umath
import time:       157 |      10500 |       numpy.__config__
import time:        47 |         47 |           numpy.lib._array_utils_impl
import time:       126 |        172 |         numpy.lib.array_utils
import time:        48 |         48 |         numpy.lib.introspect
import time:        86 |         86 |         numpy.lib.mixins
import time:       394 |        394 |                 textwrap
import time:        35 |         35 |                   _wmi
import time:       281 |        316 |                 platform
import time:       114 |        823 |               numpy.lib._utils_impl
import time:       154 |        977 |             numpy.lib.format
import time:       111 |        111 |             numpy.lib._datasource
import time:       585 |        585 |             numpy.lib._iotools
import time:       267 |       1939 |           numpy.lib._npyio_impl
import time:        46 |       1984 |         numpy.lib.npyio
import time:        55 |         55 |               numpy.lib._ufunclike_impl
import time:       133 |        188 |             numpy.lib._type_check_impl
import time:       119 |        307 |           numpy.lib._scimath_impl
import time:        68 |        374 |         numpy.lib.scimath
import time:        98 |         98 |           numpy.lib._stride_tricks_impl
import time:        54 |        151 |         numpy.lib.stride_tricks
import time:        63 |         63 |                 numpy.linalg.linalg
import time:       133 |        133 |                   numpy.lib._twodim_base_impl
import time:       477 |        477 |                   numpy.linalg._umath_linalg
import time:       132 |        132 |                     numpy._typing._nested_sequence
import time:        54 |         54 |                     numpy._typing._nbit
import time:       366 |        366 |                     numpy._typing._char_codes
import time:       151 |        151 |                     numpy._typing._scalars
import time:        64 |         64 |                     numpy._typing._shape
import time:       643 |        643 |                     numpy._typing._dtype_like
import time:       806 |        806 |                     numpy._typing._array_like
import time:       238 |       2449 |                   numpy._typing
import time:       678 |       3735 |                 numpy.linalg._linalg
import time:        83 |       3880 |               numpy.linalg
import time:       178 |       4058 |             numpy.matrixlib.defmatrix
import time:        59 |       4116 |           numpy.matrixlib
import time:       173 |        173 |             numpy.lib._histograms_impl
import time:       462 |        634 |           numpy.lib._function_base_impl
import time:       211 |       4961 |         numpy.lib._index_tricks_impl
import time:       189 |        189 |         numpy.lib._nanfunctions_impl
import time:       223 |        223 |         numpy.lib._shape_base_impl
import time:       319 |        319 |         numpy.lib._arraysetops_impl
import time:       219 |        219 |         numpy.lib._polynomial_impl
import time:        78 |         78 |         numpy.lib._arrayterator_impl
import time:       134 |        134 |         numpy.lib._arraypad_impl
import time:        73 |         73 |         numpy.lib._version
import time:       217 |       9221 |       numpy.lib
import time:       969 |      21217 |     numpy
import time:         8 |      21224 |   numpy._core
import time:         9 |      21233 | numpy._core._multiarray_umath

When I test on this PR I don't see any significant timing difference in importing numpy, as expected given the above analysis.

ngoldbaum · 2024-06-04T22:44:36Z

I ran the full benchmark suite. It looks like there are some performance changes but it's kind of obnoxious to pick out reproducible changes from the big list spin bench -c spits out.

So far I've found:


float to complex64 casts

| Change   | Before [a2d19725] <main>   | After [95a592b5] <global-state-refactor>   |   Ratio | Benchmark (Parameter)                                           |
|----------|----------------------------|--------------------------------------------|---------|-----------------------------------------------------------------|
| +        | 3.22±0.01μs                | 5.82±0.03μs                                |    1.81 | bench_ufunc.NDArrayAsType.time_astype(('float32', 'complex64')) |
| +        | 3.44±0.03μs                | 5.87±0.02μs                                |    1.71 | bench_ufunc.NDArrayAsType.time_astype(('float64', 'complex64')) |

float32 partition

| Change   | Before [a2d19725] <main>   | After [95a592b5] <global-state-refactor>   |   Ratio | Benchmark (Parameter)                                                                 |
|----------|----------------------------|--------------------------------------------|---------|---------------------------------------------------------------------------------------|
| +        | 628±10μs                   | 1.05±0.02ms                                |    1.68 | bench_function_base.Partition.time_partition('float32', ('reversed',), 10)            |
| +        | 627±10μs                   | 1.05±0.03ms                                |    1.68 | bench_function_base.Partition.time_partition('float32', ('reversed',), 1000)          |
| +        | 628±10μs                   | 1.02±0ms                                   |    1.62 | bench_function_base.Partition.time_partition('float32', ('reversed',), 100)           |
| +        | 93.2±0.5μs                 | 142±2μs                                    |    1.52 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 1000), 10)   |
| +        | 93.5±0.4μs                 | 142±2μs                                    |    1.52 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 1000), 100)  |
| +        | 58.8±0.2μs                 | 84.9±1μs                                   |    1.44 | bench_function_base.Partition.time_partition('float32', ('ordered',), 10)             |
| +        | 58.9±0.2μs                 | 82.9±0.2μs                                 |    1.41 | bench_function_base.Partition.time_partition('float32', ('ordered',), 100)            |
| +        | 59.9±0.04μs                | 83.2±0.2μs                                 |    1.39 | bench_function_base.Partition.time_partition('float32', ('ordered',), 1000)           |
| +        | 207±0.7μs                  | 278±4μs                                    |    1.34 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 1000), 1000) |
| +        | 189±4μs                    | 249±9μs                                    |    1.32 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 10), 1000)   |
| +        | 190±6μs                    | 244±8μs                                    |    1.28 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 10), 100)    |
| +        | 107±0.8μs                  | 136±0.2μs                                  |    1.27 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 100), 1000)  |
| +        | 194±2μs                    | 244±10μs                                   |    1.26 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 10), 10)     |
| +        | 111±1μs                    | 140±0.8μs                                  |    1.26 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 100), 10)    |
| +        | 112±0.4μs                  | 138±0.2μs                                  |    1.23 | bench_function_base.Partition.time_partition('float32', ('sorted_block', 100), 100)   |
| +        | 441±10μs                   | 475±20μs                                   |    1.08 | bench_function_base.Partition.time_partition('float32', ('random',), 10)              |
| +        | 441±10μs                   | 475±10μs                                   |    1.08 | bench_function_base.Partition.time_partition('float32', ('random',), 100)             |
| -        | 456±0.6μs                  | 430±2μs                                    |    0.94 | bench_function_base.Partition.time_partition('int16', ('random',), 10)                |

see the full list here: https://gist.github.com/ngoldbaum/7e8ee9a129a96a32536f228e4214018b

ngoldbaum · 2024-06-05T02:46:23Z

I tried comparing main with itself on the full benchmark suite using spin bench -c and saw no changes, so the performance changes seen above certainly aren't noise like I said earlier. Will try to dig in to understand why some of the differences end up being random large performance regressions or improvements depending on the test run...

mattip · 2024-06-05T08:29:23Z

numpy/_core/src/multiarray/multiarraymodule.c

-    if (npy_ma_str_current_allocator == NULL) {
+    // this is module-level global heap allocation, it is currently
+    // never freed
+    npy_ma_str = PyMem_Calloc(sizeof(npy_ma_str_struct), 1);


Should this (and the others like it) check that it is not called twice?

mattip · 2024-06-05T08:35:13Z

Looks like a move in the right direction to me. I wonder if the performance changes if you statically allocate the structs rather than calling PyMem_Calloc. Does the dynamic allocation affect the size of the shared object or is it too small to notice?

ngoldbaum · 2024-06-05T22:03:14Z

I wonder if the performance changes if you statically allocate the structs rather than calling PyMem_Calloc.

Good point! It does seem to help, at least on Linux. There are still some heap allocations though, I still need to look closer at whether they can be made static.

ngoldbaum · 2024-06-06T15:41:32Z

So the latest version of this PR drops moving things into the npy_ma_state struct to avoid heap allocations entirely. I think that does improve performance but it's difficult for me to tell which benchmarks give useful results and which don't.

Earlier I said I compared main with itself using spin bench, but I did that using spin bench -c main with main checked out, and it looks like there is some state that asv persists between runs on the same commit hash, because when I do the same thing but with two identical commits with different hashes (generated by doing git commit --amend with no changes to the working directory), I see random changes in the results for some benchmarks:

$ spin bench -c f3643a2d85ef4c18769608e111d465ce4a778474 -v -t "bench_linalg.Linalg"
... eliding a bunch of output ...
| Change   | Before [f3643a2d]    | After [9e40ee24] <global-state-refactor>   |   Ratio | Benchmark (Parameter)                      |
|----------|----------------------|--------------------------------------------|---------|--------------------------------------------|
| +        | 783±3μs              | 1.45±0.7ms                                 |    1.85 | bench_linalg.Linalg.time_svd('int16')      |
| +        | 32.6±0.05μs          | 45.9±10μs                                  |    1.41 | bench_linalg.Linalg.time_det('complex64')  |
| -        | 35.5±0.04μs          | 32.5±0.03μs                                |    0.92 | bench_linalg.Linalg.time_det('float32')    |
| -        | 2.18±0.06ms          | 785±0.9μs                                  |    0.36 | bench_linalg.Linalg.time_svd('float64')    |
| -        | 2.20±0.04ms          | 792±1μs                                    |    0.36 | bench_linalg.Linalg.time_svd('int64')      |
| -        | 109±70μs             | 35.9±0.2μs                                 |    0.33 | bench_linalg.Linalg.time_det('complex128') |

Here f3643a2 is the most recent commit in this PR and 9e40ee24 is the commit I generated with a no-op git commit --amend.

I'm going to start a full benchmark run like this to determine which benchmarks have random results and I will ignore those for determining whether or not this PR has a performance impact.

rgommers · 2024-06-06T16:05:01Z

I see what the bug is there. param definition uses set, and the run-to-run variation makes asv misbehave. This diff fixes the problem:

diff --git a/benchmarks/benchmarks/bench_linalg.py b/benchmarks/benchmarks/bench_linalg.py
index 3077357237..f3eb819c18 100644
--- a/benchmarks/benchmarks/bench_linalg.py
+++ b/benchmarks/benchmarks/bench_linalg.py
@@ -72,7 +72,7 @@ def time_tensordot_a_b_axes_1_0_0_1(self):
 
 
 class Linalg(Benchmark):
-    params = set(TYPES1) - set(['float16'])
+    params = sorted(list(set(TYPES1) - set(['float16'])))
     param_names = ['dtype']
 
     def setup(self, typename):

ngoldbaum · 2024-06-06T16:09:07Z

Nice! That makes sense. This is just one benchmark I could run quickly to prove there's an issue, I'll take a look at the full list of randomly changing benchmarks to see if there are similar problems.

seberg

Left some comments. I am still a bit saddened by having these giant init functions and wonder if we shouldn't do a local init pattern instead for some things.

I may look into some maintenance here in general, I think we can invent some cuter patterns (even if it adds a bit of complexity in the helpers).
I also think we should just drop the _ma_ for "multiarray", it is a leftover of when multiarray and ufunc were two modules.
I.e. we could shorten it to npy_static.<...> even.

numpy/_core/src/multiarray/multiarraymodule.h

seberg · 2024-06-06T13:30:15Z

numpy/_core/src/multiarray/multiarraymodule.h

+    union {
+        npy_uint8  bytes[8];
+        npy_uint64 uint64;
+    } unpack_lookup_big[256];


I wonder if it makes sense to split out the non-objects because at some point, I assume that modules may need to decref all of these (or implement a tp_traverse, but that is the same thing).

Also, this table for example is truly static even with subinterpreters. The only issue is initialization.

Also, this table for example is truly static even with subinterpreters. The only issue is initialization.

Yup, everything in this struct is static after module initialization. I guess if we ever supported subinterpreters someone could make this be initialized once for all subinterpreters or just do it every time. I don't think it makes a ton of difference...

seberg · 2024-06-06T13:39:31Z

numpy/_core/src/multiarray/multiarraymodule.c

-    npy_ma_str___dlpack__ = PyUnicode_InternFromString("__dlpack__");
-    if (npy_ma_str___dlpack__ == NULL) {
+    npy_ma_str.__dlpack__ = PyUnicode_InternFromString("__dlpack__");
+    if (npy_ma_str.__dlpack__ == NULL) {


I have some thoughts on how to shrink this code (even if a bit tricky 20 lines, I feel it might be nice). I'll make a PR later.

I also think we should move this into its own file.

I also think we should move this into its own file.

Sorry, just to clarify - what should be moved into its own file? Filling all the static structs? Or just the string interning?

For the former, fair enough, that occurred to me, and it would separate multiarraymodule.h from the global statics.

I would say all of the static ones in one file maybe? Seemed worthwhile to try to use this opportunity to clean things up a bit nicer (but I am happy to help with it!)

numpy/_core/src/umath/ufunc_object.c

ngoldbaum · 2024-06-07T21:58:37Z

I just split out the static data structs into their own file. I left the thread unsafe state struct in multiarraymodule.h since I don't think it makes sense for it to live in the new header, and I like how keeping it there makes it clear when code accesses thread-unsafe state and where that state is defined. To justify that choice: I completely missed some global state on the first pass of this PR and added it to the thread unsafe state struct in this pass.

I looked at initializing e.g. the ArrayMethod objects used in the casts in a module init, and I felt that initializing locally, in an initialization function inside the same file where the global is used, wasn't any clearer than initializing it along with all the other cached globals; it also sort of feels like a nice pattern to centralize this stuff. All IMO of course.

I've been looking at benchmarking closely the past few days and I think all the benchmark results I shared earlier for this PR are noise. Many of the things that show up are due to bugs I fixed in #26637, #26638, and #26639. The rest I suspect are due to jitter on the laptop I was using to run the benchmarks as well as asv caching results. So far I haven't been able to find a single performance change reported by asv that is reproducible outside of the asv environment or that persists if I purge the asv results database or make a no-op empty commit to test with.

I'm starting an asv run with higher-than-default settings for rounds, repeat, and sample_time to spin over the weekend, which should hopefully leave me with a more manageable list of performance changes to sift through, if any.

numpy/_core/src/multiarray/npy_static_data.c

seberg

Had a look through with some nitpicky comments. Thanks for reorganizing into a new file, and the macros for strings and imports are also at least much more compact!

Overall, I think this is good to go for me, unfortunately it might create merge conflicts pretty quickly, but I guess we'll just have to deal with them (for backporting)?

(There are the nitpicky comments and a merge conflict, but I can also just make a pass and merge if you think it's ready and it is clear that we are merging now.)

seberg · 2024-06-13T11:29:12Z

numpy/_core/src/multiarray/multiarraymodule.h

+     *      struct {
+     *          atomic_int initialized;
+     *          PyObject *value;
+     *      }


Just out of curiosity, so assuming that the value is never NULL after initialization, grabbing the lock and double checking for value == NULL is not valid (i.e. there is no safe pattern to do it)?
Because if that worked, the above seems unnecessary bloat.

You need to use at least one atomic load; otherwise it's possible that the compiler might reorder things like:

// original code
if (flag) {
    do_some_work();
    do_more_work();
}

// after optimization
if (flag) {
    do_some_work();
}
if (flag) {
    do_more_work();
}

because data races are UB. In this case it would just lead to a memory leak. For the argparse cache I think it could cause two threads to simultaneously run initialize_keywords, corrupting the cache.

Yeah, I suspected that much, was just wondering if the atomic_int is strictly necessary (or there is a way to do an atomic_load(value), I guess).

Ah no it's not necessary, I can do it with an atomic load instead.

numpy/_core/src/multiarray/multiarraymodule.h

numpy/_core/src/multiarray/npy_static_data.c

numpy/_core/src/umath/override.c

numpy/_core/src/umath/ufunc_object.c

ngoldbaum · 2024-06-13T19:22:13Z

It seems gcc generates code that seg faults if you try to write to a field of a const static struct via a non-const pointer so unfortunately I don't think I can easily make the structs const. I could have two sets of structs - const structs used only to read from that are filled in at the end of initialization and non-const structs filled in during initialization - but that didn't seem worth the additional complexity to me.

ngoldbaum · 2024-06-13T19:25:33Z

I think this is ready to merge now. Since this will definitely conflict with the set_string_function PR, why don't we merge that one first, I'll fix the conflicts here, and then we can merge this one?

Also if there are worries about generating lots of conflicts to the 2.0 maintenance branch I'd be happy to hold off on merging this until we're a little closer to the 2.1 maintenance branch being created. I can continue to update this as I make more things thread-safe.

… non-null

Co-authored-by: Sebastian Berg <[email protected]>

ngoldbaum · 2024-06-19T20:22:01Z

We discussed this at the community meeting and agreed to merge it now, so I'm pulling this in. Thanks all for the reviews!

ngoldbaum added the 39 - free-threading PRs and issues related to support for free-threading CPython (a.k.a. no-GIL, PEP 703) label Jun 3, 2024

ngoldbaum force-pushed the global-state-refactor branch from aca2e25 to 95a592b Compare June 3, 2024 22:12

ngoldbaum mentioned this pull request Jun 3, 2024

MAINT: fix thread-unsafe cache initialization in PyUFunc_TrueDivisionTypeResolver #26497

Closed

rgommers added the 03 - Maintenance label Jun 4, 2024

charris changed the title ~~[MNT] Reorganize non-constant global statics into structs~~ MNT: Reorganize non-constant global statics into structs Jun 4, 2024

ngoldbaum mentioned this pull request Jun 4, 2024

ENH: Support free-threaded python build (tracking issue) #26157

Closed

16 tasks

ngoldbaum mentioned this pull request Jun 5, 2024

MNT: build numpy with link-time optimization in benchmarks #26616

Closed

mattip reviewed Jun 5, 2024

View reviewed changes

ngoldbaum force-pushed the global-state-refactor branch from b5464df to f3643a2 Compare June 5, 2024 22:47

seberg reviewed Jun 6, 2024

View reviewed changes

ngoldbaum force-pushed the global-state-refactor branch 2 times, most recently from e8861b3 to 8d7d57c Compare June 7, 2024 21:49

ngoldbaum force-pushed the global-state-refactor branch from 8d7d57c to eb55252 Compare June 7, 2024 22:34

mattip reviewed Jun 9, 2024

View reviewed changes

numpy/_core/src/multiarray/npy_static_data.c Outdated Show resolved Hide resolved

ngoldbaum force-pushed the global-state-refactor branch from eb55252 to 2bf1f1f Compare June 11, 2024 19:59

seberg reviewed Jun 13, 2024

View reviewed changes

ngoldbaum force-pushed the global-state-refactor branch 2 times, most recently from 39daec6 to 41c7f43 Compare June 13, 2024 19:19

MNT: move interned strings into a single global struct

baee891

ngoldbaum and others added 22 commits June 19, 2024 13:18

MNT: move cached imports into a global struct

69075c1

MNT: move cpu dispatch registry into global data struct

e5c1bd6

MNT: move ndarray.__array_*__ references to global data struct

7719cf2

MNT: move sys.flags.optimize cache to global data struct

3cbb68d

MNT: set up tuple for truediv in global data struct

2ffcc71

MNT: move unpack_bits LUT into global static struct

d2ca21b

MNT: move references to int(1) and int(0) to global static struct

a1f7200

MNT: move initialization of global ArrayMethods to module initialization

26c243d

MNT: move initialization of global tuples to global data struct

536e5fb

MNT: move default extobj contextvar to global data dict

0c22126

MNT: move PyArray_SetStringFunction internals into global data struct

90b1f38

BUG: remove questionable static initialization of an array object

6a296c4

MNT: split global data struct into two structs

398f095

MNT: add PyArrayMethodObject caches to static data struct

8f84875

MNT: move some thread-unsafe state in thread-unsafe state struct

402a83c

MNT: make data structs static instead of heap-allocated

e43275a

MNT: apply sebastian's refactoring suggestions

b706536

MNT: move static data structs into their own file

c237038

MNT: Add more global state I missed to the thread_unsafe_state struct

98ae65d

MNT: verify all entries in npy_interned_str and npy_static_pydata are…

a334ddc

… non-null

Apply suggestions from code review

9ed317f

Co-authored-by: Sebastian Berg <[email protected]>

MAINT: apply more of Sebastian's suggestions

3ae66b1

ngoldbaum force-pushed the global-state-refactor branch from 41c7f43 to 3ae66b1 Compare June 19, 2024 19:38

ngoldbaum merged commit 64ee06a into numpy:main Jun 19, 2024
66 of 68 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

MNT: Reorganize non-constant global statics into structs #26607

MNT: Reorganize non-constant global statics into structs #26607

ngoldbaum commented Jun 3, 2024 •

edited

Loading

ngoldbaum commented Jun 4, 2024

ngoldbaum commented Jun 4, 2024 •

edited

Loading

ngoldbaum commented Jun 5, 2024

mattip Jun 5, 2024

mattip commented Jun 5, 2024

ngoldbaum commented Jun 5, 2024

ngoldbaum commented Jun 6, 2024 •

edited

Loading

rgommers commented Jun 6, 2024

ngoldbaum commented Jun 6, 2024 •

edited

Loading

seberg left a comment

seberg Jun 6, 2024

ngoldbaum Jun 6, 2024

seberg Jun 6, 2024

ngoldbaum Jun 6, 2024

seberg Jun 7, 2024

ngoldbaum commented Jun 7, 2024 •

edited

Loading

seberg left a comment

seberg Jun 13, 2024

ngoldbaum Jun 13, 2024

seberg Jun 13, 2024

ngoldbaum Jun 13, 2024

ngoldbaum commented Jun 13, 2024 •

edited

Loading

ngoldbaum commented Jun 13, 2024

ngoldbaum commented Jun 19, 2024

MNT: Reorganize non-constant global statics into structs #26607

MNT: Reorganize non-constant global statics into structs #26607

Conversation

ngoldbaum commented Jun 3, 2024 • edited Loading

ngoldbaum commented Jun 4, 2024

ngoldbaum commented Jun 4, 2024 • edited Loading

ngoldbaum commented Jun 5, 2024

Choose a reason for hiding this comment

mattip commented Jun 5, 2024

ngoldbaum commented Jun 5, 2024

ngoldbaum commented Jun 6, 2024 • edited Loading

rgommers commented Jun 6, 2024

ngoldbaum commented Jun 6, 2024 • edited Loading

seberg left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ngoldbaum commented Jun 7, 2024 • edited Loading

seberg left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ngoldbaum commented Jun 13, 2024 • edited Loading

ngoldbaum commented Jun 13, 2024

ngoldbaum commented Jun 19, 2024

ngoldbaum commented Jun 3, 2024 •

edited

Loading

ngoldbaum commented Jun 4, 2024 •

edited

Loading

ngoldbaum commented Jun 6, 2024 •

edited

Loading

ngoldbaum commented Jun 6, 2024 •

edited

Loading

ngoldbaum commented Jun 7, 2024 •

edited

Loading

ngoldbaum commented Jun 13, 2024 •

edited

Loading