Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 19f86c3

Browse files
authored
Merge pull request #23898 from seberg/string-nonzero-sanity
API: Change string to bool conversions to be consistent with Python
2 parents 47b4eb6 + 03443bd commit 19f86c3

File tree

4 files changed

+66
-90
lines changed

4 files changed

+66
-90
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
Truthiness of NumPy strings changed
2+
-----------------------------------
3+
NumPy strings previously were inconsistent about how they defined
4+
whether a string is ``True`` or ``False``, and the definition did not
5+
match the one used by Python.
6+
Strings are now considered ``True`` when they are non-empty and
7+
``False`` when they are empty.
8+
This changes the following distinct cases:
9+
10+
* Casts from string to boolean were previously roughly equivalent
11+
to ``string_array.astype(np.int64).astype(bool)``, meaning that only
12+
valid integers could be cast.
13+
Now a string of ``"0"`` will be considered ``True`` since it is not empty.
14+
If you need the old behavior, you may use the above step (casting
15+
to integer first) or ``string_array != "0"`` (if the input is only ever ``0`` or ``1``).
16+
To get the new result on old NumPy versions use ``string_array != ""``.
17+
* ``np.nonzero(string_array)`` previously ignored whitespace so that
18+
a string only containing whitespace was considered ``False``.
19+
Whitespace is now considered ``True``.
20+
21+
This change does not affect ``np.loadtxt``, ``np.fromstring``, or ``np.genfromtxt``.
22+
The first two still use the integer definition, while ``genfromtxt`` continues to
23+
match for ``"true"`` (ignoring case).
24+
However, if ``np.bool_`` is used as a converter the result will change.
25+
26+
The change does affect ``np.fromregex`` as it uses direct assignments.

numpy/core/src/multiarray/arraytypes.c.src

Lines changed: 23 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1771,13 +1771,6 @@ static void
17711771
if (temp == NULL) {
17721772
return;
17731773
}
1774-
#if @is_string_to_bool@
1775-
/* Legacy behaviour converts strings to integers before going to bool */
1776-
Py_SETREF(temp, PyNumber_Long(temp));
1777-
if (temp == NULL) {
1778-
return;
1779-
}
1780-
#endif
17811774
if (@to@_setitem(temp, op, aop)) {
17821775
Py_DECREF(temp);
17831776
return;
@@ -2802,79 +2795,45 @@ static npy_bool
28022795
/**end repeat**/
28032796

28042797

2805-
#define WHITESPACE " \t\n\r\v\f"
2806-
#define WHITELEN 6
2807-
2808-
static npy_bool
2809-
Py_STRING_ISSPACE(char ch)
2810-
{
2811-
char white[] = WHITESPACE;
2812-
int j;
2813-
npy_bool space = NPY_FALSE;
2814-
2815-
for (j = 0; j < WHITELEN; j++) {
2816-
if (ch == white[j]) {
2817-
space = NPY_TRUE;
2818-
break;
2819-
}
2820-
}
2821-
return space;
2822-
}
28232798

28242799
static npy_bool
28252800
STRING_nonzero (char *ip, PyArrayObject *ap)
28262801
{
28272802
int len = PyArray_DESCR(ap)->elsize;
2828-
int i;
2829-
npy_bool nonz = NPY_FALSE;
2830-
npy_bool seen_null = NPY_FALSE;
28312803

2832-
for (i = 0; i < len; i++) {
2833-
if (*ip == '\0') {
2834-
seen_null = NPY_TRUE;
2804+
for (int i = 0; i < len; i++) {
2805+
if (ip[i]) {
2806+
return NPY_TRUE;
28352807
}
2836-
else if (seen_null || !Py_STRING_ISSPACE(*ip)) {
2837-
nonz = NPY_TRUE;
2838-
break;
2839-
}
2840-
ip++;
28412808
}
2842-
return nonz;
2809+
2810+
return NPY_FALSE;
28432811
}
28442812

28452813
static npy_bool
2846-
UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
2814+
UNICODE_nonzero (char *ip, PyArrayObject *ap)
28472815
{
2848-
int len = PyArray_DESCR(ap)->elsize >> 2;
2849-
int i;
2850-
npy_bool nonz = NPY_FALSE;
2851-
npy_bool seen_null = NPY_FALSE;
2852-
char *buffer = NULL;
2853-
2854-
if (PyArray_ISBYTESWAPPED(ap) || !PyArray_ISALIGNED(ap)) {
2855-
buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
2856-
if (buffer == NULL) {
2857-
return nonz;
2858-
}
2859-
memcpy(buffer, ip, PyArray_DESCR(ap)->elsize);
2860-
if (PyArray_ISBYTESWAPPED(ap)) {
2861-
byte_swap_vector(buffer, len, 4);
2816+
if (PyArray_ISALIGNED(ap)) {
2817+
/* go character by character */
2818+
Py_UCS4 *chars = (Py_UCS4 *)ip;
2819+
int len = PyArray_DESCR(ap)->elsize / 4;
2820+
for (int i = 0; i < len; i++) {
2821+
if (chars[i]) {
2822+
return NPY_TRUE;
2823+
}
28622824
}
2863-
ip = (npy_ucs4 *)buffer;
28642825
}
2865-
2866-
for (i = 0; i < len; i++) {
2867-
if (*ip == '\0') {
2868-
seen_null = NPY_TRUE;
2869-
}
2870-
else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
2871-
nonz = NPY_TRUE;
2872-
break;
2826+
else {
2827+
/* go char/byte by char/byte, it doesn't matter where the nonzero is */
2828+
int len = PyArray_DESCR(ap)->elsize;
2829+
for (int i = 0; i < len; i++) {
2830+
if (ip[i]) {
2831+
return NPY_TRUE;
2832+
}
28732833
}
2874-
ip++;
28752834
}
2876-
PyArray_free(buffer);
2877-
return nonz;
2835+
2836+
return NPY_FALSE;
28782837
}
28792838

28802839
static npy_bool

numpy/core/tests/test_api.py

Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -321,30 +321,21 @@ def test_array_astype_warning(t):
321321
@pytest.mark.parametrize(["dtype", "out_dtype"],
322322
[(np.bytes_, np.bool_),
323323
(np.str_, np.bool_),
324-
(np.dtype("S10,S9"), np.dtype("?,?"))])
324+
(np.dtype("S10,S9"), np.dtype("?,?")),
325+
# The following also checks unaligned unicode access:
326+
(np.dtype("S7,U9"), np.dtype("?,?"))])
325327
def test_string_to_boolean_cast(dtype, out_dtype):
326-
"""
327-
Currently, for `astype` strings are cast to booleans effectively by
328-
calling `bool(int(string)`. This is not consistent (see gh-9875) and
329-
will eventually be deprecated.
330-
"""
331-
arr = np.array(["10", "10\0\0\0", "0\0\0", "0"], dtype=dtype)
332-
expected = np.array([True, True, False, False], dtype=out_dtype)
328+
# Only the last two (empty) strings are falsy (the `\0` is stripped):
329+
arr = np.array(
330+
["10", "10\0\0\0", "0\0\0", "0", "False", " ", "", "\0"],
331+
dtype=dtype)
332+
expected = np.array(
333+
[True, True, True, True, True, True, False, False],
334+
dtype=out_dtype)
333335
assert_array_equal(arr.astype(out_dtype), expected)
334-
335-
@pytest.mark.parametrize(["dtype", "out_dtype"],
336-
[(np.bytes_, np.bool_),
337-
(np.str_, np.bool_),
338-
(np.dtype("S10,S9"), np.dtype("?,?"))])
339-
def test_string_to_boolean_cast_errors(dtype, out_dtype):
340-
"""
341-
These currently error out, since cast to integers fails, but should not
342-
error out in the future.
343-
"""
344-
for invalid in ["False", "True", "", "\0", "non-empty"]:
345-
arr = np.array([invalid], dtype=dtype)
346-
with assert_raises(ValueError):
347-
arr.astype(out_dtype)
336+
# As it's similar, check that nonzero behaves the same (structs are
337+
# nonzero if all entries are)
338+
assert_array_equal(np.nonzero(arr), np.nonzero(expected))
348339

349340
@pytest.mark.parametrize("str_type", [str, bytes, np.str_, np.unicode_])
350341
@pytest.mark.parametrize("scalar_type",

numpy/core/tests/test_multiarray.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9117,10 +9117,10 @@ class TestBytestringArrayNonzero:
91179117
def test_empty_bstring_array_is_falsey(self):
91189118
assert_(not np.array([''], dtype=str))
91199119

9120-
def test_whitespace_bstring_array_is_falsey(self):
9120+
def test_whitespace_bstring_array_is_truthy(self):
91219121
a = np.array(['spam'], dtype=str)
91229122
a[0] = ' \0\0'
9123-
assert_(not a)
9123+
assert_(a)
91249124

91259125
def test_all_null_bstring_array_is_falsey(self):
91269126
a = np.array(['spam'], dtype=str)
@@ -9166,10 +9166,10 @@ class TestUnicodeArrayNonzero:
91669166
def test_empty_ustring_array_is_falsey(self):
91679167
assert_(not np.array([''], dtype=np.str_))
91689168

9169-
def test_whitespace_ustring_array_is_falsey(self):
9169+
def test_whitespace_ustring_array_is_truthy(self):
91709170
a = np.array(['eggs'], dtype=np.str_)
91719171
a[0] = ' \0\0'
9172-
assert_(not a)
9172+
assert_(a)
91739173

91749174
def test_all_null_ustring_array_is_falsey(self):
91759175
a = np.array(['eggs'], dtype=np.str_)

0 commit comments

Comments
 (0)