From 1e15b43ae15af14127f62f05c84d69393881774a Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 20 Oct 2021 10:17:39 -0500
Subject: [PATCH 01/70] ENH: Move npreadtext into NumPy for faster text reading

This replaces `np.loadtxt` with the new textreader.  The file has a few
minor cleanups compared to the npreadtext version.

npreadtext was started by Warren Weckesser for inclusion in NumPy and then
very heavily modified by me (Sebastian Berg) to improve it and slim it down
slightly.

Some parts of this code are inspired by, or even taken from, the pandas
parser (mainly the integer parsers, which are still fairly verbatim).

Co-authored-by: Warren Weckesser
---
 numpy/core/setup.py                           |   9 +
 numpy/core/src/multiarray/conversion_utils.c  |  11 +
 numpy/core/src/multiarray/conversion_utils.h  |   3 +
 numpy/core/src/multiarray/multiarraymodule.c  |   3 +
 .../src/multiarray/textreading/conversions.c  | 375 +++++++++++
 .../src/multiarray/textreading/conversions.h  |  57 ++
 .../src/multiarray/textreading/field_types.c  | 200 ++++++
 .../src/multiarray/textreading/field_types.h  |  49 ++
 .../core/src/multiarray/textreading/growth.c  |  38 ++
 .../core/src/multiarray/textreading/growth.h  |   7 +
 .../multiarray/textreading/parser_config.h    |  77 +++
 .../src/multiarray/textreading/readtext.c     | 199 ++++++
 .../src/multiarray/textreading/readtext.h     |   7 +
 numpy/core/src/multiarray/textreading/rows.c  | 438 ++++++++++++
 numpy/core/src/multiarray/textreading/rows.h  |  22 +
 .../src/multiarray/textreading/str_to_int.c   |  87 +++
 .../src/multiarray/textreading/str_to_int.h   | 175 +++++
 .../core/src/multiarray/textreading/stream.h  |  29 +
 .../multiarray/textreading/stream_pyobject.c  | 271 ++++++++
 .../multiarray/textreading/stream_pyobject.h  |  16 +
 .../src/multiarray/textreading/tokenize.c.src | 449 +++++++++++++
 .../src/multiarray/textreading/tokenize.h     |  77 +++
 numpy/lib/npyio.py                            | 634 +++++++++---------
 23 files changed, 2922 insertions(+), 311 deletions(-)
 create mode 100644 numpy/core/src/multiarray/textreading/conversions.c
 create mode 100644 numpy/core/src/multiarray/textreading/conversions.h
 create mode 100644 numpy/core/src/multiarray/textreading/field_types.c
 create mode 100644 numpy/core/src/multiarray/textreading/field_types.h
 create mode 100644 numpy/core/src/multiarray/textreading/growth.c
 create mode 100644 numpy/core/src/multiarray/textreading/growth.h
 create mode 100644 numpy/core/src/multiarray/textreading/parser_config.h
 create mode 100644 numpy/core/src/multiarray/textreading/readtext.c
 create mode 100644 numpy/core/src/multiarray/textreading/readtext.h
 create mode 100644 numpy/core/src/multiarray/textreading/rows.c
 create mode 100644 numpy/core/src/multiarray/textreading/rows.h
 create mode 100644 numpy/core/src/multiarray/textreading/str_to_int.c
 create mode 100644 numpy/core/src/multiarray/textreading/str_to_int.h
 create mode 100644 numpy/core/src/multiarray/textreading/stream.h
 create mode 100644 numpy/core/src/multiarray/textreading/stream_pyobject.c
 create mode 100644 numpy/core/src/multiarray/textreading/stream_pyobject.h
 create mode 100644 numpy/core/src/multiarray/textreading/tokenize.c.src
 create mode 100644 numpy/core/src/multiarray/textreading/tokenize.h

diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 22cac1e9a43c..3d7e958d3c4f 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -868,6 +868,7 @@ def gl_if_msvc(build_cmd):
             join('src', 'multiarray', 'typeinfo.h'),
             join('src', 'multiarray', 'usertypes.h'),
             join('src', 'multiarray', 'vdot.h'),
+            join('src', 'multiarray', 'textreading', 'readtext.h'),
join('include', 'numpy', 'arrayobject.h'), join('include', 'numpy', '_neighborhood_iterator_imp.h'), join('include', 'numpy', 'npy_endian.h'), @@ -955,6 +956,14 @@ def gl_if_msvc(build_cmd): join('src', 'npysort', 'selection.c.src'), join('src', 'common', 'npy_binsearch.h'), join('src', 'npysort', 'binsearch.cpp'), + join('src', 'multiarray', 'textreading', 'conversions.c'), + join('src', 'multiarray', 'textreading', 'field_types.c'), + join('src', 'multiarray', 'textreading', 'growth.c'), + join('src', 'multiarray', 'textreading', 'readtext.c'), + join('src', 'multiarray', 'textreading', 'rows.c'), + join('src', 'multiarray', 'textreading', 'stream_pyobject.c'), + join('src', 'multiarray', 'textreading', 'str_to_int.c'), + join('src', 'multiarray', 'textreading', 'tokenize.c.src'), ] ####################################################################### diff --git a/numpy/core/src/multiarray/conversion_utils.c b/numpy/core/src/multiarray/conversion_utils.c index a1de580d9537..e4eb4f49efc1 100644 --- a/numpy/core/src/multiarray/conversion_utils.c +++ b/numpy/core/src/multiarray/conversion_utils.c @@ -993,6 +993,17 @@ PyArray_PyIntAsIntp(PyObject *o) } +NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val) +{ + *val = PyArray_PyIntAsIntp(o); + if (error_converting(*val)) { + return NPY_FAIL; + } + return NPY_SUCCEED; +} + + /* * PyArray_IntpFromIndexSequence * Returns the number of dimensions or -1 if an error occurred. diff --git a/numpy/core/src/multiarray/conversion_utils.h b/numpy/core/src/multiarray/conversion_utils.h index 4072841ee1c7..4d0fbb8941ba 100644 --- a/numpy/core/src/multiarray/conversion_utils.h +++ b/numpy/core/src/multiarray/conversion_utils.h @@ -6,6 +6,9 @@ NPY_NO_EXPORT int PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq); +NPY_NO_EXPORT int +PyArray_IntpFromPyIntConverter(PyObject *o, npy_intp *val); + NPY_NO_EXPORT int PyArray_OptionalIntpConverter(PyObject *obj, PyArray_Dims *seq); diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 789446d0ce3c..a7b6898e17c2 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -69,6 +69,7 @@ NPY_NO_EXPORT int NPY_NUMUSERTYPES = 0; #include "get_attr_string.h" #include "experimental_public_dtype_api.h" /* _get_experimental_dtype_api */ +#include "textreading/readtext.h" /* _readtext_from_file_object */ #include "npy_dlpack.h" @@ -4456,6 +4457,8 @@ static struct PyMethodDef array_module_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_get_experimental_dtype_api", (PyCFunction)_get_experimental_dtype_api, METH_O, NULL}, + {"_load_from_filelike", (PyCFunction)_load_from_filelike, + METH_FASTCALL | METH_KEYWORDS, NULL}, /* from umath */ {"frompyfunc", (PyCFunction) ufunc_frompyfunc, diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c new file mode 100644 index 000000000000..be697c380dd4 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -0,0 +1,375 @@ + +#include + +#include +#include +#include + +#include "conversions.h" +#include "str_to_int.h" + +#include "array_coercion.h" + + +/* + * Coercion to boolean is done via integer right now. 
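+ *
+ * Added illustration (not from the original npreadtext sources): "0"
+ * parses as False and "10" as True, while "true" is rejected, since only
+ * integer literals are accepted on this path.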
+ */
+int
+to_bool(PyArray_Descr *NPY_UNUSED(descr),
+        const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr,
+        parser_config *NPY_UNUSED(pconfig))
+{
+    int64_t res;
+    if (str_to_int64(str, end, INT64_MIN, INT64_MAX, &res) < 0) {
+        return -1;
+    }
+    *dataptr = (char)(res != 0);
+    return 0;
+}
+
+
+/*
+ * To avoid packing a whole copy of a floating point parser, we copy the
+ * result into ASCII and call the Python one.  Float parsing isn't super
+ * quick, so this is not terrible, but avoiding it would speed things up.
+ *
+ * Also note that parsing the first float of a complex will copy the whole
+ * string to ASCII rather than just the first part.
+ * TODO: A tweak of the break might be a simple mitigation there.
+ *
+ * @param str The UCS4 string to parse
+ * @param end Pointer to the end of the string
+ * @param skip_trailing_whitespace If false, trailing whitespace is not
+ *        skipped (used by the complex parser).
+ * @param result Output stored as double value.
+ */
+static NPY_INLINE int
+double_from_ucs4(
+        const Py_UCS4 *str, const Py_UCS4 *end,
+        bool skip_trailing_whitespace, double *result, const Py_UCS4 **p_end)
+{
+    /* skip leading whitespace */
+    while (Py_UNICODE_ISSPACE(*str)) {
+        str++;
+    }
+    if (str == end) {
+        return -1;  /* empty or only whitespace: not a floating point number */
+    }
+
+    /* We convert to ASCII for the Python parser, use stack if small: */
+    char stack_buf[128];
+    char *heap_buf = NULL;
+    char *ascii = stack_buf;
+
+    /* Reserve one extra byte for the terminating NUL appended below: */
+    size_t str_len = end - str + 1;
+    if (str_len > sizeof(stack_buf)) {
+        heap_buf = PyMem_MALLOC(str_len);
+        if (heap_buf == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        ascii = heap_buf;
+    }
+    char *c = ascii;
+    for (; str < end; str++, c++) {
+        if (NPY_UNLIKELY(*str >= 128)) {
+            break;  /* the following cannot be a number anymore */
+        }
+        *c = (char)(*str);
+    }
+    *c = '\0';
+
+    char *end_parsed;
+    *result = PyOS_string_to_double(ascii, &end_parsed, NULL);
+    /* Rewind `end` to the first UCS4 character not parsed: */
+    end = end - (c - end_parsed);
+
+    PyMem_FREE(heap_buf);
+
+    if (*result == -1. && PyErr_Occurred()) {
+        return -1;
+    }
+
+    if (skip_trailing_whitespace) {
+        /* and then skip any remaining whitespace: */
+        while (Py_UNICODE_ISSPACE(*end)) {
+            end++;
+        }
+    }
+    *p_end = end;
+    return 0;
+}
+
+/*
+ * `str` (delimited by `end`) must contain the full field that is to be
+ * converted to a double.
+ *
+ * To be successful, the conversion must use *all* the characters in the
+ * field.  E.g. "1.q25" will fail.  Leading and trailing
+ * spaces are allowed.
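+ *
+ * Illustrative examples of this rule (added for clarity, not part of the
+ * original patch):
+ *     " 1.25 " -> 1.25   (leading/trailing whitespace is fine)
+ *     "1.q25"  -> error  (trailing garbage)
+ *     ""       -> error  (empty field)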
+ */ +int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double double_val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &double_val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + float val = (float)double_val; + memcpy(dataptr, &val, sizeof(float)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(pconfig)) +{ + double val; + const Py_UCS4 *p_end; + if (double_from_ucs4(str, end, true, &val, &p_end) < 0) { + return -1; + } + if (p_end != end) { + return -1; + } + + memcpy(dataptr, &val, sizeof(double)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +static bool +to_complex_int( + const Py_UCS4 *item, const Py_UCS4 *token_end, + double *p_real, double *p_imag, + Py_UCS4 imaginary_unit, bool allow_parens) +{ + const Py_UCS4 *p_end; + bool unmatched_opening_paren = false; + + /* Remove whitespace before the possibly leading '(' */ + while (Py_UNICODE_ISSPACE(*item)) { + ++item; + } + if (allow_parens && (*item == '(')) { + unmatched_opening_paren = true; + ++item; + } + if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) { + return false; + } + if (p_end == token_end) { + // No imaginary part in the string (e.g. "3.5") + *p_imag = 0.0; + return !unmatched_opening_paren; + } + if (*p_end == imaginary_unit) { + // Pure imaginary part only (e.g "1.5j") + *p_imag = *p_real; + *p_real = 0.0; + ++p_end; + if (unmatched_opening_paren && (*p_end == ')')) { + ++p_end; + unmatched_opening_paren = false; + } + } + else if (unmatched_opening_paren && (*p_end == ')')) { + *p_imag = 0.0; + ++p_end; + unmatched_opening_paren = false; + } + else { + if (*p_end == '+') { + ++p_end; + } + if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) { + return false; + } + if (*p_end != imaginary_unit) { + return false; + } + ++p_end; + if (unmatched_opening_paren && (*p_end == ')')) { + ++p_end; + unmatched_opening_paren = false; + } + } + while (Py_UNICODE_ISSPACE(*p_end)) { + ++p_end; + } + return p_end == token_end; +} + + +int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, + pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex64 val = {(float)real, (float)imag}; + memcpy(dataptr, &val, sizeof(npy_complex64)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +int +to_cdouble(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig) +{ + double real; + double imag; + + bool success = to_complex_int( + str, end, &real, &imag, pconfig->imaginary_unit, true); + + if (!success) { + return -1; + } + npy_complex128 val = {real, imag}; + memcpy(dataptr, &val, sizeof(npy_complex128)); + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + +/* + * String and unicode conversion functions. 
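+ *
+ * Added illustration for the latin1 handling below (assuming a dtype with
+ * elsize == 4): the field "héllo" is stored as the bytes 'h', 0xe9, 'l',
+ * 'l' (truncated), while "hi" is stored as 'h', 'i', 0x00, 0x00 (padded).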
+ */ +int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + const Py_UCS4* c = str; + size_t length = descr->elsize; + + for (size_t i = 0; i < length; i++) { + if (c < end) { + /* + * loadtxt assumed latin1, which is compatible with UCS1 (first + * 256 unicode characters). + */ + if (NPY_UNLIKELY(*c > 255)) { + /* TODO: Was UnicodeDecodeError, is unspecific error good? */ + return -1; + } + dataptr[i] = (Py_UCS1)(*c); + c++; + } + else { + dataptr[i] = '\0'; + } + } + return 0; +} + + +int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *NPY_UNUSED(unused)) +{ + size_t length = descr->elsize / 4; + + if (length <= (size_t)(end - str)) { + memcpy(dataptr, str, length * 4); + } + else { + size_t given_len = end - str; + memcpy(dataptr, str, given_len * 4); + memset(dataptr + given_len * 4, '\0', (length -given_len) * 4); + } + + if (!PyArray_ISNBO(descr->byteorder)) { + descr->f->copyswap(dataptr, dataptr, 1, NULL); + } + return 0; +} + + + +/* + * Convert functions helper for the generic converter. + */ +static PyObject * +call_converter_function( + PyObject *func, const Py_UCS4 *str, size_t length, bool byte_converters) +{ + PyObject *s = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, str, length); + if (s == NULL) { + return s; + } + if (byte_converters) { + Py_SETREF(s, PyUnicode_AsEncodedString(s, "latin1", NULL)); + if (s == NULL) { + return NULL; + } + } + if (func == NULL) { + return s; + } + PyObject *result = PyObject_CallFunctionObjArgs(func, s, NULL); + Py_DECREF(s); + return result; +} + + +int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config, PyObject *func) +{ + bool use_byte_converter; + if (func == NULL) { + use_byte_converter = config->c_byte_converters; + } + else { + use_byte_converter = config->python_byte_converters; + } + /* Converts to unicode and calls custom converter (if set) */ + PyObject *converted = call_converter_function( + func, str, (size_t)(end - str), use_byte_converter); + if (converted == NULL) { + return -1; + } + + int res = PyArray_Pack(descr, dataptr, converted); + Py_DECREF(converted); + return res; +} + + +int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *config) +{ + return to_generic_with_converter(descr, str, end, dataptr, config, NULL); +} \ No newline at end of file diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h new file mode 100644 index 000000000000..6308c10d4248 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/conversions.h @@ -0,0 +1,57 @@ +#ifndef CONVERSIONS_H +#define CONVERSIONS_H + +#include + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/parser_config.h" + +int +to_bool(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_float(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_double(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_cfloat(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_cdouble(PyArray_Descr *descr, + const 
Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +int +to_string(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +int +to_unicode(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused); + +int +to_generic_with_converter(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *unused, PyObject *func); + +int +to_generic(PyArray_Descr *descr, + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, + parser_config *pconfig); + +#endif diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c new file mode 100644 index 000000000000..914c8e4d8c25 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.c @@ -0,0 +1,200 @@ +#include "field_types.h" +#include "conversions.h" +#include "str_to_int.h" + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" +#include "alloc.h" + +#include "textreading/growth.h" + + +void +field_types_xclear(int num_field_types, field_type *ft) { + assert(num_field_types >= 0); + if (ft == NULL) { + return; + } + for (int i = 0; i < num_field_types; i++) { + Py_XDECREF(ft[i].descr); + ft[i].descr = NULL; + } + PyMem_Free(ft); +} + + +/* + * Fetch custom converters for the builtin NumPy DTypes (or the generic one). + * Structured DTypes get unpacked and `object` uses the generic method. + * + * TODO: This should probably be moved on the DType object in some form, + * to allow user DTypes to define their own converters. + */ +static set_from_ucs4_function * +get_from_ucs4_function(PyArray_Descr *descr) +{ + if (descr->type_num == NPY_BOOL) { + return &to_bool; + } + else if (PyDataType_ISSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_int8; + case 2: + return &to_int16; + case 4: + return &to_int32; + case 8: + return &to_int64; + default: + assert(0); + } + } + else if (PyDataType_ISUNSIGNED(descr)) { + switch (descr->elsize) { + case 1: + return &to_uint8; + case 2: + return &to_uint16; + case 4: + return &to_uint32; + case 8: + return &to_uint64; + default: + assert(0); + } + } + else if (descr->type_num == NPY_FLOAT) { + return &to_float; + } + else if (descr->type_num == NPY_DOUBLE) { + return &to_double; + } + else if (descr->type_num == NPY_CFLOAT) { + return &to_cfloat; + } + else if (descr->type_num == NPY_CDOUBLE) { + return &to_cdouble; + } + else if (descr->type_num == NPY_STRING) { + return &to_string; + } + else if (descr->type_num == NPY_UNICODE) { + return &to_unicode; + } + return &to_generic; +} + + +/* + * Note that the function cleans up `ft` on error. If `num_field_types < 0` + * cleanup has already happened in the internal call. 
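+ *
+ * Added example: the structured dtype
+ *     np.dtype([("a", "i4"), ("b", [("x", "f8"), ("y", "f8")])])
+ * unpacks into three flat field_type entries with structured offsets
+ * 0, 4 and 12, each with its own `set_from_ucs4` conversion function.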
+ */ +static npy_intp +field_type_grow_recursive(PyArray_Descr *descr, + npy_intp num_field_types, field_type **ft, npy_intp *ft_size, + npy_intp field_offset) +{ + if (PyDataType_HASSUBARRAY(descr)) { + PyArray_Dims shape = {NULL, -1}; + + if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) { + PyErr_SetString(PyExc_ValueError, "invalid subarray shape"); + field_types_xclear(num_field_types, *ft); + return -1; + } + npy_intp size = PyArray_MultiplyList(shape.ptr, shape.len); + npy_free_cache_dim_obj(shape); + for (npy_intp i = 0; i < size; i++) { + num_field_types = field_type_grow_recursive(descr->subarray->base, + num_field_types, ft, ft_size, field_offset); + field_offset += descr->subarray->base->elsize; + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + else if (PyDataType_HASFIELDS(descr)) { + npy_int num_descr_fields = PyTuple_Size(descr->names); + if (num_descr_fields < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + for (npy_intp i = 0; i < num_descr_fields; i++) { + PyObject *key = PyTuple_GET_ITEM(descr->names, i); + PyObject *tup = PyObject_GetItem(descr->fields, key); + if (tup == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + PyArray_Descr *field_descr; + PyObject *title; + int offset; + if (!PyArg_ParseTuple(tup, "Oi|O", &field_descr, &offset, &title)) { + Py_DECREF(tup); + field_types_xclear(num_field_types, *ft); + return -1; + } + num_field_types = field_type_grow_recursive( + field_descr, num_field_types, ft, ft_size, + field_offset + offset); + if (num_field_types < 0) { + return -1; + } + } + return num_field_types; + } + + if (*ft_size <= num_field_types) { + npy_intp alloc_size = grow_size_and_multiply( + ft_size, 4, sizeof(field_type)); + if (alloc_size < 0) { + field_types_xclear(num_field_types, *ft); + return -1; + } + field_type *new_ft = PyMem_Realloc(*ft, alloc_size); + if (new_ft == NULL) { + field_types_xclear(num_field_types, *ft); + return -1; + } + *ft = new_ft; + } + + Py_INCREF(descr); + (*ft)[num_field_types].descr = descr; + (*ft)[num_field_types].set_from_ucs4 = get_from_ucs4_function(descr); + (*ft)[num_field_types].structured_offset = field_offset; + + return num_field_types + 1; +} + + +/* + * Prepare the "field_types" for the given dtypes/descriptors. Currently, + * we copy the itemsize, but the main thing is that we check for custom + * converters. + */ +npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft) +{ + if (descr->subarray != NULL) { + /* + * This could probably be allowed, but NumPy absorbs the dimensions + * so it is an awkward corner case that probably never really worked. + */ + PyErr_SetString(PyExc_TypeError, + "file reader does not support subarray dtypes. 
You can" + "put the dtype into a structured one using " + "`np.dtype(('name', dtype))` to avoid this limitation."); + return -1; + } + + npy_intp ft_size = 4; + *ft = PyMem_Malloc(ft_size * sizeof(field_type)); + if (*ft == NULL) { + return -1; + } + return field_type_grow_recursive(descr, 0, ft, &ft_size, 0); +} diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h new file mode 100644 index 000000000000..5c4cfb2c6bf8 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/field_types.h @@ -0,0 +1,49 @@ + +#ifndef _FIELD_TYPES_H_ +#define _FIELD_TYPES_H_ + +#include +#include +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + +/* + * The original code had some error details, but I assume that we don't need + * it. Printing the string from which we tried to modify it should be fine. + * This should potentially be public NumPy API, although it is tricky, NumPy + * + * This function must support unaligned memory access. + * + * NOTE: An earlier version of the code had unused default versions (pandas + * does this) when columns are missing. We could define this either + * by passing `NULL` in, or by adding a default explicitly somewhere. + * (I think users should probably have to define the default, at which + * point it doesn't matter here.) + * + * NOTE: We are currently passing the parser config, this could be made public + * or could be set up to be dtype specific/private. Always passing + * pconfig fully seems easier right now even if it may change. + */ +typedef int (set_from_ucs4_function)( + PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, + char *dataptr, parser_config *pconfig); + +typedef struct _field_type { + set_from_ucs4_function *set_from_ucs4; + /* The original NumPy descriptor */ + PyArray_Descr *descr; + /* Offset to this entry within row. */ + npy_intp structured_offset; +} field_type; + + +void +field_types_xclear(int num_field_types, field_type *ft); + +npy_intp +field_types_create(PyArray_Descr *descr, field_type **ft); + +#endif diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c new file mode 100644 index 000000000000..a38c6d5aa780 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/growth.c @@ -0,0 +1,38 @@ +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "templ_common.h" + +#include "textreading/growth.h" + + +/* + * Helper function taking the size input and growing it (based on min_grow). + * It further multiplies it with `itemsize` and ensures that all results fit + * into an `npy_intp`. + * Returns -1 if any overflow occurred or the result would not fit. + * The user has to ensure the input is size_t (i.e. unsigned). 
+ */
+npy_intp
+grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize) {
+    /* min_grow must be a power of two: */
+    assert((min_grow & (min_grow - 1)) == 0);
+    size_t growth = *size >> 2;
+    if (growth <= min_grow) {
+        *size += min_grow;
+    }
+    else {
+        *size += growth + min_grow - 1;
+        /* Round down to a multiple of min_grow (a power of two): */
+        *size &= ~(min_grow - 1);
+
+        if (*size > NPY_MAX_INTP) {
+            return -1;
+        }
+    }
+
+    npy_intp res;
+    if (npy_mul_with_overflow_intp(&res, (npy_intp)*size, itemsize)) {
+        return -1;
+    }
+    return res;
+}
+
diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h
new file mode 100644
index 000000000000..debe9a7b3175
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/growth.h
@@ -0,0 +1,7 @@
+#ifndef _NPY_GROWTH_H
+#define _NPY_GROWTH_H
+
+npy_intp
+grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize);
+
+#endif  /*_NPY_GROWTH_H */
diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
new file mode 100644
index 000000000000..c60565de1ce7
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -0,0 +1,77 @@
+
+#ifndef _PARSER_CONFIG_H_
+#define _PARSER_CONFIG_H_
+
+#include <stdbool.h>
+
+typedef struct {
+    /*
+     *  Field delimiter character.
+     *  Typically ',', ' ', '\t', ignored if `delimiter_is_whitespace` is true.
+     */
+    Py_UCS4 delimiter;
+
+    /*
+     *  Character used to quote fields.
+     *  Typically '"' or "'".  To disable quoting we set this to UINT_MAX
+     *  (which is not a valid unicode character and thus cannot occur in the
+     *  file; the same is used for all other characters if necessary).
+     */
+    Py_UCS4 quote;
+
+    /*
+     *  Character(s) that indicate the start of a comment.
+     *  Typically '#', '%' or ';'.
+     *  When encountered in a line and not inside quotes, all characters
+     *  from the comment character(s) to the end of the line are ignored.
+     */
+    Py_UCS4 comment;
+
+    /*
+     *  Ignore whitespace at the beginning of a field (outside/before quotes).
+     *  Is (and must be) set if `delimiter_is_whitespace`.
+     */
+    bool ignore_leading_whitespace;
+
+    /*
+     *  If true, the delimiter is ignored and any unicode whitespace is used
+     *  for splitting (same as `string.split()` in Python).  In that case
+     *  `ignore_leading_whitespace` should also be set.
+     */
+    bool delimiter_is_whitespace;
+
+    /*
+     *  If true, quoted fields may span more than one line.  For example,
+     *  the following
+     *      100, 200, "FOO
+     *      BAR"
+     *  is one "row", containing three fields: 100, 200 and "FOO\nBAR".
+     *  If false, the parser considers an unclosed quote to be an error.
+     *  (XXX Check!)
+     */
+    bool allow_embedded_newline;
+
+    /*
+     *  The imaginary unit character.  Default is `j`.
+     */
+    Py_UCS4 imaginary_unit;
+
+    /*
+     *  If true, when an integer dtype is given, the field is allowed
+     *  to contain a floating point value.  It will be cast to the
+     *  integer type.
+     */
+    bool allow_float_for_int;
+    /*
+     *  Data should be encoded as `latin1` when using the python converter
+     *  (implementing the `loadtxt` default Python 2 compatibility mode).
+     *  The c byte converter is used when the user requested `dtype="S"`.
+     *  In this case we go via `dtype=object`; however, loadtxt allows latin1
+     *  while normal object to string casts only accept ASCII, so it ensures
+     *  that the object array already contains bytes and not strings.
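+     *
+     *  Added illustration: with `c_byte_converters` set, a field like "abc"
+     *  is stored in the intermediate object array as the bytes object
+     *  b'abc' (latin1), so the final cast to an "S" dtype accepts latin1
+     *  characters that a plain str -> bytes cast would reject.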
+ */ + bool python_byte_converters; + bool c_byte_converters; +} parser_config; + + +#endif diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c new file mode 100644 index 000000000000..750e77b2d80c --- /dev/null +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -0,0 +1,199 @@ +#include +#include + +#define PY_SSIZE_T_CLEAN +#include + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" +#include "npy_argparse.h" +#include "conversion_utils.h" + +#include "textreading/parser_config.h" +#include "textreading/stream_pyobject.h" +#include "textreading/field_types.h" +#include "textreading/rows.h" +#include "textreading/str_to_int.h" + + +// +// `usecols` must point to a Python object that is Py_None or a 1-d contiguous +// numpy array with data type int32. +// +// `dtype` must point to a Python object that is Py_None or a numpy dtype +// instance. If the latter, code and sizes must be arrays of length +// num_dtype_fields, holding the flattened data field type codes and byte +// sizes. (num_dtype_fields, codes, and sizes can be inferred from dtype, +// but we do that in Python code.) +// +// If both `usecols` and `dtype` are not None, and the data type is compound, +// then len(usecols) must equal num_dtype_fields. +// +// If `dtype` is given and it is compound, and `usecols` is None, then the +// number of columns in the file must match the number of fields in `dtype`. +// +static PyObject * +_readtext_from_stream(stream *s, parser_config *pc, + PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows, + PyObject *converters, PyObject *dtype) +{ + PyArrayObject *arr = NULL; + PyArray_Descr *out_dtype = NULL; + int32_t *cols; + int ncols; + field_type *ft = NULL; + + /* TODO: Find better solution maybe? 
*/ + if (double_descr == NULL) { + double_descr = PyArray_DescrFromType(NPY_DOUBLE); + } + + /* + * If dtypes[0] is dtype the input was not structured and the result + * is considered "homogeneous" and we have to discover the number of + * columns/ + */ + out_dtype = (PyArray_Descr *)dtype; + Py_INCREF(out_dtype); + + npy_intp num_fields = field_types_create(out_dtype, &ft); + if (num_fields < 0) { + goto finish; + } + bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; + + if (usecols == Py_None) { + ncols = num_fields; + cols = NULL; + } + else { + ncols = PyArray_SIZE((PyArrayObject *)usecols); + cols = PyArray_DATA((PyArrayObject *)usecols); + } + + arr = read_rows( + s, max_rows, num_fields, ft, pc, + ncols, cols, skiprows, converters, + NULL, out_dtype, homogeneous); + if (arr == NULL) { + goto finish; + } + + finish: + Py_XDECREF(out_dtype); + field_types_xclear(num_fields, ft); + return (PyObject *)arr; +} + + +static int +parse_control_character(PyObject *obj, Py_UCS4 *character) +{ + if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) > 1) { + PyErr_Format(PyExc_TypeError, + "Control character must be a single unicode character or " + "empty unicode string; but got: %.100R", obj); + return 0; + } + if (PyUnicode_GET_LENGTH(obj) == 0) { + *character = (Py_UCS4)-1; /* character beyond unicode range */ + return 1; + } + *character = PyUnicode_READ_CHAR(obj, 0); + return 1; +} + + +NPY_NO_EXPORT PyObject * +_load_from_filelike(PyObject *NPY_UNUSED(mod), + PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames) +{ + PyObject *file; + Py_ssize_t skiprows = 0; + Py_ssize_t max_rows = -1; + PyObject *usecols = Py_None; + PyObject *converters = Py_None; + + PyObject *dtype = Py_None; + PyObject *encoding_obj = Py_None; + const char *encoding = NULL; + + parser_config pc = { + .delimiter = ',', + .comment = '#', + .quote = '"', + .imaginary_unit = 'j', + .allow_float_for_int = true, + .allow_embedded_newline = true, + .delimiter_is_whitespace = false, + .ignore_leading_whitespace = false, + .python_byte_converters = false, + .c_byte_converters = false, + }; + bool filelike = true; + + PyObject *arr = NULL; + + NPY_PREPARE_ARGPARSER; + if (npy_parse_arguments("_load_from_filelike", args, len_args, kwnames, + "file", NULL, &file, + "|delimiter", &parse_control_character, &pc.delimiter, + "|comment", &parse_control_character, &pc.comment, + "|quote", &parse_control_character, &pc.quote, + "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, + "|usecols", NULL, &usecols, + "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows, + "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, + "|converters", NULL, &converters, + "|dtype", NULL, &dtype, + "|encoding", NULL, &encoding_obj, + "|filelike", &PyArray_BoolConverter, &filelike, + "|byte_converters", &PyArray_BoolConverter, &pc.python_byte_converters, + "|c_byte_converters", PyArray_BoolConverter, &pc.c_byte_converters, + NULL, NULL, NULL) < 0) { + return NULL; + } + + if (pc.delimiter == (Py_UCS4)-1) { + pc.delimiter_is_whitespace = true; + /* Ignore leading whitespace to match `string.split(None)` */ + pc.ignore_leading_whitespace = true; + } + + if (!PyArray_DescrCheck(dtype) ) { + PyErr_SetString(PyExc_TypeError, + "internal error: dtype must be provided and be a NumPy dtype"); + return NULL; + } + + if (encoding_obj != Py_None) { + if (!PyUnicode_Check(encoding_obj)) { + PyErr_SetString(PyExc_TypeError, + "encoding must be a unicode string."); + return NULL; + } + encoding = 
PyUnicode_AsUTF8(encoding_obj);
+        if (encoding == NULL) {
+            return NULL;
+        }
+    }
+
+    stream *s;
+    if (filelike) {
+        s = stream_python_file(file, encoding);
+    }
+    else {
+        s = stream_python_iterable(file, encoding);
+    }
+    if (s == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Unable to access the file.");
+        return NULL;
+    }
+
+    arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows,
+                                converters, dtype);
+    stream_close(s);
+    return arr;
+}
+
diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h
new file mode 100644
index 000000000000..8c470736827a
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/readtext.h
@@ -0,0 +1,7 @@
+#ifndef READTEXT_H_
+#define READTEXT_H_
+
+NPY_NO_EXPORT PyObject *
+_load_from_filelike(PyObject *mod, PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames);
+
+#endif  /* READTEXT_H_ */
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
new file mode 100644
index 000000000000..9301abd5cf30
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -0,0 +1,438 @@
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/arrayobject.h"
+#include "numpy/npy_3kcompat.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include "textreading/stream.h"
+#include "textreading/tokenize.h"
+#include "textreading/conversions.h"
+#include "textreading/field_types.h"
+#include "textreading/rows.h"
+#include "textreading/growth.h"
+
+/*
+ * Minimum size to grow the allocation by (or 25%).  The 8KiB means the
+ * actual growth is within `8 KiB <= size < 16 KiB` (depending on the
+ * row size).
+ */
+#define MIN_BLOCK_SIZE (1 << 13)
+
+
+
+/*
+ * Create the array of converter functions from the Python converters.
+ */
+PyObject **
+create_conv_funcs(
+        PyObject *converters, int num_fields, int32_t *usecols)
+{
+    PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
+    if (conv_funcs == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    if (converters == Py_None) {
+        return conv_funcs;
+    }
+    else if (!PyDict_Check(converters)) {
+        PyErr_SetString(PyExc_TypeError,
+                "converters must be a dictionary mapping columns to converter "
+                "functions.");
+        /* Use the error path so `conv_funcs` is not leaked: */
+        goto error;
+    }
+
+    PyObject *key, *value;
+    Py_ssize_t pos = 0;
+    while (PyDict_Next(converters, &pos, &key, &value)) {
+        Py_ssize_t column = PyNumber_AsSsize_t(key, PyExc_IndexError);
+        if (column == -1 && PyErr_Occurred()) {
+            PyErr_Format(PyExc_TypeError,
+                    "keys of the converters dictionary must be integers; "
+                    "got %.100R", key);
+            goto error;
+        }
+        if (usecols != NULL) {
+            /*
+             * This code searches for the corresponding usecol.  It is
+             * identical to the legacy usecols code, which has two weaknesses:
+             * 1. It fails for duplicated usecols, only setting the converter
+             *    for the first one.
+             * 2. It fails e.g. if usecols uses negative indexing and
+             *    converters does not.  (This is a feature, since it allows
+             *    us to correctly normalize converters to the result column here.)
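+             *
+             * Added example: with `usecols=[1, -1]` and a converter keyed
+             * by -1, the key matches the raw usecols entry -1 and is
+             * normalized to result column 1 here.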
+ */ + int i = 0; + for (; i < num_fields; i++) { + if (column == usecols[i]) { + column = i; + break; + } + } + if (i == num_fields) { + continue; /* ignore unused converter */ + } + } + else { + if (column < -num_fields || column >= num_fields) { + PyErr_Format(PyExc_ValueError, + "converter specified for column %zd, which is invalid " + "for the number of fields %d.", column, num_fields); + goto error; + } + if (column < 0) { + column += num_fields; + } + } + if (!PyCallable_Check(value)) { + PyErr_Format(PyExc_TypeError, + "values of the converters dictionary must be callable, " + "but the value associated with key %R is not", key); + goto error; + } + Py_INCREF(value); + conv_funcs[column] = value; + } + return conv_funcs; + + error: + for (int i = 0; i < num_fields; i++) { + Py_XDECREF(conv_funcs[i]); + } + PyMem_FREE(conv_funcs); + return NULL; +} + +/** + * Read a file into the provided array, or create (and possibly grow) an + * array to read into. + * + * @param s The stream object/struct providing reading capabilities used by + * the tokenizer. + * @param max_rows The number of rows to read, or -1. If negative + * all rows are read. + * @param num_field_types The number of field types stored in `field_types`. + * @param field_types Information about the dtype for each column (or one if + * `homogeneous`). + * @param pconfig Pointer to the parser config object used by both the + * tokenizer and the conversion functions. + * @param num_usecols The number of columns in `usecols`. + * @param usecols An array of length `num_usecols` or NULL. If given indicates + * which column is read for each individual row (negative columns are + * accepted). + * @param skiplines The number of lines to skip, these lines are ignored. + * @param converters Python dictionary of converters. Finalizing converters + * is difficult without information about the number of columns. + * @param data_array An array to be filled or NULL. In either case a new + * reference is returned (the reference to `data_array` is not stolen). + * @param out_descr The dtype used for allocating a new array. This is not + * used if `data_array` is provided. Note that the actual dtype of the + * returned array can differ for strings. + * @param num_cols Pointer in which the actual (discovered) number of columns + * is returned. This is only relevant if `homogeneous` is true. + * @param homogeneous Whether the datatype of the array is not homogeneous, + * i.e. not structured. In this case the number of columns has to be + * discovered an the returned array will be 2-dimensional rather than + * 1-dimensional. + * + * @returns Returns the result as an array object or NULL on error. The result + * is always a new reference (even when `data_array` was passed in). + */ +PyArrayObject * +read_rows(stream *s, + npy_intp max_rows, int num_field_types, field_type *field_types, + parser_config *pconfig, int num_usecols, int *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous) +{ + char *data_ptr = NULL; + int current_num_fields; + size_t row_size = out_descr->elsize; + PyObject **conv_funcs = NULL; + + bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); + + int ndim = homogeneous ? 
2 : 1; + npy_intp result_shape[2] = {0, 1}; + + bool data_array_allocated = data_array == NULL; + /* Make sure we own `data_array` for the purpose of error handling */ + Py_XINCREF(data_array); + size_t rows_per_block = 1; /* will be increased depending on row size */ + Py_ssize_t data_allocated_rows = 0; + + int ts_result = 0; + tokenizer_state ts; + if (tokenizer_init(&ts, pconfig) < 0) { + goto error; + } + + /* Set the actual number of fields if it is already known, otherwise -1 */ + int actual_num_fields = -1; + if (usecols != NULL) { + actual_num_fields = num_usecols; + } + else if (!homogeneous) { + actual_num_fields = num_field_types; + } + + for (; skiplines > 0; skiplines--) { + ts.state = TOKENIZE_GOTO_LINE_END; + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + else if (ts_result != 0) { + /* Fewer lines than skiplines is acceptable */ + break; + } + } + + Py_ssize_t row_count = 0; /* number of rows actually processed */ + while ((max_rows < 0 || row_count < max_rows) && ts_result == 0) { + ts_result = tokenize(s, &ts, pconfig); + if (ts_result < 0) { + goto error; + } + current_num_fields = ts.num_fields; + field_info *fields = ts.fields; + if (ts.num_fields == 0) { + continue; /* Ignore empty line */ + } + + if (NPY_UNLIKELY(data_ptr == NULL)) { + // We've deferred some of the initialization tasks to here, + // because we've now read the first line, and we definitively + // know how many fields (i.e. columns) we will be processing. + if (actual_num_fields == -1) { + actual_num_fields = current_num_fields; + } + + conv_funcs = create_conv_funcs( + converters, actual_num_fields, usecols); + if (conv_funcs == NULL) { + goto error; + } + + /* Note that result_shape[1] is only used if homogeneous is true */ + result_shape[1] = actual_num_fields; + if (homogeneous) { + row_size *= actual_num_fields; + } + + if (data_array == NULL) { + if (max_rows < 0) { + /* + * Negative max_rows denotes to read the whole file, we + * approach this by allocating ever larger blocks. + * Adds a number of rows based on `MIN_BLOCK_SIZE`. + * Note: later code grows assuming this is a power of two. + */ + if (row_size == 0) { + /* actual rows_per_block should not matter here */ + rows_per_block = 512; + } + else { + /* safe on overflow since min_rows will be 0 or 1 */ + size_t min_rows = ( + (MIN_BLOCK_SIZE + row_size - 1) / row_size); + while (rows_per_block < min_rows) { + rows_per_block *= 2; + } + } + data_allocated_rows = rows_per_block; + } + else { + data_allocated_rows = max_rows; + } + result_shape[0] = data_allocated_rows; + Py_INCREF(out_descr); + /* + * We do not use Empty, as it would fill with None + * and requiring decref'ing if we shrink again. 
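+                 *
+                 * Added illustration of the block sizing above: for a
+                 * 24-byte row, min_rows is (8192 + 23) / 24 == 342, so
+                 * rows_per_block becomes 512 and the first allocation is
+                 * 512 * 24 == 12288 bytes, inside the intended window of
+                 * 8 KiB to 16 KiB.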
+ */ + data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr( + ndim, result_shape, out_descr); + if (data_array == NULL) { + goto error; + } + if (needs_init) { + memset(PyArray_BYTES(data_array), 0, PyArray_NBYTES(data_array)); + } + } + else { + assert(max_rows >=0); + data_allocated_rows = max_rows; + } + data_ptr = PyArray_BYTES(data_array); + } + + if (!usecols && (actual_num_fields != current_num_fields)) { + PyErr_Format(PyExc_ValueError, + "the number of columns changed from %d to %d at row %zu; " + "use `usecols` to select a subset and avoid this error", + actual_num_fields, current_num_fields, row_count+1); + goto error; + } + + if (NPY_UNLIKELY(data_allocated_rows == row_count)) { + /* + * Grow by ~25% and rounded up to the next rows_per_block + * NOTE: This is based on very crude timings and could be refined! + */ + size_t new_rows = data_allocated_rows; + npy_intp alloc_size = grow_size_and_multiply( + &new_rows, rows_per_block, row_size); + if (alloc_size < 0) { + /* should normally error much earlier, but make sure */ + PyErr_SetString(PyExc_ValueError, + "array is too big. Cannot read file as a single array; " + "providing a maximum number of rows to read may help."); + goto error; + } + + char *new_data = PyDataMem_RENEW( + PyArray_BYTES(data_array), alloc_size ? alloc_size : 1); + if (new_data == NULL) { + PyErr_NoMemory(); + goto error; + } + /* Replace the arrays data since it may have changed */ + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = new_rows; + data_ptr = new_data + row_count * row_size; + data_allocated_rows = new_rows; + if (needs_init) { + memset(data_ptr, '\0', (new_rows - row_count) * row_size); + } + } + + for (int i = 0; i < actual_num_fields; ++i) { + int f; /* The field, either 0 (if homogeneous) or i. */ + int col; /* The column as read, remapped by usecols */ + char *item_ptr; + if (homogeneous) { + f = 0; + item_ptr = data_ptr + i * field_types[0].descr->elsize; + } + else { + f = i; + item_ptr = data_ptr + field_types[f].structured_offset; + } + + if (usecols == NULL) { + col = i; + } + else { + col = usecols[i]; + if (col < 0) { + // Python-like column indexing: k = -1 means the last column. 
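+                    // (e.g. with 5 columns, usecols=[-1] reads column 4)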
+ col += current_num_fields; + } + if (NPY_UNLIKELY((col < 0) || (col >= current_num_fields))) { + PyErr_Format(PyExc_ValueError, + "invalid column index %d at row %zu with %d " + "columns", + usecols[i], current_num_fields, row_count+1); + goto error; + } + } + + bool err = 0; + Py_UCS4 *str = ts.field_buffer + fields[col].offset; + Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1; + if (conv_funcs[i] == NULL) { + if (field_types[f].set_from_ucs4(field_types[f].descr, + str, end, item_ptr, pconfig) < 0) { + err = true; + } + } + else { + if (to_generic_with_converter(field_types[f].descr, + str, end, item_ptr, pconfig, conv_funcs[i]) < 0) { + err = true; + } + } + + if (NPY_UNLIKELY(err)) { + PyObject *exc, *val, *tb; + PyErr_Fetch(&exc, &val, &tb); + + size_t length = end - str; + PyObject *string = PyUnicode_FromKindAndData( + PyUnicode_4BYTE_KIND, str, length); + if (string == NULL) { + npy_PyErr_ChainExceptions(exc, val, tb); + goto error; + } + PyErr_Format(PyExc_ValueError, + "could not convert string %.100R to %S at " + "row %zu, column %d.", + string, field_types[f].descr, row_count, col+1); + Py_DECREF(string); + npy_PyErr_ChainExceptionsCause(exc, val, tb); + goto error; + } + } + + ++row_count; + data_ptr += row_size; + } + + tokenizer_clear(&ts); + PyMem_FREE(conv_funcs); + + if (data_array == NULL) { + assert(row_count == 0 && result_shape[0] == 0); + if (actual_num_fields == -1) { + /* + * We found no rows and have to discover the number of elements + * we have no choice but to guess 1. + * NOTE: It may make sense to move this outside of here to refine + * the behaviour where necessary. + */ + result_shape[1] = 1; + } + else { + result_shape[1] = actual_num_fields; + } + Py_INCREF(out_descr); + data_array = (PyArrayObject *)PyArray_Empty( + ndim, result_shape, out_descr, 0); + } + + /* + * Note that if there is no data, `data_array` may still be NULL and + * row_count is 0. In that case, always realloc just in case. + */ + if (data_array_allocated && data_allocated_rows != row_count) { + size_t size = row_count * row_size; + char *new_data = PyDataMem_RENEW( + PyArray_BYTES(data_array), size ? 
size : 1); + if (new_data == NULL) { + Py_DECREF(data_array); + PyErr_NoMemory(); + return NULL; + } + ((PyArrayObject_fields *)data_array)->data = new_data; + ((PyArrayObject_fields *)data_array)->dimensions[0] = row_count; + } + + return data_array; + + error: + PyMem_FREE(conv_funcs); + tokenizer_clear(&ts); + Py_XDECREF(data_array); + return NULL; +} diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h new file mode 100644 index 000000000000..773e0f8e0636 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -0,0 +1,22 @@ + +#ifndef _ROWS_H_ +#define _ROWS_H_ + +#define PY_SSIZE_T_CLEAN +#include +#include + +#include "textreading/stream.h" +#include "textreading/field_types.h" +#include "textreading/parser_config.h" + + +PyArrayObject * +read_rows(stream *s, + npy_intp nrows, int num_field_types, field_type *field_types, + parser_config *pconfig, int num_usecols, int *usecols, + Py_ssize_t skiplines, PyObject *converters, + PyArrayObject *data_array, PyArray_Descr *out_descr, + bool homogeneous); + +#endif diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c new file mode 100644 index 000000000000..647e79a4f2b7 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -0,0 +1,87 @@ + +#include + +#include +#include "textreading/str_to_int.h" +#include "textreading/conversions.h" +#include "textreading/parser_config.h" + + +NPY_NO_EXPORT PyArray_Descr *double_descr = NULL; + +// TODO: The float fallbacks are seriously awkward, why? Or at least why this way? +#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ + int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + int64_t parsed; \ + intw##_t x; \ + \ + if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \ + if (pconfig->allow_float_for_int) { \ + double fx; \ + if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \ + return -1; \ + } \ + else { \ + x = (intw##_t) fx; \ + } \ + } \ + else { \ + return -1; \ + } \ + } \ + else { \ + x = (intw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + } \ + return 0; \ + } + +#define DECLARE_TO_UINT(uintw, UINT_MAX) \ + int \ + to_##uintw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + uint64_t parsed; \ + uintw##_t x; \ + \ + if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \ + if (pconfig->allow_float_for_int) { \ + double fx; \ + if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \ + return -1; \ + } \ + else { \ + x = (uintw##_t) fx; \ + } \ + } \ + else { \ + return -1; \ + } \ + } \ + else { \ + x = (uintw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + } \ + return 0; \ + } + +DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX) +DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX) +DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX) +DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX) + +DECLARE_TO_UINT(uint8, UINT8_MAX) +DECLARE_TO_UINT(uint16, UINT16_MAX) +DECLARE_TO_UINT(uint32, UINT32_MAX) +DECLARE_TO_UINT(uint64, UINT64_MAX) diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h new file mode 100644 index 
000000000000..9cead56f08f6 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/str_to_int.h @@ -0,0 +1,175 @@ +#ifndef STR_TO_INT_H +#define STR_TO_INT_H + +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/ndarraytypes.h" + +#include "textreading/parser_config.h" + +extern NPY_NO_EXPORT PyArray_Descr *double_descr; + +/* + * The following two string conversion functions are largely equivalent + * in Pandas. They are in the header file here, to ensure they can be easily + * inline in the other function. + * Unlike pandas, pass in end-pointer (do not rely on \0) and return 0 or -1. + * + * The actual functions are defined using macro templating below. + */ +static NPY_INLINE int +str_to_int64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + int64_t int_min, int64_t int_max, int64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + bool isneg = 0; + int64_t number = 0; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + isneg = true; + ++p; + } + else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + if (isneg) { + // If number is greater than pre_min, at least one more digit + // can be processed without overflowing. + int dig_pre_min = -(int_min % 10); + int64_t pre_min = int_min / 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number > pre_min) || ((number == pre_min) && (d - '0' <= dig_pre_min))) { + number = number * 10 - (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + else { + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + int64_t pre_max = int_max / 10; + int dig_pre_max = int_max % 10; + + // Process the digits. + int d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? + if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +static NPY_INLINE int +str_to_uint64( + const Py_UCS4 *p_item, const Py_UCS4 *p_end, + uint64_t uint_max, uint64_t *result) +{ + const Py_UCS4 *p = (const Py_UCS4 *)p_item; + uint64_t number = 0; + int d; + + // Skip leading spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + return -1; + } + if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + return -1; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + uint64_t pre_max = uint_max / 10; + int dig_pre_max = uint_max % 10; + + // Process the digits. + d = *p; + while (isdigit(d)) { + if ((number < pre_max) || ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + } + else { + return -1; + } + } + + // Skip trailing spaces. + while (Py_UNICODE_ISSPACE(*p)) { + ++p; + } + + // Did we use up all the characters? 
+ if (p != p_end) { + return -1; + } + + *result = number; + return 0; +} + + +#define DECLARE_TO_INT_PROTOTYPE(intw) \ + int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig); + +DECLARE_TO_INT_PROTOTYPE(int8) +DECLARE_TO_INT_PROTOTYPE(int16) +DECLARE_TO_INT_PROTOTYPE(int32) +DECLARE_TO_INT_PROTOTYPE(int64) + +DECLARE_TO_INT_PROTOTYPE(uint8) +DECLARE_TO_INT_PROTOTYPE(uint16) +DECLARE_TO_INT_PROTOTYPE(uint32) +DECLARE_TO_INT_PROTOTYPE(uint64) + +#endif diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h new file mode 100644 index 000000000000..0c4567329323 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -0,0 +1,29 @@ +#ifndef _STREAM_H_ +#define _STREAM_H_ + +#include + +/* + * When getting the next line, we hope that the buffer provider can already + * give some information about the newlines, because for Python iterables + * we definitely expect to get line-by-line buffers. + */ +#define BUFFER_MAY_CONTAIN_NEWLINE 0 +#define BUFFER_IS_PARTIAL_LINE 1 +#define BUFFER_IS_LINEND 2 +#define BUFFER_IS_FILEEND 3 + +typedef struct _stream { + void *stream_data; + int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind); + // Note that the first argument to stream_close is the stream pointer + // itself, not the stream_data pointer. + int (*stream_close)(struct _stream *strm); +} stream; + + +#define stream_nextbuf(s, start, end, kind) \ + ((s)->stream_nextbuf((s)->stream_data, start, end, kind)) +#define stream_close(s) ((s)->stream_close((s))) + +#endif diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c new file mode 100644 index 000000000000..ccc902657596 --- /dev/null +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -0,0 +1,271 @@ +/* + * C side structures to provide capabilities to read Python file like objects + * in chunks, or iterate through iterables with each result representing a + * single line of a file. + */ + +#include +#include + +#define PY_SSIZE_T_CLEAN +#include +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "numpy/arrayobject.h" + +#include "textreading/stream.h" + +#define READ_CHUNKSIZE 1 << 14 + + +typedef struct { + /* The Python file object being read. */ + PyObject *file; + + /* The `read` attribute of the file object. */ + PyObject *read; + /* Amount to read each time we call `obj.read()` */ + PyObject *chunksize; + + /* file position when the file_buffer was created. */ + off_t initial_file_pos; + + /* Python str object holding the line most recently read from the file. */ + PyObject *chunk; + + /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */ + const char *encoding; +} python_chunks_from_file; + + +/* + * Helper function to support byte objects as well as unicode strings. + * + * NOTE: Steals a reference to `str` (although usually returns it unmodified). 
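+ *
+ * Added illustration: a bytes chunk b"1,2\n" with encoding "latin1" becomes
+ * the str "1,2\n"; str input is returned unchanged; any other type raises
+ * TypeError.  The reference to `str` is consumed on every path (transferred
+ * to the return value when the input is returned unchanged).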
+ */ +static NPY_INLINE PyObject * +process_stringlike(PyObject *str, const char *encoding) +{ + if (PyBytes_Check(str)) { + PyObject *ustr; + ustr = PyUnicode_FromEncodedObject(str, encoding, NULL); + if (ustr == NULL) { + return NULL; + } + Py_DECREF(str); + return ustr; + } + else if (!PyUnicode_Check(str)) { + PyErr_SetString(PyExc_TypeError, + "non-string returned while reading data"); + Py_DECREF(str); + return NULL; + } + return str; +} + + +static NPY_INLINE void +buffer_info_from_unicode(PyObject *str, char **start, char **end, int *kind) +{ + Py_ssize_t length = PyUnicode_GET_LENGTH(str); + *kind = PyUnicode_KIND(str); + + if (*kind == PyUnicode_1BYTE_KIND) { + *start = (char *)PyUnicode_1BYTE_DATA(str); + } + else if (*kind == PyUnicode_2BYTE_KIND) { + *start = (char *)PyUnicode_2BYTE_DATA(str); + length *= sizeof(Py_UCS2); + } + else if (*kind == PyUnicode_4BYTE_KIND) { + *start = (char *)PyUnicode_4BYTE_DATA(str); + length *= sizeof(Py_UCS4); + } + *end = *start + length; +} + + +static int +fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind) +{ + Py_XDECREF(fb->chunk); + fb->chunk = NULL; + + PyObject *chunk = PyObject_CallFunctionObjArgs(fb->read, fb->chunksize, NULL); + if (chunk == NULL) { + return -1; + } + fb->chunk = process_stringlike(chunk, fb->encoding); + if (fb->chunk == NULL) { + return -1; + } + buffer_info_from_unicode(fb->chunk, start, end, kind); + if (*start == *end) { + return BUFFER_IS_FILEEND; + } + return BUFFER_MAY_CONTAIN_NEWLINE; +} + + +static int +fb_del(stream *strm) +{ + python_chunks_from_file *fb = (python_chunks_from_file *)strm->stream_data; + + Py_XDECREF(fb->file); + Py_XDECREF(fb->read); + Py_XDECREF(fb->chunksize); + Py_XDECREF(fb->chunk); + + free(fb); + free(strm); + + return 0; +} + + +stream * +stream_python_file(PyObject *obj, const char *encoding) +{ + python_chunks_from_file *fb; + stream *strm; + + fb = (python_chunks_from_file *) malloc(sizeof(python_chunks_from_file)); + if (fb == NULL) { + PyErr_NoMemory(); + return NULL; + } + + fb->file = NULL; + fb->read = NULL; + fb->chunksize = NULL; + fb->chunk = NULL; + fb->encoding = encoding; + + strm = (stream *) malloc(sizeof(stream)); + if (strm == NULL) { + PyErr_NoMemory(); + free(fb); + return NULL; + } + + fb->file = obj; + Py_INCREF(fb->file); + + fb->read = PyObject_GetAttrString(obj, "read"); + if (fb->read == NULL) { + goto fail; + } + fb->chunksize = PyLong_FromLong(READ_CHUNKSIZE); + if (fb->chunksize == NULL) { + goto fail; + } + + strm->stream_data = (void *)fb; + strm->stream_nextbuf = (void *)&fb_nextbuf; + strm->stream_close = &fb_del; + + return strm; + +fail: + fb_del(strm); + return NULL; +} + + +/* + * Stream from a Python iterable by interpreting each item as a line in a file + */ +typedef struct { + /* The Python file object being read. 
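+       (for this stream type: the Python iterator being consumed,
+       one item per line)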
+    PyObject *iterator;
+
+    /* Python str object holding the line most recently fetched */
+    PyObject *line;
+
+    /* Encoding compatible with Python's `PyUnicode_Encode` (may be NULL) */
+    const char *encoding;
+} python_lines_from_iterator;
+
+
+static int
+it_del(stream *strm)
+{
+    python_lines_from_iterator *it = (python_lines_from_iterator *)strm->stream_data;
+
+    Py_XDECREF(it->iterator);
+    Py_XDECREF(it->line);
+
+    free(it);
+    free(strm);
+
+    return 0;
+}
+
+
+static int
+it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind)
+{
+    Py_XDECREF(it->line);
+    it->line = NULL;
+
+    PyObject *line = PyIter_Next(it->iterator);
+    if (line == NULL) {
+        if (PyErr_Occurred()) {
+            return -1;
+        }
+        *start = NULL;
+        *end = NULL;
+        return BUFFER_IS_FILEEND;
+    }
+    it->line = process_stringlike(line, it->encoding);
+    if (it->line == NULL) {
+        return -1;
+    }
+
+    buffer_info_from_unicode(it->line, start, end, kind);
+    return BUFFER_IS_LINEND;
+}
+
+
+stream *
+stream_python_iterable(PyObject *obj, const char *encoding)
+{
+    python_lines_from_iterator *it;
+    stream *strm;
+
+    it = (python_lines_from_iterator *)malloc(sizeof(*it));
+    if (it == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    it->iterator = NULL;
+    it->line = NULL;
+    it->encoding = encoding;
+
+    strm = (stream *) malloc(sizeof(stream));
+    if (strm == NULL) {
+        PyErr_NoMemory();
+        free(it);
+        return NULL;
+    }
+    /* Set stream_data immediately so that it_del() is safe on error paths. */
+    strm->stream_data = (void *)it;
+
+    if (!PyIter_Check(obj)) {
+        PyErr_SetString(PyExc_TypeError,
+                "error reading from object, expected an iterable.");
+        goto fail;
+    }
+    Py_INCREF(obj);
+    it->iterator = obj;
+
+    strm->stream_nextbuf = (void *)&it_nextbuf;
+    strm->stream_close = &it_del;
+
+    return strm;
+
+fail:
+    it_del(strm);
+    return NULL;
+}
diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h
new file mode 100644
index 000000000000..93357e352cb4
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h
@@ -0,0 +1,16 @@
+
+#ifndef _STREAM_PYTHON_FILE_BY_LINE
+#define _STREAM_PYTHON_FILE_BY_LINE
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include "textreading/stream.h"
+
+stream *
+stream_python_file(PyObject *obj, const char *encoding);
+
+stream *
+stream_python_iterable(PyObject *obj, const char *encoding);
+
+#endif
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src
new file mode 100644
index 000000000000..dcddb1b36a46
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/tokenize.c.src
@@ -0,0 +1,449 @@
+
+#include <Python.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include "numpy/ndarraytypes.h"
+
+#include "textreading/stream.h"
+#include "textreading/tokenize.h"
+#include "textreading/parser_config.h"
+#include "textreading/growth.h"
+
+
+/*
+    How parsing quoted fields works:
+
+    For quoting to be activated, the first character of the field
+    must be the quote character (after taking into account
+    ignore_leading_whitespace). While quoting is active, delimiters
+    are treated as regular characters, not delimiters. Quoting is
+    deactivated by the second occurrence of the quote character. An
+    exception is the occurrence of two consecutive quote characters,
+    which is treated as a literal occurrence of a single quote character.
+    E.g. (with delimiter=',' and quote='"'):
+        12.3,"New York, NY","3'2"""
+    The second and third fields are `New York, NY` and `3'2"`.
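+
+    Another instance of the doubled-quote rule (same settings): in the line
+        1,"o""k"
+    the second field is `o"k`.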
+
+    If a non-delimiter occurs after the closing quote, the quote is
+    ignored and parsing continues with quoting deactivated.  Quotes
+    that occur while quoting is not activated are not handled specially;
+    they become part of the data.
+    E.g.:
+        12.3,"ABC"DEF,XY"Z
+    The second and third fields are `ABCDEF` and `XY"Z`.
+
+    Note that the second field of
+        12.3,"ABC" ,4.5
+    is `ABC `.  Currently there is no option to ignore whitespace
+    at the end of a field.
+*/
+
+
+/**begin repeat
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+copy_to_field_buffer_@type@(tokenizer_state *ts,
+        const @type@ *chunk_start, const @type@ *chunk_end)
+{
+    size_t chunk_length = chunk_end - chunk_start;
+    size_t size = chunk_length + ts->field_buffer_pos + 2;
+
+    if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
+        npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
+        if (alloc_size < 0) {
+            PyErr_Format(PyExc_ValueError,
+                    "line too long to handle while reading file.");
+            return -1;
+        }
+        Py_UCS4 *grown = PyMem_Realloc(ts->field_buffer, alloc_size);
+        if (grown == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        ts->field_buffer_length = size;
+        ts->field_buffer = grown;
+    }
+
+    Py_UCS4 *write_pos = ts->field_buffer + ts->field_buffer_pos;
+    for (; chunk_start < chunk_end; chunk_start++, write_pos++) {
+        *write_pos = (Py_UCS4)*chunk_start;
+    }
+    *write_pos = '\0';  /* always ensure we end with NUL */
+    ts->field_buffer_pos += chunk_length;
+    return 0;
+}
+/**end repeat**/
+
+
+static NPY_INLINE int
+add_field(tokenizer_state *ts)
+{
+    /* The previous field is done, advance to keep a NUL byte at the end */
+    ts->field_buffer_pos += 1;
+
+    if (NPY_UNLIKELY((size_t)ts->num_fields + 1 > ts->fields_size)) {
+        size_t size = (size_t)ts->num_fields;
+
+        npy_intp alloc_size = grow_size_and_multiply(
+                &size, 4, sizeof(field_info));
+        if (alloc_size < 0) {
+            /* Check for a size overflow; this path should be almost impossible. */
+            PyErr_Format(PyExc_ValueError,
+                    "too many columns found; cannot read file.");
+            return -1;
+        }
+        field_info *fields = PyMem_Realloc(ts->fields, alloc_size);
+        if (fields == NULL) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        ts->fields = fields;
+        ts->fields_size = size;
+    }
+
+    ts->fields[ts->num_fields].offset = ts->field_buffer_pos;
+    ts->fields[ts->num_fields].quoted = false;
+    ts->num_fields += 1;
+    /* Ensure this (currently empty) word is NUL terminated. */
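+    /* (copy_to_field_buffer() overwrites and re-terminates it as data arrives.) */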
+    ts->field_buffer[ts->field_buffer_pos] = '\0';
+    return 0;
+}
+
+
+/**begin repeat
+ * #kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, PyUnicode_4BYTE_KIND#
+ * #type = Py_UCS1, Py_UCS2, Py_UCS4#
+ */
+static NPY_INLINE int
+tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config)
+{
+    @type@ *pos = (@type@ *)ts->pos;
+    @type@ *stop = (@type@ *)ts->end;
+    @type@ *chunk_start;
+
+    if (ts->state == TOKENIZE_CHECK_QUOTED) {
+        /* before we can check for quotes, strip leading whitespace */
+        if (config->ignore_leading_whitespace) {
+            while (pos < stop && Py_UNICODE_ISSPACE(*pos) &&
+                    *pos != '\r' && *pos != '\n') {
+                pos++;
+            }
+            if (pos == stop) {
+                ts->pos = (char *)pos;
+                return 0;
+            }
+        }
+
+        /* Setting chunk effectively starts the field */
+        if (*pos == config->quote) {
+            ts->fields[ts->num_fields - 1].quoted = true;
+            ts->state = TOKENIZE_QUOTED;
+            pos++;  /* TOKENIZE_QUOTED is OK with pos == stop */
+        }
+        else {
+            /* Set to TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE */
+            ts->state = ts->unquoted_state;
+        }
+    }
+
+    switch (ts->state) {
+        case TOKENIZE_UNQUOTED:
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (*pos == config->delimiter) {
+                    ts->state = TOKENIZE_INIT;
+                    break;
+                }
+                else if (*pos == config->comment) {
+                    ts->state = TOKENIZE_GOTO_LINE_END;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_UNQUOTED_WHITESPACE:
+            /* Note, this branch is largely identical to `TOKENIZE_UNQUOTED` */
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+                else if (Py_UNICODE_ISSPACE(*pos)) {
+                    ts->state = TOKENIZE_INIT;
+                    break;
+                }
+                else if (*pos == config->comment) {
+                    ts->state = TOKENIZE_GOTO_LINE_END;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_QUOTED:
+            chunk_start = pos;
+            for (; pos < stop; pos++) {
+                if (!config->allow_embedded_newline) {
+                    if (*pos == '\r') {
+                        ts->state = TOKENIZE_EAT_CRLF;
+                        break;
+                    }
+                    else if (*pos == '\n') {
+                        ts->state = TOKENIZE_LINE_END;
+                        break;
+                    }
+                }
+                else if (*pos != config->quote) {
+                    /* inside the field, nothing to do. */
+                }
+                else {
+                    ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE;
+                    break;
+                }
+            }
+            if (copy_to_field_buffer_@type@(ts, chunk_start, pos) < 0) {
+                return -1;
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE:
+            if (*pos == config->quote) {
+                ts->state = TOKENIZE_QUOTED;
+                pos++;
+            }
+            else {
+                /* continue parsing as if unquoted */
+                ts->state = TOKENIZE_UNQUOTED;
+            }
+            break;
+
+        case TOKENIZE_GOTO_LINE_END:
+            if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) {
+                pos = stop;  /* advance to next buffer */
+                ts->state = TOKENIZE_LINE_END;
+                break;
+            }
+            for (; pos < stop; pos++) {
+                if (*pos == '\r') {
+                    ts->state = TOKENIZE_EAT_CRLF;
+                    break;
+                }
+                else if (*pos == '\n') {
+                    ts->state = TOKENIZE_LINE_END;
+                    break;
+                }
+            }
+            pos++;
+            break;
+
+        case TOKENIZE_EAT_CRLF:
+            /* "Universal newline" support: remove \n in \r\n.
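+             * E.g. the buffers "1,2\r" and "\n3,4" tokenize like "1,2\n3,4";
+             * a lone '\r' also acts as a line end.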
+             */
+            if (*pos == '\n') {
+                pos++;
+            }
+            ts->state = TOKENIZE_LINE_END;
+            break;
+
+        default:
+            assert(0);
+    }
+
+    ts->pos = (char *)pos;
+    return 0;
+}
+/**end repeat**/
+
+
+/*
+ * This version now always copies the full "row" (all tokens). This makes
+ * two things easier:
+ * 1. It means that every word is guaranteed to be followed by a NUL character
+ *    (although it can include one as well).
+ * 2. In the usecols case we can sniff the first row easier by parsing it
+ *    fully.
+ *
+ * The tokenizer could grow the ability to skip fields and check the
+ * maximum number of fields when known.
+ *
+ * Unlike other tokenizers, this one tries to work in chunks and copies
+ * data to words only when it it has to. The hope is that this makes multiple
+ * light-weight loops rather than a single heavy one, to allow e.g. quickly
+ * scanning for the end of a field.
+ */
+int
+tokenize(stream *s, tokenizer_state *ts, parser_config *const config)
+{
+    assert(ts->fields_size >= 2);
+    assert(ts->field_buffer_length >= 2*sizeof(Py_UCS4));
+
+    int finished_reading_file = 0;
+
+    /* Reset to start of buffer */
+    ts->field_buffer_pos = 0;
+    ts->num_fields = 0;
+    /* (The first field is added by the TOKENIZE_INIT state below.) */
+
+    while (1) {
+        if (ts->state == TOKENIZE_INIT) {
+            /* Start a new field */
+            if (add_field(ts) < 0) {
+                return -1;
+            }
+            ts->state = TOKENIZE_CHECK_QUOTED;
+        }
+
+        if (NPY_UNLIKELY(ts->pos >= ts->end)) {
+            if (ts->buf_state == BUFFER_IS_LINEND &&
+                    ts->state != TOKENIZE_QUOTED &&
+                    ts->state != TOKENIZE_CHECK_QUOTED) {
+                /*
+                 * Finished line, do not read anymore (also do not eat \n).
+                 * If we are in a quoted field and the "line" does not end with
+                 * a newline, the quoted field will be missing it right now.
+                 * TODO: We should probably just insert a "\n" character here,
+                 *       which is also closer to what the python code did
+                 *       (either by setting pos/end or manually).
+                 */
+                goto finish;
+            }
+            /* fetch new data */
+            ts->buf_state = stream_nextbuf(s,
+                    &ts->pos, &ts->end, &ts->unicode_kind);
+            if (ts->buf_state < 0) {
+                return -1;
+            }
+            if (ts->buf_state == BUFFER_IS_FILEEND) {
+                finished_reading_file = 1;
+                ts->pos = ts->end;  /* should be guaranteed, but make sure. */
+                goto finish;
+            }
+            else if (ts->pos == ts->end) {
+                if (ts->buf_state != BUFFER_IS_LINEND) {
+                    PyErr_SetString(PyExc_RuntimeError,
+                            "Reader returned an empty buffer, "
+                            "but did not indicate file or line end.");
+                    return -1;
+                }
+                /* Otherwise, we are OK with this and assume an empty line. */
+                goto finish;
+            }
+        }
+        int status;
+        if (ts->unicode_kind == PyUnicode_1BYTE_KIND) {
+            status = tokenizer_core_Py_UCS1(ts, config);
+        }
+        else if (ts->unicode_kind == PyUnicode_2BYTE_KIND) {
+            status = tokenizer_core_Py_UCS2(ts, config);
+        }
+        else {
+            assert(ts->unicode_kind == PyUnicode_4BYTE_KIND);
+            status = tokenizer_core_Py_UCS4(ts, config);
+        }
+        if (status < 0) {
+            return -1;
+        }
+
+        if (ts->state == TOKENIZE_LINE_END) {
+            goto finish;
+        }
+    }
+
+  finish:
+    /* Finish the last field */
+    if (add_field(ts) < 0) {
+        return -1;
+    }
+    ts->num_fields -= 1;
+    /*
+     * If we have one field, but that field is completely empty, this is an
+     * empty line, and we just ignore it.
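+     * E.g. tokenizing the line "\n" yields one empty, unquoted field, which
+     * is dropped here; the line "," yields two empty fields and is kept.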
+     */
+    if (ts->num_fields == 1
+            && ts->fields[1].offset - ts->fields[0].offset == 1
+            && !ts->fields->quoted) {
+        ts->num_fields--;
+    }
+    ts->state = TOKENIZE_INIT;
+    return finished_reading_file;
+}
+
+
+void
+tokenizer_clear(tokenizer_state *ts)
+{
+    PyMem_FREE(ts->field_buffer);
+    ts->field_buffer = NULL;
+    ts->field_buffer_length = 0;
+
+    PyMem_FREE(ts->fields);
+    ts->fields = NULL;
+    ts->fields_size = 0;
+}
+
+
+/*
+ * Initialize the tokenizer. We may want to copy all important config
+ * variables into the tokenizer. This would improve the cache locality during
+ * tokenizing.
+ */
+int
+tokenizer_init(tokenizer_state *ts, parser_config *config)
+{
+    /* State and buf_state could be moved into tokenize if we go by row */
+    ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE;
+    ts->state = TOKENIZE_INIT;
+    if (config->delimiter_is_whitespace) {
+        ts->unquoted_state = TOKENIZE_UNQUOTED_WHITESPACE;
+    }
+    else {
+        ts->unquoted_state = TOKENIZE_UNQUOTED;
+    }
+    ts->num_fields = 0;
+
+    ts->buf_state = 0;
+    ts->pos = NULL;
+    ts->end = NULL;
+
+    ts->field_buffer = PyMem_Malloc(32 * sizeof(Py_UCS4));
+    if (ts->field_buffer == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    ts->field_buffer_length = 32;
+
+    ts->fields = PyMem_Malloc(4 * sizeof(*ts->fields));
+    if (ts->fields == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    ts->fields_size = 4;
+    return 0;
+}
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
new file mode 100644
index 000000000000..aeac63107f5f
--- /dev/null
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -0,0 +1,77 @@
+
+#ifndef _TOKENIZE_H_
+#define _TOKENIZE_H_
+
+#include <stdbool.h>
+#include "textreading/stream.h"
+#include "textreading/parser_config.h"
+
+
+typedef enum {
+    /* Initialization of fields */
+    TOKENIZE_INIT,
+    TOKENIZE_CHECK_QUOTED,
+    /* Main field parsing states */
+    TOKENIZE_UNQUOTED,
+    TOKENIZE_UNQUOTED_WHITESPACE,
+    TOKENIZE_QUOTED,
+    /* Handling of two character control sequences (except "\r\n") */
+    TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE,
+    /* Line end handling */
+    TOKENIZE_LINE_END,
+    TOKENIZE_EAT_CRLF,  /* "\r\n" support (carriage return, line feed) */
+    TOKENIZE_GOTO_LINE_END,
+} tokenizer_parsing_state;
+
+
+
+typedef struct {
+    size_t offset;
+    bool quoted;
+} field_info;
+
+
+typedef struct {
+    tokenizer_parsing_state state;
+    /* Either TOKENIZE_UNQUOTED or TOKENIZE_UNQUOTED_WHITESPACE: */
+    tokenizer_parsing_state unquoted_state;
+    int unicode_kind;
+    int buf_state;
+    size_t num_fields;
+    /* the buffer we are currently working on */
+    char *pos;
+    char *end;
+    /*
+     * Space to copy words into. The buffer must always be at least two NUL
+     * entries longer (8 bytes) than the actual word (including initially).
+     * The first byte beyond the current word is always NUL'ed on write, the
+     * second byte is there to allow easy appending of an additional empty
+     * word at the end (this word is also NUL terminated).
+     */
+    size_t field_buffer_length;
+    size_t field_buffer_pos;
+    Py_UCS4 *field_buffer;
+
+    /*
+     * Fields, including information about the field being quoted. This
+     * always includes one "additional" empty field. The length of a field
+     * is equal to `fields[i+1].offset - fields[i].offset - 1`.
+     *
+     * The tokenizer assumes at least one field is allocated.
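+     *
+     * E.g. after tokenizing the row `ab,c`, field_buffer holds
+     * "ab\0c\0\0" and the offsets are {0, 3, 5}: field 0 has length
+     * 3 - 0 - 1 == 2 and field 1 has length 5 - 3 - 1 == 1.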
+ */ + field_info *fields; + size_t fields_size; +} tokenizer_state; + + +void +tokenizer_clear(tokenizer_state *ts); + + +int +tokenizer_init(tokenizer_state *ts, parser_config *config); + +int +tokenize(stream *s, tokenizer_state *ts, parser_config *const config); + +#endif diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index a6c2d4c2da4b..c2472f6015fa 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -5,6 +5,7 @@ import warnings import weakref import contextlib +import operator from operator import itemgetter, index as opindex, methodcaller from collections.abc import Mapping @@ -13,6 +14,7 @@ from ._datasource import DataSource from numpy.core import overrides from numpy.core.multiarray import packbits, unpackbits +from numpy.core._multiarray_umath import _load_from_filelike from numpy.core.overrides import set_array_function_like_doc, set_module from ._iotools import ( LineSplitter, NameValidator, StringConverter, ConverterError, @@ -721,101 +723,6 @@ def _savez(file, args, kwds, compress, allow_pickle=True, pickle_kwargs=None): zipf.close() -def _floatconv(x): - try: - return float(x) # The fastest path. - except ValueError: - if '0x' in x: # Don't accidentally convert "a" ("0xa") to 10. - try: - return float.fromhex(x) - except ValueError: - pass - raise # Raise the original exception, which makes more sense. - - -_CONVERTERS = [ # These converters only ever get strs (not bytes) as input. - (np.bool_, lambda x: bool(int(x))), - (np.uint64, np.uint64), - (np.int64, np.int64), - (np.integer, lambda x: int(float(x))), - (np.longdouble, np.longdouble), - (np.floating, _floatconv), - (complex, lambda x: complex(x.replace('+-', '-'))), - (np.bytes_, methodcaller('encode', 'latin-1')), - (np.unicode_, str), -] - - -def _getconv(dtype): - """ - Find the correct dtype converter. Adapted from matplotlib. - - Even when a lambda is returned, it is defined at the toplevel, to allow - testing for equality and enabling optimization for single-type data. - """ - for base, conv in _CONVERTERS: - if issubclass(dtype.type, base): - return conv - return str - - -# _loadtxt_flatten_dtype_internal and _loadtxt_pack_items are loadtxt helpers -# lifted to the toplevel because recursive inner functions cause either -# GC-dependent reference loops (because they are closures over loadtxt's -# internal variables) or large overheads if using a manual trampoline to hide -# the recursive calls. - - -# not to be confused with the flatten_dtype we import... -def _loadtxt_flatten_dtype_internal(dt): - """Unpack a structured data-type, and produce a packer function.""" - if dt.names is None: - # If the dtype is flattened, return. - # If the dtype has a shape, the dtype occurs - # in the list more than once. 
-            shape = dt.shape
-            if len(shape) == 0:
-                return ([dt.base], None)
-            else:
-                packing = [(shape[-1], list)]
-                if len(shape) > 1:
-                    for dim in dt.shape[-2::-1]:
-                        packing = [(dim*packing[0][0], packing*dim)]
-                return ([dt.base] * int(np.prod(dt.shape)),
-                        functools.partial(_loadtxt_pack_items, packing))
-    else:
-        types = []
-        packing = []
-        for field in dt.names:
-            tp, bytes = dt.fields[field]
-            flat_dt, flat_packer = _loadtxt_flatten_dtype_internal(tp)
-            types.extend(flat_dt)
-            flat_packing = flat_packer.args[0] if flat_packer else None
-            # Avoid extra nesting for subarrays
-            if tp.ndim > 0:
-                packing.extend(flat_packing)
-            else:
-                packing.append((len(flat_dt), flat_packing))
-        return (types, functools.partial(_loadtxt_pack_items, packing))
-
-
-def _loadtxt_pack_items(packing, items):
-    """Pack items into nested lists based on re-packing info."""
-    if packing is None:
-        return items[0]
-    elif packing is tuple:
-        return tuple(items)
-    elif packing is list:
-        return list(items)
-    else:
-        start = 0
-        ret = []
-        for length, subpacking in packing:
-            ret.append(
-                _loadtxt_pack_items(subpacking, items[start:start+length]))
-            start += length
-        return tuple(ret)
-
 def _ensure_ndmin_ndarray_check_param(ndmin):
     """Just checks if the param ndmin is supported on
         _ensure_ndmin_ndarray. Is intented to be used as
@@ -859,6 +766,310 @@ def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None,
     return (like,)
 
 
+def _check_nonneg_int(value, name="argument"):
+    try:
+        operator.index(value)
+    except TypeError:
+        raise TypeError(f"{name} must be an integer") from None
+    if value < 0:
+        raise ValueError(f"{name} must be nonnegative")
+
+
+def _preprocess_comments(iterable, comments, encoding):
+    """
+    Generator that consumes a line-iterated iterable and strips out the
+    multiple (or multi-character) comments from lines.
+    This is a pre-processing step to achieve feature parity with loadtxt
+    (we assume that this is a niche feature).
+    """
+    for line in iterable:
+        if isinstance(line, bytes):
+            # Need to handle conversion here, or the splitting would fail
+            line = line.decode(encoding)
+
+        for c in comments:
+            line = line.split(c, 1)[0]
+
+        yield line
+
+
+# The number of rows we read in one go if confronted with a parametric dtype
+_loadtxt_chunksize = 50000
+
+
+def _read(fname, *, delimiter=',', comment='#', quote='"',
+        imaginary_unit='j', usecols=None, skiprows=0,
+        max_rows=None, converters=None, ndmin=None, unpack=False,
+        dtype=np.float64, encoding="bytes"):
+    r"""
+    Read a NumPy array from a text file.
+
+    Parameters
+    ----------
+    fname : str or file object
+        The filename or the file to be read.
+    delimiter : str, optional
+        Field delimiter of the fields in each line of the file.
+        Default is a comma, ','.
+    comment : str or sequence of str, optional
+        Character that begins a comment. All text from the comment
+        character to the end of the line is ignored.
+        Multiple comments or multiple-character comment strings are supported,
+        but may be slower and `quote` must be empty if used.
+    quote : str, optional
+        Character that is used to quote string fields. Default is '"'
+        (a double quote).
+    imaginary_unit : str, optional
+        Character that represents the imaginary unit `sqrt(-1)`.
+        Default is 'j'.
+    usecols : array_like, optional
+        A one-dimensional array of integer column numbers. These are the
+        columns from the file to be included in the array. If this value
+        is not given, all the columns are used.
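+        For example, ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and
+        6th columns.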
+    skiprows : int, optional
+        Number of lines to skip before interpreting the data in the file.
+    max_rows : int, optional
+        Maximum number of rows of data to read. Default is to read the
+        entire file.
+    converters : dict, optional
+        A dictionary mapping column number to a function that will parse the
+        column string into the desired value. E.g. if column 0 is a date
+        string: ``converters = {0: datestr2num}``. Converters can also be used
+        to provide a default value for missing data, e.g.
+        ``converters = {3: lambda s: float(s.strip() or 0)}``.
+        Default: None
+    ndmin : int, optional
+        Minimum dimension of the array returned.
+        Allowed values are 0, 1 or 2. Default is 0.
+    unpack : bool, optional
+        If True, the returned array is transposed, so that arguments may be
+        unpacked using ``x, y, z = _read(...)``. When used with a structured
+        data-type, arrays are returned for each field. Default is False.
+    dtype : numpy data type
+        A NumPy dtype instance; it can be a structured dtype to map to the
+        columns of the file.
+    encoding : str, optional
+        Encoding used to decode the input file. The special value 'bytes'
+        (the default) enables backwards-compatible behavior for `converters`,
+        ensuring that inputs to the converter functions are encoded
+        bytes objects. The special value 'bytes' has no additional effect if
+        ``converters=None``. If encoding is ``'bytes'`` or ``None``, the
+        default system encoding is used.
+
+    Returns
+    -------
+    ndarray
+        NumPy array.
+
+    Examples
+    --------
+    First we create a file for the example.
+
+    >>> s1 = '1.0,2.0,3.0\n4.0,5.0,6.0\n'
+    >>> with open('example1.csv', 'w') as f:
+    ...     f.write(s1)
+    >>> a1 = _read('example1.csv')
+    >>> a1
+    array([[1., 2., 3.],
+           [4., 5., 6.]])
+
+    The second example has columns with different data types, so a
+    one-dimensional array with a structured data type is returned.
+    The tab character is used as the field delimiter.
+
+    >>> s2 = '1.0\t10\talpha\n2.3\t25\tbeta\n4.5\t16\tgamma\n'
+    >>> with open('example2.tsv', 'w') as f:
+    ...     f.write(s2)
+    >>> a2 = _read('example2.tsv', delimiter='\t', dtype=np.dtype('f8, i8, S5'))
+    >>> a2
+    array([(1. , 10, b'alpha'), (2.3, 25, b'beta'), (4.5, 16, b'gamma')],
+          dtype=[('f0', '<f8'), ('f1', '<i8'), ('f2', 'S5')])
+    """
+    # Handle special 'bytes' keyword for encoding
+    byte_converters = False
+    if encoding == 'bytes':
+        encoding = None
+        byte_converters = True
+
+    if dtype is None:
+        raise TypeError("a dtype must be provided.")
+    dtype = np.dtype(dtype)
+
+    read_dtype_via_object_chunks = None
+    if dtype.kind in 'SUM' and (
+            dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'):
+        # This is a legacy "flexible" dtype. We do not truly support
+        # parametric dtypes currently (no dtype discovery step in the core),
+        # but have to support these for backward compatibility.
+        read_dtype_via_object_chunks = dtype
+        dtype = np.dtype(object)
+
+    if usecols is not None:
+        # Allow usecols to be a single int or a sequence of ints
+        try:
+            usecols = list(usecols)
+        except TypeError:
+            usecols = [usecols]
+
+    _ensure_ndmin_ndarray_check_param(ndmin)
+
+    if not isinstance(comment, str):
+        # assume comment is already a sequence of strings
+        comments = tuple(comment)
+        comment = ''
+        # A single one-character comment can use the fast path directly:
+        if len(comments) == 1 and len(comments[0]) == 1:
+            comment = comments[0]
+            comments = None
+    elif len(comment) > 1:
+        comments = (comment,)
+        comment = ''
+    else:
+        comments = None
+
+    # comment is now either a 1 or 0 character string or a tuple:
+    if comments is not None:
+        assert comment == ''
+        # Note: An earlier version supported two-character comments (and could
+        # have been extended to multiple characters); we assume this is
+        # rare enough to not optimize for.
+        if quote != "":
+            raise ValueError(
+                "when multiple comments or a multi-character comment is given, "
+                "quotes are not supported. In this case the quote character "
+                "must be set to the empty string: `quote=''`.")
+    else:
+        # No preprocessing necessary
+        assert comments is None
+
+    if len(imaginary_unit) != 1:
+        raise ValueError('len(imaginary_unit) must be 1.')
+
+    _check_nonneg_int(skiprows)
+    if max_rows is not None:
+        _check_nonneg_int(max_rows)
+    else:
+        # Passing -1 to the C code means "read the entire file".
+        max_rows = -1
+
+    fh_closing_ctx = contextlib.nullcontext()
+    filelike = False
+    try:
+        if isinstance(fname, os.PathLike):
+            fname = os.fspath(fname)
+        # TODO: loadtxt actually uses `file + ''` to decide this?!
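+        # (A plain str is treated as a filename and opened below via
+        # `np.lib._datasource.open`.)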
+ if isinstance(fname, str): + fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) + if encoding is None: + encoding = getattr(fh, 'encoding', 'latin1') + + fh_closing_ctx = contextlib.closing(fh) + data = fh + filelike = True + else: + if encoding is None: + encoding = getattr(fname, 'encoding', 'latin1') + data = iter(fname) + except TypeError as e: + raise ValueError( + f"fname must be a string, filehandle, list of strings,\n" + f"or generator. Got {type(fname)} instead.") from e + + with fh_closing_ctx: + if comments is not None: + if filelike: + data = iter(data) + filelike = False + data = _preprocess_comments(data, comments, encoding) + + if read_dtype_via_object_chunks is None: + arr = _load_from_filelike( + data, delimiter=delimiter, comment=comment, quote=quote, + imaginary_unit=imaginary_unit, + usecols=usecols, skiprows=skiprows, max_rows=max_rows, + converters=converters, dtype=dtype, + encoding=encoding, filelike=filelike, + byte_converters=byte_converters) + + else: + # This branch reads the file into chunks of object arrays and then + # casts them to the desired actual dtype. This ensures correct + # string-length and datetime-unit discovery (as for `arr.astype()`). + # Due to chunking, certain error reports are less clear, currently. + if filelike: + data = iter(data) # cannot chunk when reading from file + + c_byte_converters = False + if read_dtype_via_object_chunks == "S": + c_byte_converters = True # Use latin1 rather than ascii + + chunks = [] + while max_rows != 0: + if max_rows < 0: + chunk_size = _loadtxt_chunksize + else: + chunk_size = min(_loadtxt_chunksize, max_rows) + + next_arr = _load_from_filelike( + data, delimiter=delimiter, comment=comment, quote=quote, + imaginary_unit=imaginary_unit, + usecols=usecols, skiprows=skiprows, max_rows=max_rows, + converters=converters, dtype=dtype, + encoding=encoding, filelike=filelike, + byte_converters=byte_converters, + c_byte_converters=c_byte_converters) + # Cast here already. We hope that this is better even for + # large files because the storage is more compact. It could + # be adapted (in principle the concatenate could cast). + chunks.append(next_arr.astype(read_dtype_via_object_chunks)) + + skiprows = 0 # Only have to skip for first chunk + if max_rows >= 0: + max_rows -= chunk_size + if len(next_arr) < chunk_size: + # There was less data than requested, so we are done. + break + + # Need at least one chunk, but if empty, the last one may have + # the wrong shape. + if len(chunks) > 1 and len(chunks[-1]) == 0: + del chunks[-1] + if len(chunks) == 1: + arr = chunks[0] + else: + arr = np.concatenate(chunks, axis=0) + + arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) + + if unpack: + # Handle unpack like np.loadtxt. + # XXX Check interaction with ndmin! + dt = arr.dtype + if dt.names is not None: + # For structured arrays, return an array for each field. + return [arr[field] for field in dt.names] + else: + return arr.T + else: + return arr + + @set_array_function_like_doc @set_module('numpy') def loadtxt(fname, dtype=float, comments='#', delimiter=None, @@ -1000,228 +1211,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, max_rows=max_rows, like=like ) - # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # Nested functions used by loadtxt. 
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-    def split_line(line: str):
-        """Chop off comments, strip, and split at delimiter."""
-        for comment in comments:  # Much faster than using a single regex.
-            line = line.split(comment, 1)[0]
-        line = line.strip('\r\n')
-        return line.split(delimiter) if line else []
+    if delimiter is None:
+        delimiter = ''
+    elif isinstance(delimiter, bytes):
+        delimiter = delimiter.decode("latin1")
 
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-    # Main body of loadtxt.
-    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-    _ensure_ndmin_ndarray_check_param(ndmin)
+    if dtype is None:
+        dtype = np.float64
 
+    comment = comments
     # Type conversions for Py3 convenience
-    if comments is not None:
-        if isinstance(comments, (str, bytes)):
-            comments = [comments]
-        comments = [_decode_line(x) for x in comments]
-    else:
-        comments = []
-
-    if delimiter is not None:
-        delimiter = _decode_line(delimiter)
-
-    user_converters = converters
-
-    byte_converters = False
-    if encoding == 'bytes':
-        encoding = None
-        byte_converters = True
-
-    if usecols is not None:
-        # Copy usecols, allowing it to be a single int or a sequence of ints.
-        try:
-            usecols = list(usecols)
-        except TypeError:
-            usecols = [usecols]
-        for i, col_idx in enumerate(usecols):
-            try:
-                usecols[i] = opindex(col_idx)  # Cast to builtin int now.
-            except TypeError as e:
-                e.args = (
-                    "usecols must be an int or a sequence of ints but "
-                    "it contains at least one element of type %s" %
-                    type(col_idx),
-                    )
-                raise
-        if len(usecols) > 1:
-            usecols_getter = itemgetter(*usecols)
-        else:
-            # Get an iterable back, even if using a single column.
-            usecols_getter = lambda obj, c=usecols[0]: [obj[c]]
+    if comment is None:
+        comment = ''
     else:
-        usecols_getter = None
-
-    # Make sure we're dealing with a proper dtype
-    dtype = np.dtype(dtype)
-    defconv = _getconv(dtype)
-
-    dtype_types, packer = _loadtxt_flatten_dtype_internal(dtype)
-
-    fh_closing_ctx = contextlib.nullcontext()
-    try:
-        if isinstance(fname, os_PathLike):
-            fname = os_fspath(fname)
-        if _is_string_like(fname):
-            fh = np.lib._datasource.open(fname, 'rt', encoding=encoding)
-            fencoding = getattr(fh, 'encoding', 'latin1')
-            line_iter = iter(fh)
-            fh_closing_ctx = contextlib.closing(fh)
-        else:
-            line_iter = iter(fname)
-            fencoding = getattr(fname, 'encoding', 'latin1')
-        try:
-            first_line = next(line_iter)
-        except StopIteration:
-            pass  # Nothing matters if line_iter is empty.
-        else:
-            # Put first_line back.
-            line_iter = itertools.chain([first_line], line_iter)
-            if isinstance(first_line, bytes):
-                # Using latin1 matches _decode_line's behavior.
-                decoder = methodcaller(
-                    "decode",
-                    encoding if encoding is not None else "latin1")
-                line_iter = map(decoder, line_iter)
-    except TypeError as e:
-        raise ValueError(
-            f"fname must be a string, filehandle, list of strings,\n"
-            f"or generator. Got {type(fname)} instead."
-        ) from e
-
-    with fh_closing_ctx:
-
-        # input may be a python2 io stream
-        if encoding is not None:
-            fencoding = encoding
-        # we must assume local encoding
-        # TODO emit portability warning?
-        elif fencoding is None:
-            import locale
-            fencoding = locale.getpreferredencoding()
-
-        # Skip the first `skiprows` lines
-        for i in range(skiprows):
-            next(line_iter)
-
-        # Read until we find a line with some values, and use it to determine
-        # the need for decoding and estimate the number of columns.
- for first_line in line_iter: - ncols = len(usecols or split_line(first_line)) - if ncols: - # Put first_line back. - line_iter = itertools.chain([first_line], line_iter) - break - else: # End of lines reached - ncols = len(usecols or []) - warnings.warn('loadtxt: Empty input file: "%s"' % fname, - stacklevel=2) - - line_iter = itertools.islice(line_iter, max_rows) - lineno_words_iter = filter( - itemgetter(1), # item[1] is words; filter skips empty lines. - enumerate(map(split_line, line_iter), 1 + skiprows)) - - # Now that we know ncols, create the default converters list, and - # set packing, if necessary. - if len(dtype_types) > 1: - # We're dealing with a structured array, each field of - # the dtype matches a column - converters = [_getconv(dt) for dt in dtype_types] - else: - # All fields have the same dtype; use specialized packers which are - # much faster than those using _loadtxt_pack_items. - converters = [defconv for i in range(ncols)] - if ncols == 1: - packer = itemgetter(0) - else: - def packer(row): return row - - # By preference, use the converters specified by the user - for i, conv in (user_converters or {}).items(): - if usecols: - try: - i = usecols.index(i) - except ValueError: - # Unused converter specified - continue - if byte_converters: - # converters may use decode to workaround numpy's old - # behaviour, so encode the string again (converters are only - # called with strings) before passing to the user converter. - def tobytes_first(conv, x): - return conv(x.encode("latin1")) - converters[i] = functools.partial(tobytes_first, conv) - else: - converters[i] = conv - - fencode = methodcaller("encode", fencoding) - converters = [conv if conv is not bytes else fencode - for conv in converters] - if len(set(converters)) == 1: - # Optimize single-type data. Note that this is only reached if - # `_getconv` returns equal callables (i.e. not local lambdas) on - # equal dtypes. - def convert_row(vals, _conv=converters[0]): - return [*map(_conv, vals)] - else: - def convert_row(vals): - return [conv(val) for conv, val in zip(converters, vals)] - - # read data in chunks and fill it into an array via resize - # over-allocating and shrinking the array later may be faster but is - # probably not relevant compared to the cost of actually reading and - # converting the data - X = None - while True: - chunk = [] - for lineno, words in itertools.islice( - lineno_words_iter, _loadtxt_chunksize): - if usecols_getter is not None: - words = usecols_getter(words) - elif len(words) != ncols: - raise ValueError( - f"Wrong number of columns at line {lineno}") - # Convert each value according to its column, then pack it - # according to the dtype's nesting, and store it. - chunk.append(packer(convert_row(words))) - if not chunk: # The islice is empty, i.e. we're done. - break - - if X is None: - X = np.array(chunk, dtype) - else: - nshape = list(X.shape) - pos = nshape[0] - nshape[0] += len(chunk) - X.resize(nshape, refcheck=False) - X[pos:, ...] = chunk - - if X is None: - X = np.array([], dtype) + if isinstance(comment, (str, bytes)): + comment = [comment] + comment = [x.decode('latin1') if isinstance(x, bytes) else x for x in comment] - # Multicolumn data are returned with shape (1, N, M), i.e. 
- # (1, 1, M) for a single row - remove the singleton dimension there - if X.ndim == 3 and X.shape[:2] == (1, 1): - X.shape = (1, -1) + arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, + converters=converters, skiprows=skiprows, usecols=usecols, + unpack=unpack, ndmin=ndmin, encoding=encoding, + max_rows=max_rows, quote='') - X = _ensure_ndmin_ndarray(X, ndmin=ndmin) - - if unpack: - if len(dtype_types) > 1: - # For structured arrays, return an array for each field. - return [X[field] for field in dtype.names] - else: - return X.T - else: - return X + return arr _loadtxt_with_like = array_function_dispatch( From db47a4234bfa69283fc47db9c1c60d22ba21a273 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 8 Dec 2021 12:18:08 -0600 Subject: [PATCH 02/70] Fixup size_t's to (mostly?) use npy_intp to silence compiler warnings --- .../core/src/multiarray/textreading/growth.c | 31 ++++++++++--------- .../core/src/multiarray/textreading/growth.h | 4 +-- numpy/core/src/multiarray/textreading/rows.c | 6 ++-- .../src/multiarray/textreading/tokenize.c.src | 8 ++--- .../src/multiarray/textreading/tokenize.h | 10 +++--- 5 files changed, 31 insertions(+), 28 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c index a38c6d5aa780..2afd3f82ce98 100644 --- a/numpy/core/src/multiarray/textreading/growth.c +++ b/numpy/core/src/multiarray/textreading/growth.c @@ -1,38 +1,39 @@ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE -#include "templ_common.h" - -#include "textreading/growth.h" +#include "numpy/ndarraytypes.h" +#include "templ_common.h" /* * Helper function taking the size input and growing it (based on min_grow). * It further multiplies it with `itemsize` and ensures that all results fit * into an `npy_intp`. * Returns -1 if any overflow occurred or the result would not fit. - * The user has to ensure the input is size_t (i.e. unsigned). + * The user has to ensure the input is ssize_t but not negative. 
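+ * E.g. a size of 1000 with min_grow == 32 grows by roughly 25% (to 1281),
+ * while small sizes (where the 25% growth is below min_grow) grow by
+ * exactly min_grow.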
*/ -npy_intp -grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize) { +NPY_NO_EXPORT npy_intp +grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize) { /* min_grow must be a power of two: */ assert((min_grow & (min_grow - 1)) == 0); - size_t growth = *size >> 2; + npy_uintp new_size = (npy_uintp)*size; + npy_intp growth = *size >> 2; if (growth <= min_grow) { - *size += min_grow; + /* can never lead to overflow if we are using min_growth */ + new_size += min_grow; } else { - *size += growth + min_grow - 1; - *size &= ~min_grow; + new_size += growth + min_grow - 1; + new_size &= ~min_grow; - if (*size > NPY_MAX_INTP) { + if (new_size > NPY_MAX_INTP) { return -1; } } - - npy_intp res; - if (npy_mul_with_overflow_intp(&res, (npy_intp)*size, itemsize)) { + *size = (npy_intp)new_size; + npy_intp alloc_size; + if (npy_mul_with_overflow_intp(&alloc_size, (npy_intp)new_size, itemsize)) { return -1; } - return res; + return alloc_size; } diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h index debe9a7b3175..d1b005e381db 100644 --- a/numpy/core/src/multiarray/textreading/growth.h +++ b/numpy/core/src/multiarray/textreading/growth.h @@ -1,7 +1,7 @@ #ifndef _NPY_GROWTH_H #define _NPY_GROWTH_H -npy_intp -grow_size_and_multiply(size_t *size, size_t min_grow, npy_intp itemsize); +NPY_NO_EXPORT npy_intp +grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize); #endif /*_NPY_GROWTH_H */ diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 9301abd5cf30..4992b967a744 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -157,7 +157,7 @@ read_rows(stream *s, { char *data_ptr = NULL; int current_num_fields; - size_t row_size = out_descr->elsize; + npy_intp row_size = out_descr->elsize; PyObject **conv_funcs = NULL; bool needs_init = PyDataType_FLAGCHK(out_descr, NPY_NEEDS_INIT); @@ -169,7 +169,7 @@ read_rows(stream *s, /* Make sure we own `data_array` for the purpose of error handling */ Py_XINCREF(data_array); size_t rows_per_block = 1; /* will be increased depending on row size */ - Py_ssize_t data_allocated_rows = 0; + npy_intp data_allocated_rows = 0; int ts_result = 0; tokenizer_state ts; @@ -290,7 +290,7 @@ read_rows(stream *s, * Grow by ~25% and rounded up to the next rows_per_block * NOTE: This is based on very crude timings and could be refined! 
             */
-            size_t new_rows = data_allocated_rows;
+            npy_intp new_rows = data_allocated_rows;
             npy_intp alloc_size = grow_size_and_multiply(
                     &new_rows, rows_per_block, row_size);
             if (alloc_size < 0) {
diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src
index dcddb1b36a46..dd2bf52ce2bf 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.c.src
+++ b/numpy/core/src/multiarray/textreading/tokenize.c.src
@@ -52,8 +52,8 @@ static NPY_INLINE int
 copy_to_field_buffer_@type@(tokenizer_state *ts,
         const @type@ *chunk_start, const @type@ *chunk_end)
 {
-    size_t chunk_length = chunk_end - chunk_start;
-    size_t size = chunk_length + ts->field_buffer_pos + 2;
+    npy_intp chunk_length = chunk_end - chunk_start;
+    npy_intp size = chunk_length + ts->field_buffer_pos + 2;
 
     if (NPY_UNLIKELY(ts->field_buffer_length < size)) {
         npy_intp alloc_size = grow_size_and_multiply(&size, 32, sizeof(Py_UCS4));
@@ -88,8 +88,8 @@ add_field(tokenizer_state *ts)
     /* The previous field is done, advance to keep a NUL byte at the end */
     ts->field_buffer_pos += 1;
 
-    if (NPY_UNLIKELY((size_t)ts->num_fields + 1 > ts->fields_size)) {
-        size_t size = (size_t)ts->num_fields;
+    if (NPY_UNLIKELY(ts->num_fields + 1 > ts->fields_size)) {
+        npy_intp size = ts->num_fields;
 
         npy_intp alloc_size = grow_size_and_multiply(
                 &size, 4, sizeof(field_info));
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index aeac63107f5f..ec25a04282f0 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -3,6 +3,8 @@
 #define _TOKENIZE_H_
 
 #include <stdbool.h>
+#include "numpy/ndarraytypes.h"
+
 #include "textreading/stream.h"
 #include "textreading/parser_config.h"
 
@@ -37,7 +39,7 @@ typedef struct {
     tokenizer_parsing_state unquoted_state;
     int unicode_kind;
     int buf_state;
-    size_t num_fields;
+    npy_intp num_fields;
     /* the buffer we are currently working on */
     char *pos;
     char *end;
@@ -48,8 +50,8 @@ typedef struct {
      * second byte is there to allow easy appending of an additional empty
      * word at the end (this word is also NUL terminated).
      */
-    size_t field_buffer_length;
-    size_t field_buffer_pos;
+    npy_intp field_buffer_length;
+    npy_intp field_buffer_pos;
     Py_UCS4 *field_buffer;
 
     /*
@@ -60,7 +62,7 @@ typedef struct {
      * The tokenizer assumes at least one field is allocated.
      */
     field_info *fields;
-    size_t fields_size;
+    npy_intp fields_size;
 } tokenizer_state;
 

From ff91f2b526abfa0dda31b32daf9468a65929acde Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 8 Dec 2021 12:35:17 -0600
Subject: [PATCH 03/70] MAINT: Remove float-to-int integer parsing fallback

---
 .../multiarray/textreading/parser_config.h    |   6 -
 .../src/multiarray/textreading/readtext.c     |   6 -
 .../src/multiarray/textreading/str_to_int.c   | 104 +++++++-----------
 .../src/multiarray/textreading/str_to_int.h   |   1 -
 4 files changed, 41 insertions(+), 76 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h
index c60565de1ce7..a07c81234974 100644
--- a/numpy/core/src/multiarray/textreading/parser_config.h
+++ b/numpy/core/src/multiarray/textreading/parser_config.h
@@ -55,12 +55,6 @@ typedef struct {
      */
     Py_UCS4 imaginary_unit;
 
-    /*
-     * If true, when an integer dtype is given, the field is allowed
-     * to contain a floating point value. It will be cast to the
-     * integer type.
- */ - bool allow_float_for_int; /* * Data should be encoded as `latin1` when using python converter * (implementing `loadtxt` default Python 2 compatibility mode). diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 750e77b2d80c..e764d9537f6c 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -44,11 +44,6 @@ _readtext_from_stream(stream *s, parser_config *pc, int ncols; field_type *ft = NULL; - /* TODO: Find better solution maybe? */ - if (double_descr == NULL) { - double_descr = PyArray_DescrFromType(NPY_DOUBLE); - } - /* * If dtypes[0] is dtype the input was not structured and the result * is considered "homogeneous" and we have to discover the number of @@ -124,7 +119,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), .comment = '#', .quote = '"', .imaginary_unit = 'j', - .allow_float_for_int = true, .allow_embedded_newline = true, .delimiter_is_whitespace = false, .ignore_leading_whitespace = false, diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c index 647e79a4f2b7..b0f0f1d5805b 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.c +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -9,71 +9,49 @@ NPY_NO_EXPORT PyArray_Descr *double_descr = NULL; -// TODO: The float fallbacks are seriously awkward, why? Or at least why this way? -#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ - int \ - to_##intw(PyArray_Descr *descr, \ - const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ - parser_config *pconfig) \ - { \ - int64_t parsed; \ - intw##_t x; \ - \ - if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \ - if (pconfig->allow_float_for_int) { \ - double fx; \ - if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \ - return -1; \ - } \ - else { \ - x = (intw##_t) fx; \ - } \ - } \ - else { \ - return -1; \ - } \ - } \ - else { \ - x = (intw##_t)parsed; \ - } \ - memcpy(dataptr, &x, sizeof(x)); \ - if (!PyArray_ISNBO(descr->byteorder)) { \ - descr->f->copyswap(dataptr, dataptr, 1, NULL); \ - } \ - return 0; \ + +#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ + int \ + to_##intw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ + parser_config *pconfig) \ + { \ + int64_t parsed; \ + intw##_t x; \ + \ + if (str_to_int64(str, end, INT_MIN, INT_MAX, &parsed) < 0) { \ + return -1; \ + } \ + else { \ + x = (intw##_t)parsed; \ + } \ + memcpy(dataptr, &x, sizeof(x)); \ + if (!PyArray_ISNBO(descr->byteorder)) { \ + descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + } \ + return 0; \ } -#define DECLARE_TO_UINT(uintw, UINT_MAX) \ - int \ - to_##uintw(PyArray_Descr *descr, \ - const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ - parser_config *pconfig) \ - { \ - uint64_t parsed; \ - uintw##_t x; \ - \ - if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \ - if (pconfig->allow_float_for_int) { \ - double fx; \ - if (to_double(double_descr, str, end, (char *)&fx, pconfig) < 0) { \ - return -1; \ - } \ - else { \ - x = (uintw##_t) fx; \ - } \ - } \ - else { \ - return -1; \ - } \ - } \ - else { \ - x = (uintw##_t)parsed; \ - } \ - memcpy(dataptr, &x, sizeof(x)); \ - if (!PyArray_ISNBO(descr->byteorder)) { \ - descr->f->copyswap(dataptr, dataptr, 1, NULL); \ - } \ - return 0; \ +#define DECLARE_TO_UINT(uintw, UINT_MAX) \ + int \ + to_##uintw(PyArray_Descr *descr, \ + const Py_UCS4 *str, const Py_UCS4 *end, char 
*dataptr, \
+            parser_config *pconfig) \
+    { \
+        uint64_t parsed; \
+        uintw##_t x; \
+ \
+        if (str_to_uint64(str, end, UINT_MAX, &parsed) < 0) { \
+            return -1; \
+        } \
+        else { \
+            x = (uintw##_t)parsed; \
+        } \
+        memcpy(dataptr, &x, sizeof(x)); \
+        if (!PyArray_ISNBO(descr->byteorder)) { \
+            descr->f->copyswap(dataptr, dataptr, 1, NULL); \
+        } \
+        return 0; \
     }
 
 DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX)
diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h
index 9cead56f08f6..ee1718fb35a9 100644
--- a/numpy/core/src/multiarray/textreading/str_to_int.h
+++ b/numpy/core/src/multiarray/textreading/str_to_int.h
@@ -7,7 +7,6 @@
 
 #include "textreading/parser_config.h"
 
-extern NPY_NO_EXPORT PyArray_Descr *double_descr;
 
 /*
  * The following two string conversion functions are largely equivalent

From 684cefc55eb781e91df9cbcb0e883d3ad09b0347 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 8 Dec 2021 12:54:40 -0600
Subject: [PATCH 04/70] ENH: Allow a single converter to be used for all
 columns

This is always used if it is a callable.
---
 numpy/core/src/multiarray/textreading/rows.c | 11 ++++++++-
 numpy/lib/npyio.py                           | 26 +++++++++++---------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 4992b967a744..f222b93d7d86 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -41,13 +41,22 @@ create_conv_funcs(
         PyErr_NoMemory();
         return NULL;
     }
+
     if (converters == Py_None) {
         return conv_funcs;
     }
+    else if (PyCallable_Check(converters)) {
+        /* a single converter used for all columns individually */
+        for (int i = 0; i < num_fields; i++) {
+            Py_INCREF(converters);
+            conv_funcs[i] = converters;
+        }
+        return conv_funcs;
+    }
     else if (!PyDict_Check(converters)) {
         PyErr_SetString(PyExc_TypeError,
                 "converters must be a dictionary mapping columns to converter "
-                "functions.");
+                "functions or a single callable.");
         return NULL;
     }
 
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index c2472f6015fa..f08f0c8f5f7b 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -831,12 +831,13 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
     max_rows : int, optional
         Maximum number of rows of data to read. Default is to read the
         entire file.
-    converters : dict, optional
-        A dictionary mapping column number to a function that will parse the
-        column string into the desired value. E.g. if column 0 is a date
-        string: ``converters = {0: datestr2num}``. Converters can also be used
-        to provide a default value for missing data, e.g.
-        ``converters = {3: lambda s: float(s.strip() or 0)}``.
+    converters : dict or callable, optional
+        A function to parse all column strings into the desired value, or
+        a dictionary mapping column number to a parser function.
+        E.g. if column 0 is a date string: ``converters = {0: datestr2num}``.
+        Converters can also be used to provide a default value for missing
+        data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will
+        convert empty fields to 0.
         Default: None
     ndmin : int, optional
         Minimum dimension of the array returned.
@@ -1100,12 +1101,13 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None,
     delimiter : str, optional
         The string used to separate values. For backwards compatibility, byte
         strings will be decoded as 'latin1'. The default is whitespace.
-    converters : dict, optional
-        A dictionary mapping column number to a function that will parse the
-        column string into the desired value. E.g., if column 0 is a date
-        string: ``converters = {0: datestr2num}``. Converters can also be
-        used to provide a default value for missing data (but see also
-        `genfromtxt`): ``converters = {3: lambda s: float(s.strip() or 0)}``.
+    converters : dict or callable, optional
+        A function to parse all column strings into the desired value, or
+        a dictionary mapping column number to a parser function.
+        E.g. if column 0 is a date string: ``converters = {0: datestr2num}``.
+        Converters can also be used to provide a default value for missing
+        data, e.g. ``converters = lambda s: float(s.strip() or 0)`` will
+        convert empty fields to 0.
         Default: None.
     skiprows : int, optional
         Skip the first `skiprows` lines, including comments; default: 0.

From b8c82404855d317a9ac77b4743d3db39f009c6aa Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Mon, 13 Dec 2021 14:35:38 -0600
Subject: [PATCH 05/70] TST: Fixup current loadtxt tests for changes

---
 numpy/lib/tests/test_io.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index b9b10bc0606e..2b31438f8d85 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -695,7 +695,7 @@ def test_record(self):
         assert_array_equal(x, a)
 
         d = TextIO()
-        d.write('M 64.0 75.0\nF 25.0 60.0')
+        d.write('M 64 75.0\nF 25 60.0')
         d.seek(0)
         mydescriptor = {'names': ('gender', 'age', 'weight'),
                         'formats': ('S1', 'i4', 'f4')}
@@ -981,7 +981,8 @@ def test_from_float_hex(self):
             c.write(inp)
         for dt in [float, np.float32]:
             c.seek(0)
-            res = np.loadtxt(c, dtype=dt)
+            res = np.loadtxt(
+                c, dtype=dt, converters=float.fromhex, encoding="latin1")
             assert_equal(res, tgt, err_msg="%s" % dt)
 
     def test_default_float_converter_no_default_hex_conversion(self):
@@ -990,9 +991,8 @@ def test_default_float_converter_no_default_hex_conversion(self):
         is not called by default. Regression test related to gh-19598.
        """
        c = TextIO("a b c")
-        with pytest.raises(
-            ValueError, match="could not convert string to float"
-        ):
+        with pytest.raises(ValueError,
+                match=".*convert string 'a' to float64 at row 0, column 1"):
            np.loadtxt(c)
 
    def test_default_float_converter_exception(self):
@@ -1001,9 +1001,8 @@ def test_default_float_converter_exception(self):
        conversion is correct. Regression test related to gh-19598.
        """
        c = TextIO("qrs tuv")  # Invalid values for default float converter
-        with pytest.raises(
-            ValueError, match="could not convert string to float"
-        ):
+        with pytest.raises(ValueError,
+                match="could not convert string 'qrs' to float64"):
            np.loadtxt(c)
 
    def test_from_complex(self):

From 7a4251853534001e4c9ff7b942ff15f42f86ca4f Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Mon, 13 Dec 2021 15:09:02 -0600
Subject: [PATCH 06/70] STY: Fix some style issues (mainly long lines)

Note that one of the long lines is a link that cannot be split
reasonably.
--- numpy/lib/npyio.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index f08f0c8f5f7b..572d1fa1a5fc 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -760,9 +760,10 @@ def _ensure_ndmin_ndarray(a, *, ndmin: int): _loadtxt_chunksize = 50000 -def _loadtxt_dispatcher(fname, dtype=None, comments=None, delimiter=None, - converters=None, skiprows=None, usecols=None, unpack=None, - ndmin=None, encoding=None, max_rows=None, *, like=None): +def _loadtxt_dispatcher( + fname, dtype=None, comments=None, delimiter=None, + converters=None, skiprows=None, usecols=None, unpack=None, + ndmin=None, encoding=None, max_rows=None, *, like=None): return (like,) @@ -798,9 +799,9 @@ def _preprocess_comments(iterable, comments, encoding): def _read(fname, *, delimiter=',', comment='#', quote='"', - imaginary_unit='j', usecols=None, skiprows=0, - max_rows=None, converters=None, ndmin=None, unpack=False, - dtype=np.float64, encoding="bytes"): + imaginary_unit='j', usecols=None, skiprows=0, + max_rows=None, converters=None, ndmin=None, unpack=False, + dtype=np.float64, encoding="bytes"): r""" Read a NumPy array from a text file. @@ -898,7 +899,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', read_dtype_via_object_chunks = None if dtype.kind in 'SUM' and ( - dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'): + dtype == "S0" or dtype == "U0" or dtype == "M8" or dtype == 'm8'): # This is a legacy "flexible" dtype. We do not truly support # parametric dtypes currently (no dtype discovery step in the core), # but have to support these for backward compatibility. @@ -952,9 +953,9 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', # rare enough to not optimize for. if quote != "": raise ValueError( - "when multiple comments or a multi-character comment is given, " - "quotes are not supported. In this case the quote character " - "must be set to the empty string: `quote=''`.") + "when multiple comments or a multi-character comment is " + "given, quotes are not supported. In this case the quote " + "character must be set to the empty string: `quote=''`.") else: # No preprocessing necessary assert comments is None @@ -1011,7 +1012,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', else: # This branch reads the file into chunks of object arrays and then # casts them to the desired actual dtype. This ensures correct - # string-length and datetime-unit discovery (as for `arr.astype()`). + # string-length and datetime-unit discovery (like `arr.astype()`). # Due to chunking, certain error reports are less clear, currently. 
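+            # Each chunk reads at most `_loadtxt_chunksize` (50000) rows.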
if filelike: data = iter(data) # cannot chunk when reading from file @@ -1228,7 +1229,8 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, else: if isinstance(comment, (str, bytes)): comment = [comment] - comment = [x.decode('latin1') if isinstance(x, bytes) else x for x in comment] + comment = [ + x.decode('latin1') if isinstance(x, bytes) else x for x in comment] arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, converters=converters, skiprows=skiprows, usecols=usecols, From 07389a7458ec5e31eab53cf7cddc84bc70e7eeb9 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 13 Dec 2021 15:38:34 -0600 Subject: [PATCH 07/70] MAINT: Make text reader custom mem handler compatible --- numpy/core/src/multiarray/textreading/rows.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index f222b93d7d86..6af5936c0be1 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -6,12 +6,9 @@ #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "numpy/npy_3kcompat.h" +#include "alloc.h" -#include #include -#include -#include -#include #include #include "textreading/stream.h" @@ -310,8 +307,9 @@ read_rows(stream *s, goto error; } - char *new_data = PyDataMem_RENEW( - PyArray_BYTES(data_array), alloc_size ? alloc_size : 1); + char *new_data = PyDataMem_UserRENEW( + PyArray_BYTES(data_array), alloc_size ? alloc_size : 1, + PyArray_HANDLER(data_array)); if (new_data == NULL) { PyErr_NoMemory(); goto error; @@ -426,8 +424,9 @@ read_rows(stream *s, */ if (data_array_allocated && data_allocated_rows != row_count) { size_t size = row_count * row_size; - char *new_data = PyDataMem_RENEW( - PyArray_BYTES(data_array), size ? size : 1); + char *new_data = PyDataMem_UserRENEW( + PyArray_BYTES(data_array), size ? size : 1, + PyArray_HANDLER(data_array)); if (new_data == NULL) { Py_DECREF(data_array); PyErr_NoMemory(); From 0a636c4faf2826a13f25566668ee8649081b80d1 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 5 Jan 2022 18:24:11 -0600 Subject: [PATCH 08/70] MAINT: Address Tylers review comments (Mainly revising the doc strings) --- .../src/multiarray/textreading/conversions.c | 9 +----- .../src/multiarray/textreading/field_types.h | 30 +++++++++++++++---- .../src/multiarray/textreading/tokenize.c.src | 2 +- numpy/lib/npyio.py | 3 -- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index be697c380dd4..26f68d4cfda7 100644 --- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -96,14 +96,7 @@ double_from_ucs4( return 0; } -/* - * `item` must be the nul-terminated string that is to be - * converted to a double. - * - * To be successful, to_double() must use *all* the characters - * in `item`. E.g. "1.q25" will fail. Leading and trailing - * spaces are allowed. 
- */ + int to_float(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h index 5c4cfb2c6bf8..e76ffd6d3fde 100644 --- a/numpy/core/src/multiarray/textreading/field_types.h +++ b/numpy/core/src/multiarray/textreading/field_types.h @@ -10,14 +10,15 @@ #include "textreading/parser_config.h" -/* - * The original code had some error details, but I assume that we don't need - * it. Printing the string from which we tried to modify it should be fine. - * This should potentially be public NumPy API, although it is tricky, NumPy +/** + * Function defining the conversion for each value. * - * This function must support unaligned memory access. + * This function must support unaligned memory access. As of now, there is + * no special error handling (in whatever form): We assume that it is always + * reasonable to raise a `ValueError` noting the string that failed to be + * converted. * - * NOTE: An earlier version of the code had unused default versions (pandas + * NOTE: An earlier version of the code had unused default values (pandas * does this) when columns are missing. We could define this either * by passing `NULL` in, or by adding a default explicitly somewhere. * (I think users should probably have to define the default, at which @@ -26,6 +27,23 @@ * NOTE: We are currently passing the parser config, this could be made public * or could be set up to be dtype specific/private. Always passing * pconfig fully seems easier right now even if it may change. + * (A future use-case may for example be user-specified strings that are + * considered boolean True or False). + * + * TODO: Aside from nailing down the above notes, it may be nice to expose + * these functions publicly. This could allow user DTypes to provide + * a converter or custom converters written in C rather than Python. + * + * @param descr The NumPy descriptor of the field (may be byte-swapped, etc.) + * @param str Pointer to the beginning of the UCS4 string to be parsed. + * @param end Pointer to the end of the UCS4 string. This value is currently + * guaranteed to be `\0`, ensuring that parsers can rely on + * nul-termination. + * @param dataptr The pointer at which to store the parsed value. + * @param pconfig Additional configuration for the parser. + * @returns 0 on success and -1 on failure. If the return value is -1 an + * error may or may not be set. If an error is set, it is chained + * behind the generic ValueError. */ typedef int (set_from_ucs4_function)( PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index dd2bf52ce2bf..ed68749d1a0c 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -294,7 +294,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) * maximum number of fields when known. * * Unlike other tokenizers, this one tries to work in chunks and copies - * data to words only when it it has to. The hope is that this makes multiple + * data to words only when it has to. The hope is that this makes multiple * light-weight loops rather than a single heavy one, to allow e.g. quickly * scanning for the end of a field.
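 * (For example, in an unquoted field the inner loop only needs to scan
 * for the delimiter, comment, or newline characters, and can then copy
 * the whole run of ordinary characters into the field buffer at once.)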
*/ diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 572d1fa1a5fc..0db5208c3f27 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -956,9 +956,6 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', "when multiple comments or a multi-character comment is " "given, quotes are not supported. In this case the quote " "character must be set to the empty string: `quote=''`.") - else: - # No preprocessing necessary - assert comments is None if len(imaginary_unit) != 1: raise ValueError('len(imaginary_unit) must be 1.') From 6bf1b2110157a58b5e909a169e7f495ce9e91b09 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 6 Jan 2022 18:37:07 -0600 Subject: [PATCH 09/70] BUG: Fix skiprows handling and simplify lineskipping logic Somewhere along the way, the skiprows handling got broken; this fixes it. It also simplifies the "forward to end of line" logic slightly (and hopefully correctly). This logic is used when a comment is reached or for skipping lines before any actual data is read. --- numpy/core/src/multiarray/textreading/stream.h | 5 ++--- .../src/multiarray/textreading/tokenize.c.src | 16 +++++++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h index 0c4567329323..b2fb1e1bf698 100644 --- a/numpy/core/src/multiarray/textreading/stream.h +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -9,9 +9,8 @@ * we definitely expect to get line-by-line buffers. */ #define BUFFER_MAY_CONTAIN_NEWLINE 0 -#define BUFFER_IS_PARTIAL_LINE 1 -#define BUFFER_IS_LINEND 2 -#define BUFFER_IS_FILEEND 3 +#define BUFFER_IS_LINEND 1 +#define BUFFER_IS_FILEEND 2 typedef struct _stream { void *stream_data; diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index ed68749d1a0c..10475b9211d5 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -249,6 +249,8 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) { pos = stop; /* advance to next buffer */ ts->state = TOKENIZE_LINE_END; + /* Ensure we don't think we have an empty line left to parse: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; break; } for (; pos < stop; pos++) { @@ -322,16 +324,20 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) if (NPY_UNLIKELY(ts->pos >= ts->end)) { if (ts->buf_state == BUFFER_IS_LINEND && - ts->state != TOKENIZE_QUOTED && - ts->state != TOKENIZE_CHECK_QUOTED) { + ts->state != TOKENIZE_QUOTED) { /* * Finished line, do not read anymore (also do not eat \n). * If we are in a quoted field and the "line" does not end with * a newline, the quoted field will be missing it right now. - * TODO: We should probably just insert a "\n" character here, - * which is also closer to what the python code did - * (either by setting pos/end or manually). + * (i.e. `np.loadtxt(['"a', 'b"'], dtype="S2")` reads "ab") + * TODO: We should possibly insert a '\n' character when inside + * a quoted field and the '\n' character is not included + * in the string. `FileLike.readline()` does ensure it + * is included.
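 *       (One possible sketch of that TODO: when `ts->buf_state` is
 *       `BUFFER_IS_LINEND` while `ts->state` is still `TOKENIZE_QUOTED`,
 *       copy a single '\n' into the field buffer before fetching new
 *       data.)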
+ * + * Ensure we don't think we have an empty line left to parse: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; goto finish; } /* fetch new data */ From 37523dc7130cfac5400e7a0b511ba049f4c3713f Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 7 Jan 2022 18:00:29 -0600 Subject: [PATCH 10/70] ENH: Raise an error for (most) stray newline characters This makes it strict that newline characters _within_ a single line (which is only possible if the user passes in a manual iterator of strings) are considered weird and rejected. An example is: `np.loadtxt(['1\n1', "2 2"], dtype=np.int64)` --- numpy/core/src/multiarray/textreading/tokenize.c.src | 7 +++++++ numpy/lib/tests/test_io.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index 10475b9211d5..d0671050bcef 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -383,6 +383,13 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) } finish: + if (NPY_UNLIKELY(ts->pos != ts->end && ts->buf_state == BUFFER_IS_LINEND)) { + PyErr_SetString(PyExc_ValueError, + "Found an unquoted embedded newline within a single line of " + "input. This is currently not supported."); + return -1; + } + /* Finish the last field */ if (add_field(ts) < 0) { return -1; diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 2b31438f8d85..a3f2ec411d6c 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -1131,8 +1131,8 @@ def test_none_as_string(self): @pytest.mark.skipif(locale.getpreferredencoding() == 'ANSI_X3.4-1968', reason="Wrong preferred encoding") def test_binary_load(self): - butf8 = b"5,6,7,\xc3\x95scarscar\n\r15,2,3,hello\n\r"\ - b"20,2,3,\xc3\x95scar\n\r" + butf8 = b"5,6,7,\xc3\x95scarscar\r\n15,2,3,hello\r\n"\ - wait + butf8 = b"5,6,7,\xc3\x95scarscar\r\n15,2,3,hello\r\n"\ + b"20,2,3,\xc3\x95scar\r\n" sutf8 = butf8.decode("UTF-8").replace("\r", "").splitlines() with temppath() as path: with open(path, "wb") as f: From 3f2b8d38805d082459d5fc8cfd747291c5ed32d2 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 7 Jan 2022 19:42:06 -0600 Subject: [PATCH 11/70] ENH: Reject empty string as control character `None` is forced instead in all cases (mainly applies to comments). This is not really a change in behaviour: It was always utterly broken. The one weird thing about it is that `delimiter=None` means "any whitespace", while `quote=None` and `comments=None` mean that no quote/comment character exists at all.
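To illustrate the resulting semantics (a sketch, using only behaviour
described above)::

    import io
    import numpy as np

    # delimiter=None: any run of whitespace separates fields
    np.loadtxt(io.StringIO("1 2\t3\n4 5 6"))

    # comments=None: '#' is ordinary data, not a comment character
    np.loadtxt(io.StringIO("a#b\nc#d"), dtype="U3", comments=None)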
--- .../src/multiarray/textreading/readtext.c | 14 ++--- numpy/lib/npyio.py | 56 ++++++++++--------- 2 files changed, 38 insertions(+), 32 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index e764d9537f6c..b7330d8712c2 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -85,16 +85,16 @@ _readtext_from_stream(stream *s, parser_config *pc, static int parse_control_character(PyObject *obj, Py_UCS4 *character) { - if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) > 1) { - PyErr_Format(PyExc_TypeError, - "Control character must be a single unicode character or " - "empty unicode string; but got: %.100R", obj); - return 0; - } - if (PyUnicode_GET_LENGTH(obj) == 0) { + if (obj == Py_None) { *character = (Py_UCS4)-1; /* character beyond unicode range */ return 1; } + if (!PyUnicode_Check(obj) || PyUnicode_GetLength(obj) != 1) { + PyErr_Format(PyExc_TypeError, + "Text reading control character must be a single unicode " + "character or None; but got: %.100R", obj); + return 0; + } *character = PyUnicode_READ_CHAR(obj, 0); return 1; } diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 0db5208c3f27..330eca6421b5 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -811,15 +811,17 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', The filename or the file to be read. delimiter : str, optional Field delimiter of the fields in line of the file. - Default is a comma, ','. - comment : str or sequence of str, optional + Default is a comma, ','. If None, any sequence of whitespace is + considered a delimiter. + comment : str or sequence of str or None, optional Character that begins a comment. All text from the comment character to the end of the line is ignored. Multiple comments or multiple-character comment strings are supported, but may be slower and `quote` must be empty if used. + Use None to disable all use of comments. - quote : str or None, optional + quote : str or None, optional Character that is used to quote string fields. Default is '"' - (a double quote). + (a double quote). Use None to disable quote support. imaginary_unit : str, optional Character that represents the imaginary unit `sqrt(-1)`. Default is 'j'. @@ -929,29 +931,33 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', _ensure_ndmin_ndarray_check_param(ndmin) - if not isinstance(comment, str): + if comment is None: + comments = None + elif isinstance(comment, str): + if len(comment) > 1: # length of 0 is rejected later + comments = (comment,) + comment = None + else: + comments = None + else: # assume comments are a sequence of strings comments = tuple(comment) + comment = None + if len(comments) == 0: + comments = None # No comments at all + elif len(comments) == 1: + # If there is only one comment, and that comment has one character, + # the normal parsing can deal with it just fine.
if isinstance(comments[0], str) and len(comments[0]) == 1: comment = comments[0] comments = None - elif len(comment) > 1: - comments = (comment,) - comment = '' - else: - comments = None # comment is now either a 1 or 0 character string or a tuple: if comments is not None: - assert comment == '' # Note: An earlier version supported two character comments (and could # have been extended to multiple characters); we assume this is # rare enough to not optimize for. - if quote != "": + if quote is not None: raise ValueError( "when multiple comments or a multi-character comment is " "given, quotes are not supported. In this case the quote " "character must be set to the empty string: `quote=''`.") @@ -1073,7 +1079,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', @set_module('numpy') def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0, encoding='bytes', max_rows=None, *, like=None): + ndmin=0, encoding='bytes', max_rows=None, *, quote=None, like=None): r""" Load data from a text file. @@ -1092,7 +1098,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, each row will be interpreted as an element of the array. In this case, the number of columns used must match the number of fields in the data-type. - comments : str or sequence of str, optional + comments : str or sequence of str or None, optional The characters or list of characters used to indicate the start of a comment. None implies no comments. For backwards compatibility, byte strings will be decoded as 'latin1'. The default is '#'. @@ -1143,6 +1149,10 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, is to read all the lines. .. versionadded:: 1.16.0 + quote : unicode character or None, optional + If given (normally ``"``), quoting support is enabled. Double quotes + are considered a single escaped one if found within a quoted field + (supporting the Excel csv dialect). ${ARRAY_FUNCTION_LIKE} .. versionadded:: 1.20.0 @@ -1211,9 +1221,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, max_rows=max_rows, like=like ) - if delimiter is None: - delimiter = '' - elif isinstance(delimiter, bytes): + if isinstance(delimiter, bytes): delimiter.decode("latin1") if dtype is None: @@ -1221,9 +1229,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, comment = comments # Type conversions for Py3 convenience - if comment is None: - comment = '' - else: + if comment is not None: if isinstance(comment, (str, bytes)): comment = [comment] comment = [ @@ -1232,7 +1238,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, converters=converters, skiprows=skiprows, usecols=usecols, unpack=unpack, ndmin=ndmin, encoding=encoding, - max_rows=max_rows, quote='') + max_rows=max_rows, quote=quote) return arr From 66a61b03658f3c9f312505dcf7eab07e4cf91ac6 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Thu, 6 Jan 2022 14:51:41 -0800 Subject: [PATCH 12/70] Port over tests from npreadtext test suite - Add test for parsing scientific notation. - Add multiple-char comment test. - Port over tests for structured dtypes. - Add tests for exceptions on skiprows/max_rows. - Port over ndmin tests. - Make structured data reusable, add unpack tests. - Port over delimiter tests. - Port over maxrows test w/ various dtypes. - Port over test of exception msg on parse failure. - Port over test for converters w/neg indices. - Port over usecols tests. - Port over unicode tests. - Port over more converter tests.
- Port over test for large rows. - Port over test for string-len discovery. - Port over float conversion accuracy test. - Port over bool test. - Add test for implicit float->int conversion. - Port over complex parsing tests. - Port over tests for reading from generator. - Port over object cleanup test. - Port over bytes incompat test. - Port over converters tests. Co-authored-by: Warren Weckesser Co-authored-by: Sebastian Berg --- numpy/lib/tests/test_io.py | 484 +++++++++++++++++++++++++++++++++++++ 1 file changed, 484 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index a3f2ec411d6c..1884349db320 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -2396,6 +2396,13 @@ def test_auto_dtype_largeint(self): assert_equal(test['f1'], 17179869184) assert_equal(test['f2'], 1024) + def test_unpack_float_data(self): + txt = TextIO("1,2,3\n4,5,6\n7,8,9\n0.0,1.0,2.0") + a, b, c = np.loadtxt(txt, delimiter=",", unpack=True) + assert_array_equal(a, np.array([1.0, 4.0, 7.0, 0.0])) + assert_array_equal(b, np.array([2.0, 5.0, 8.0, 1.0])) + assert_array_equal(c, np.array([3.0, 6.0, 9.0, 2.0])) + def test_unpack_structured(self): # Regression test for gh-4341 # Unpacking should work on structured arrays @@ -2682,3 +2689,480 @@ def test_load_refcount(): with assert_no_gc_cycles(): x = np.loadtxt(TextIO("0 1 2 3"), dtype=dt) assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt)) + + +def test_loadtxt_scientific_notation(): + """Test that both 'e' and 'E' are parsed correctly.""" + data = TextIO( + ( + "1.0e-1,2.0E1,3.0\n" + "4.0e-2,5.0E-1,6.0\n" + "7.0e-3,8.0E1,9.0\n" + "0.0e-4,1.0E-1,2.0" + ) + ) + expected = np.array( + [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]] + ) + assert_array_equal(np.loadtxt(data, delimiter=","), expected) + + +@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"]) +def test_loadtxt_comment_multiple_chars(comment): + content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n" + txt = TextIO(content.replace("#", comment)) + a = np.loadtxt(txt, delimiter=",", comments=comment) + assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]]) + + +@pytest.fixture +def mixed_types_structured(): + """ + Fixture providing heterogeneous input data with a structured dtype, along + with the associated structured array.
+ """ + data = TextIO( + ( + "1000;2.4;alpha;-34\n" + "2000;3.1;beta;29\n" + "3500;9.9;gamma;120\n" + "4090;8.1;delta;0\n" + "5001;4.4;epsilon;-99\n" + "6543;7.8;omega;-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + return data, dtype, expected + + +@pytest.mark.parametrize('skiprows', [0, 1, 2, 3]) +def test_loadtxt_structured_dtype_and_skiprows_no_empty_lines( + skiprows, mixed_types_structured + ): + data, dtype, expected = mixed_types_structured + a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows) + assert_array_equal(a, expected[skiprows:]) + + +def test_loadtxt_unpack_structured(mixed_types_structured): + data, dtype, expected = mixed_types_structured + + a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True) + assert_array_equal(a, expected["f0"]) + assert_array_equal(b, expected["f1"]) + assert_array_equal(c, expected["f2"]) + assert_array_equal(d, expected["f3"]) + + +def test_loadtxt_structured_dtype_with_shape(): + dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)]) + data = TextIO("0,1,2,3\n6,7,8,9\n") + expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected) + + +def test_loadtxt_structured_dtype_with_multi_shape(): + dtype = np.dtype([("a", "u1", (2, 2))]) + data = TextIO("0 1 2 3\n") + expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype) + assert_array_equal(np.loadtxt(data, dtype=dtype), expected) + + +def test_loadtxt_nested_structured_subarray(): + # Test from gh-16678 + point = np.dtype([('x', float), ('y', float)]) + dt = np.dtype([('code', int), ('points', point, (2,))]) + data = TextIO("100,1,2,3,4\n200,5,6,7,8\n") + expected = np.array( + [ + (100, [(1., 2.), (3., 4.)]), + (200, [(5., 6.), (7., 8.)]), + ], + dtype=dt + ) + assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) + + +def test_loadtxt_structured_dtype_offsets(): + # An aligned structured dtype will have additional padding + dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) + data = TextIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") + expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_loadtxt_exception_negative_row_limits(param): + """skiprows and max_rows should raise for negative parameters.""" + with pytest.raises(ValueError, match="argument must be nonnegative"): + np.loadtxt("foo.bar", **{param: -3}) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_loadtxt_exception_noninteger_row_limits(param): + with pytest.raises(TypeError, match="argument must be an integer"): + np.loadtxt("foo.bar", **{param: 1.0}) + + +@pytest.mark.parametrize( + "data, shape", + [ + ("1 2 3 4 5\n", (1, 5)), # Single row + ("1\n2\n3\n4\n5\n", (5, 1)), # Single column + ] +) +def test_loadtxt_ndmin_single_row_or_col(data, shape): + arr = np.array([1, 2, 3, 4, 5]) + arr2d = arr.reshape(shape) + + assert_array_equal(np.loadtxt(TextIO(data), dtype=int), arr) + assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=0), arr) + assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=1), arr) + 
assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=2), arr2d) + + +@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) +def test_loadtxt_bad_ndmin(badval): + with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): + np.loadtxt("foo.bar", ndmin=badval) + + +@pytest.mark.parametrize( + "ws", + ( + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_loadtxt_blank_lines_spaces_delimit(ws): + txt = StringIO( + f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1" + ) + # NOTE: It is unclear that the ` # comment` should succeed, except + # for delimiter=None, which should use any whitespace (and maybe + # should just be implemented closer to Python). + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter='', comments="#"), expected + ) + + +def test_loadtxt_blank_lines_normal_delimiter(): + txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1') + expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) + assert_equal( + np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected + ) + + +@pytest.mark.parametrize("dtype", (float, object)) +def test_loadtxt_maxrows_no_blank_lines(dtype): + txt = TextIO("1.5,2.5\n3.0,4.0\n5.5,6.0") + res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2) + assert_equal(res.dtype, dtype) + assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) + + +@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) +def test_loadtxt_exception_message_bad_values(dtype): + txt = TextIO("1.0,2.0\n3.0,XXX\n5.5,6.0") + msg = f"could not convert string .XXX. to {dtype} at row 1, column 2" + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + +def test_loadtxt_converters_negative_indices(): + txt = TextIO('1.5,2.5\n3.0,XXX\n5.5,6.0') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]]) + res = np.loadtxt( + txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None + ) + assert_equal(res, expected) + + +def test_loadtxt_converters_negative_indices_with_usecols(): + txt = TextIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') + conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} + expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) + res = np.loadtxt( + txt, + dtype=np.float64, + delimiter=",", + converters=conv, + usecols=[0, -1], + encoding=None, + ) + assert_equal(res, expected) + + +def test_loadtxt_ragged_usecols(): + # usecols, and negative ones, work even with varying number of columns. + txt = TextIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + expected = np.array([[0, 0], [0, 0], [0, 0]]) + res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) + assert_equal(res, expected) + + +def test_loadtxt_empty_usecols(): + txt = TextIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") + res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) + assert res.shape == (3,) + assert res.dtype == np.dtype([]) + + +@pytest.mark.parametrize("c1", ["a", "の", "🫕"]) +@pytest.mark.parametrize("c2", ["a", "の", "🫕"]) +def test_loadtxt_large_unicode_characters(c1, c2): + # c1 and c2 span ascii, 16bit and 32bit range.
+ txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") + res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") + expected = np.array( + [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], + dtype=np.dtype('U12') + ) + assert_equal(res, expected) + + +def test_loadtxt_unicode_with_converter(): + txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") + conv = {0: lambda s: s.upper()} + res = np.loadtxt( + txt, + dtype=np.dtype("U12"), + converters=conv, + delimiter=",", + encoding=None + ) + expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) + assert_equal(res, expected) + + +def test_loadtxt_converter_with_structured_dtype(): + txt = TextIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') + dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) + conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} + res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) + expected = np.array( + [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt + ) + assert_equal(res, expected) + + +def test_loadtxt_converter_with_unicode_dtype(): + """ + With the default 'bytes' encoding, tokens are encoded prior to being passed + to the converter. This means that the output of the converter may be bytes + instead of unicode as expected by `read_rows`. + + This test checks that outputs from the above scenario are properly decoded + prior to parsing by `read_rows`. + """ + txt = StringIO('abc,def\nrst,xyz') + conv = bytes.upper + res = np.loadtxt(txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") + expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) + assert_equal(res, expected) + + +def test_loadtxt_read_huge_row(): + row = "1.5, 2.5," * 50000 + row = row[:-1] + "\n" + txt = TextIO(row * 2) + res = np.loadtxt(txt, delimiter=",", dtype=float) + assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) + + +@pytest.mark.parametrize( + ("given_dtype", "expected_dtype"), + [ + ("S", np.dtype("S5")), + ("U", np.dtype("U5")), + ], +) +def test_loadtxt_string_no_length_given(given_dtype, expected_dtype): + """ + The given dtype is just 'S' or 'U' with no length. In these cases, the + length of the resulting dtype is determined by the longest string found + in the file. + """ + txt = TextIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") + res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") + expected = np.array( + [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype + ) + assert_equal(res, expected) + assert_equal(res.dtype, expected_dtype) + + +def test_loadtxt_float_conversion(): + """ + Some tests that the conversion to float64 works as accurately as the Python + built-in `float` function. In a naive version of the float parser, these + strings resulted in values that were off by an ULP or two. 
+ """ + strings = [ + '0.9999999999999999', + '9876543210.123456', + '5.43215432154321e+300', + '0.901', + '0.333', + ] + txt = TextIO('\n'.join(strings)) + res = np.loadtxt(txt) + expected = np.array([float(s) for s in strings]) + assert_equal(res, expected) + + +def test_loadtxt_bool(): + # Simple test for bool via integer + txt = TextIO("1, 0\n10, -1") + res = np.loadtxt(txt, dtype=bool, delimiter=",") + assert res.dtype == bool + assert_array_equal(res, [[True, False], [True, True]]) + # Make sure we use only 1 and 0 on the byte level: + assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) + + +@pytest.mark.parametrize( + "dtype", + ( + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ) +) +def test_loadtxt_implicit_cast_float_to_int(dtype): + """ + Currently the parser_config flag `allow_flot_for_int` is hardcoded to be + true. This means that if the parsing of an integer value fails, the code + will attempt to parse it as a float, then cast the float value to an + integer. This flag is only used when an explicit dtype is given. + """ + txt = TextIO("1.0, 2.1, 3.7\n4, 5, 6") + res = np.loadtxt(txt, dtype=dtype, delimiter=",") + expected = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype) + assert_equal(res, expected) + + +@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) +@pytest.mark.parametrize("with_parens", (False, True)) +def test_loadtxt_complex_parsing(dtype, with_parens): + s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" + if not with_parens: + s = s.replace("(", "").replace(")", "") + + res = np.loadtxt(TextIO(s), dtype=dtype, delimiter=",") + expected = np.array( + [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype + ) + assert_equal(res, expected) + + +def test_loadtxt_read_from_generator(): + def gen(): + for i in range(4): + yield f"{i},{2*i},{i**2}" + + res = np.loadtxt(gen(), dtype=int, delimiter=",") + expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) + assert_equal(res, expected) + + +def test_loadtxt_read_from_generator_multitype(): + def gen(): + for i in range(3): + yield f"{i} {i / 4}" + + res = np.loadtxt(gen(), dtype="i, d", delimiter=" ") + expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d") + assert_equal(res, expected) + + +def test_loadtxt_read_from_bad_generator(): + def gen(): + for entry in ["1,2", b"3, 5", 12738]: + yield entry + + with pytest.raises( + TypeError, match=r"non-string returned while reading data" + ): + np.loadtxt(gen(), dtype="i, i", delimiter=",") + + +@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") +def test_loadtxt_object_cleanup_on_read_error(): + sentinel = object() + + already_read = 0 + def conv(x): + nonlocal already_read + if already_read > 4999: + raise ValueError("failed half-way through!") + already_read += 1 + return sentinel + + txt = TextIO("x\n" * 10000) + + with pytest.raises(ValueError, match="at row 5000, column 1"): + np.loadtxt(txt, dtype=object, converters={0: conv}) + + assert sys.getrefcount(sentinel) == 2 + + +def test_loadtxt_character_not_bytes_compatible(): + """Test exception when a character cannot be encoded as 'S'.""" + data = StringIO("–") # == \u2013 + with pytest.raises(ValueError): + np.loadtxt(data, dtype="S5") + + +@pytest.mark.parametrize("conv", (0, [float], "")) +def test_loadtxt_invalid_converter(conv): + msg = ( + "converters must be a dictionary mapping columns to converter " + "functions or a single callable." 
+ ) + with pytest.raises(TypeError, match=msg): + np.loadtxt(TextIO("1 2\n3 4"), converters=conv) + + +def test_loadtxt_converters_dict_raises_non_integer_key(): + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(TextIO("1 2\n3 4"), converters={"a": int}) + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(TextIO("1 2\n3 4"), converters={"a": int}, usecols=0) + + +@pytest.mark.parametrize("bad_col_ind", (3, -3)) +def test_loadtxt_converters_dict_raises_non_col_key(bad_col_ind): + data = TextIO("1 2\n3 4") + with pytest.raises(ValueError, match="converter specified for column"): + np.loadtxt(data, converters={bad_col_ind: int}) + + +def test_loadtxt_converters_dict_raises_val_not_callable(): + with pytest.raises( + TypeError, match="values of the converters dictionary must be callable" + ): + np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) From 10a90f0a7aba057958c2c5f405096c7450af91e1 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 7 Jan 2022 21:52:07 -0600 Subject: [PATCH 13/70] TST: Small fixups for tests to make sure they pass again --- numpy/lib/tests/test_io.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 1884349db320..26eb3a1c57b0 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -2858,7 +2858,7 @@ def test_loadtxt_blank_lines_spaces_delimit(ws): # should just be implemented closer to Python expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) assert_equal( - np.loadtxt(txt, dtype=int, delimiter='', comments="#"), expected + np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected ) @@ -2880,8 +2880,8 @@ def test_loadtxt_maxrows_no_blank_lines(dtype): @pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) def test_loadtxt_exception_message_bad_values(dtype): - txt = TextIO("1.0,2.0\n3.0,XXX\n5.5,6.0") - msg = f"could not convert string .XXX. to {dtype} at row 1, column 2" + txt = TextIO("1,2\n3,XXX\n5,6") + msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2" with pytest.raises(ValueError, match=msg): np.loadtxt(txt, dtype=dtype, delimiter=",") From 2a0a4f4c4d4995249f29ac71a2e31952b44d7832 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 7 Jan 2022 21:52:24 -0600 Subject: [PATCH 14/70] TST: Fix test to align with stricter integer parsing --- numpy/lib/tests/test_io.py | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 26eb3a1c57b0..c277f1ddc314 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3039,31 +3039,11 @@ def test_loadtxt_bool(): assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) -@pytest.mark.parametrize( - "dtype", - ( - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ) -) -def test_loadtxt_implicit_cast_float_to_int(dtype): - """ - Currently the parser_config flag `allow_flot_for_int` is hardcoded to be - true. This means that if the parsing of an integer value fails, the code - will attempt to parse it as a float, then cast the float value to an - integer. This flag is only used when an explicit dtype is given. 
- """ +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_loadtxt_implicit_cast_float_to_int_fails(dtype): txt = TextIO("1.0, 2.1, 3.7\n4, 5, 6") - res = np.loadtxt(txt, dtype=dtype, delimiter=",") - expected = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype) - assert_equal(res, expected) - + with pytest.raises(ValueError): + np.loadtxt(txt, dtype=dtype, delimiter=",") @pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) @pytest.mark.parametrize("with_parens", (False, True)) From 1270a1768f7fe166b61fc3cac3ad0a2f91ea1b21 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 10 Jan 2022 15:06:03 -0600 Subject: [PATCH 15/70] BUG: Add missing quote copy The code that copied the quote character in when a double-quote occurred went AWOL. Add a copy path back. --- numpy/core/src/multiarray/textreading/tokenize.c.src | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index d0671050bcef..68387a022583 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -236,6 +236,11 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) case TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE: if (*pos == config->quote) { + /* Copy the quote character directly from the config: */ + if (copy_to_field_buffer_Py_UCS4(ts, + &config->quote, &config->quote+1) < 0) { + return -1; + } ts->state = TOKENIZE_QUOTED; pos++; } From b670ff7a188bb22ef2dd3437242394a61e805119 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 12:04:56 -0800 Subject: [PATCH 16/70] Rename quotechar param and update docstring. --- numpy/lib/npyio.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 330eca6421b5..ac71c5b0d55d 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1079,7 +1079,8 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', @set_module('numpy') def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, - ndmin=0, encoding='bytes', max_rows=None, *, quote=None, like=None): + ndmin=0, encoding='bytes', max_rows=None, *, quotechar=None, + like=None): r""" Load data from a text file. @@ -1149,10 +1150,16 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, is to read all the lines. .. versionadded:: 1.16.0 - quote : unicode character or None, optional - If given (normally ``"``) quoting support is enabled. Double quotes - are considered a single escaped ones if found within a quoted field - (supporting the Excel csv dialect). + quotechar : unicode character or None, optional + The character used to denote the start and end of a quoted item. + Occurrences of the delimiter or comment characters are ignored within + a quoted item. The default value is ``quotechar=None``, which means + quoting support is disabled. + + If two consecutive instances of `quotechar` are found within a quoted + field, the first is treated as an escape character. See examples. + + .. versionadded:: 1.23.0 ${ARRAY_FUNCTION_LIKE} .. 
versionadded:: 1.20.0 @@ -1238,7 +1245,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, converters=converters, skiprows=skiprows, usecols=usecols, unpack=unpack, ndmin=ndmin, encoding=encoding, - max_rows=max_rows, quote=quote) + max_rows=max_rows, quote=quotechar) return arr From bbf14c01022023f0be0b3d25af2d315a6e42598e Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 12:05:13 -0800 Subject: [PATCH 17/70] TST: Add tests for quote character support. --- numpy/lib/tests/test_io.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index c277f1ddc314..97d8f5b14578 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3146,3 +3146,18 @@ def test_loadtxt_converters_dict_raises_val_not_callable(): TypeError, match="values of the converters dictionary must be callable" ): np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) + + +@pytest.mark.parametrize("q", ('"', "'", "`")) +def test_loadtxt_quoted_field(q): + txt = TextIO( + f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n" + ) + dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) + expected = np.array( + [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype + ) + + # Test quote param default + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) + assert_array_equal(res, expected) From 942d4f8ab095f152f5e59e43cada49d3d15839d0 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 12:15:02 -0800 Subject: [PATCH 18/70] Add test to check quoting support disabled by default. --- numpy/lib/tests/test_io.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 97d8f5b14578..a0bc2e135e29 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3161,3 +3161,19 @@ def test_loadtxt_quoted_field(q): # Test quote param default res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) assert_array_equal(res, expected) + + +def test_loadtxt_quote_support_default(): + """Support for quoted fields is disabled by default.""" + txt = TextIO('"lat,long", 45, 30\n') + dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)]) + + with pytest.raises(ValueError, match="the number of columns changed"): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + # Enable quoting support with non-None value for quotechar param + txt.seek(0) + expected = np.array([("lat,long", 45., 30.)], dtype=dtype) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, expected) From 2912231adb4b4b1a89a48277d352f9c93248282f Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 12:31:16 -0800 Subject: [PATCH 19/70] Add tests for quote+multichar comments. Also correct exception message. --- numpy/lib/npyio.py | 4 ++-- numpy/lib/tests/test_io.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index ac71c5b0d55d..b5723dee59e2 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -960,8 +960,8 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', if quote is not None: raise ValueError( "when multiple comments or a multi-character comment is " - "given, quotes are not supported. 
In this case the quote " - "character must be set to the empty string: `quote=''`.") + "given, quotes are not supported. In this case quotechar " + "must be set to None.") if len(imaginary_unit) != 1: raise ValueError('len(imaginary_unit) must be 1.') diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index a0bc2e135e29..d73d50959ace 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3177,3 +3177,26 @@ def test_loadtxt_quote_support_default(): res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') assert_array_equal(res, expected) + + +def test_loadtxt_quotechar_multichar_error(): + txt = StringIO("1,2\n3,4") + msg = r".*must be a single unicode character or None" + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, delimiter=",", quotechar="''") + + +def test_loadtxt_comment_multichar_error_with_quote(): + txt = StringIO("1,2\n3,4") + msg = ( + "when multiple comments or a multi-character comment is given, " + "quotes are not supported." + ) + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments="123", quotechar='"') + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"') + + # A single character string in a tuple is unpacked though: + res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") + assert_equal(res, [[1, 2], [3, 4]]) From ff5eb6406a80d7858b93516f96f3e76f57236eb3 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 12:37:32 -0800 Subject: [PATCH 20/70] TST: structured dtype w/ quotes. --- numpy/lib/tests/test_io.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index d73d50959ace..32beddfdb862 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3200,3 +3200,32 @@ def test_loadtxt_comment_multichar_error_with_quote(): # A single character string in a tuple is unpacked though: res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") assert_equal(res, [[1, 2], [3, 4]]) + + +def test_loadtxt_structured_dtype_with_quotes(): + data = TextIO( + ( + "1000;2.4;'alpha';-34\n" + "2000;3.1;'beta';29\n" + "3500;9.9;'gamma';120\n" + "4090;8.1;'delta';0\n" + "5001;4.4;'epsilon';-99\n" + "6543;7.8;'omega';-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") + assert_array_equal(res, expected) From 1489805af8ec0f2b27e4f7439bdc4e48acfdaa6a Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 12:57:42 -0800 Subject: [PATCH 21/70] Add tests for empty quotes and escaped quotechars. 
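For reference, the escaping convention exercised here is the doubled-quote
rule of the Excel csv dialect; Python's csv module implements the same rule
and can serve as a comparison point::

    import csv, io
    list(csv.reader(io.StringIO('"Hello, my name is ""Monty""!"')))
    # -> [['Hello, my name is "Monty"!']]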
--- numpy/lib/tests/test_io.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 32beddfdb862..e218ba6575ae 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3229,3 +3229,17 @@ def test_loadtxt_structured_dtype_with_quotes(): ) res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") assert_array_equal(res, expected) + + +def test_loadtxt_quoted_field_is_not_empty(): + txt = StringIO('1\n\n"4"\n""') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + + +def test_loadtxt_consecutive_quotechar_escaped(): + txt = TextIO('"Hello, my name is ""Monty""!"') + expected = np.array('Hello, my name is "Monty"!', dtype="U40") + res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') + assert_equal(res, expected) From 156964d151cf267ed2e032e21af6492143c6c13e Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 10 Jan 2022 13:02:23 -0800 Subject: [PATCH 22/70] rm incorrect comment. --- numpy/lib/tests/test_io.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index e218ba6575ae..7cc1b7ede960 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3158,7 +3158,6 @@ def test_loadtxt_quoted_field(q): [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype ) - # Test quote param default res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) assert_array_equal(res, expected) From 6d116b4d8686d9bbbb6be0d53ec3b65ed70fcb0b Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Mon, 10 Jan 2022 22:51:54 -0600 Subject: [PATCH 23/70] DOC: Add release notes for loadtxt changes These are pretty verbose right now, so we may wish to move some of it into the docs or similar. I think they cover most/all important changes, although some very rare corner-case changes are currently ignored (e.g. we don't support embedded newlines in rows...) --- .../upcoming_changes/20580.compatibility.rst | 32 +++++++++++++++++++ .../upcoming_changes/20580.new_feature.rst | 8 +++++ .../upcoming_changes/20580.performance.rst | 4 +++ 3 files changed, 44 insertions(+) create mode 100644 doc/release/upcoming_changes/20580.compatibility.rst create mode 100644 doc/release/upcoming_changes/20580.new_feature.rst create mode 100644 doc/release/upcoming_changes/20580.performance.rst diff --git a/doc/release/upcoming_changes/20580.compatibility.rst b/doc/release/upcoming_changes/20580.compatibility.rst new file mode 100644 index 000000000000..740764357e70 --- /dev/null +++ b/doc/release/upcoming_changes/20580.compatibility.rst @@ -0,0 +1,32 @@ +``np.loadtxt`` has received several changes +------------------------------------------- + +The row counting of `numpy.loadtxt` was fixed. ``loadtxt`` ignores fully +empty lines in the file, but previously counted them towards ``max_rows``. +When ``max_rows`` is used and the file contains empty lines, these will now +not be counted. Previously, it was possible that the result contained fewer +than ``max_rows`` rows even though more data was available to be read. +If the old behaviour is required, ``itertools.islice`` may be used:: + + import itertools + lines = itertools.islice(open("file"), 0, max_rows) + result = np.loadtxt(lines, ...) + +While generally much faster and improved, `numpy.loadtxt` may now fail to +convert certain strings to numbers that were previously successfully read.
+The most important cases for this are: +* Parsing floating point values such as ``1.0`` into integers will now fail. +* Parsing hexadecimal floats such as ``0x3p3`` will fail. +* An ``_`` was previously accepted as a thousands delimiter, as in ``100_000``. + This will now result in an error. + +If you experience these limitations, they can all be worked around by passing +appropriate ``converters=``. NumPy now supports passing a single converter +to be used for all columns to make this more convenient. +For example, ``converters=float.fromhex`` can read hexadecimal float numbers +and ``converters=int`` will be able to read ``100_000``. + +Further, the error messages have been generally improved. However, this means +that error types may differ. In particular, a ``ValueError`` is now always +raised when parsing of a single entry fails. + diff --git a/doc/release/upcoming_changes/20580.new_feature.rst b/doc/release/upcoming_changes/20580.new_feature.rst new file mode 100644 index 000000000000..b47049ef20e3 --- /dev/null +++ b/doc/release/upcoming_changes/20580.new_feature.rst @@ -0,0 +1,8 @@ +``np.loadtxt`` now supports quote character and single converter function +------------------------------------------------------------------------- +`numpy.loadtxt` now supports an additional ``quotechar`` keyword argument +which is not set by default. Using ``quotechar='"'`` will read quoted fields +as used by the Excel CSV dialect. + +Further, it is now possible to pass a single callable rather than a dictionary +for the ``converters`` argument. diff --git a/doc/release/upcoming_changes/20580.performance.rst b/doc/release/upcoming_changes/20580.performance.rst new file mode 100644 index 000000000000..baae08765c11 --- /dev/null +++ b/doc/release/upcoming_changes/20580.performance.rst @@ -0,0 +1,4 @@ +Faster ``np.loadtxt`` +--------------------- +`numpy.loadtxt` is now generally much faster than before, as most of it +is now implemented in C.
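For example, based only on the behaviour described in these notes (a sketch,
not an exhaustive recipe)::

    import io
    import numpy as np

    # Work around inputs the new parser rejects by supplying a converter:
    np.loadtxt(io.StringIO("100_000\n200_000"), converters=int, dtype=int)
    np.loadtxt(io.StringIO("0x3p3\n0x1p-1"), converters=float.fromhex)

    # Excel-dialect quoting via the new ``quotechar`` keyword:
    np.loadtxt(io.StringIO('"alpha, x", 2.5'), dtype="U16,f8",
               delimiter=",", quotechar='"')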
From ad0a8e4364cbac98448c80aa246f832b48b08caf Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 12:39:56 -0600 Subject: [PATCH 24/70] MAINT: Replace last uses of raw malloc with PyMem_MALLOC --- .../multiarray/textreading/stream_pyobject.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c index ccc902657596..0e050e90cebb 100644 --- a/numpy/core/src/multiarray/textreading/stream_pyobject.c +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -118,8 +118,8 @@ fb_del(stream *strm) Py_XDECREF(fb->chunksize); Py_XDECREF(fb->chunk); - free(fb); - free(strm); + PyMem_FREE(fb); + PyMem_FREE(strm); return 0; } @@ -131,7 +131,7 @@ stream_python_file(PyObject *obj, const char *encoding) python_chunks_from_file *fb; stream *strm; - fb = (python_chunks_from_file *) malloc(sizeof(python_chunks_from_file)); + fb = (python_chunks_from_file *)PyMem_MALLOC(sizeof(python_chunks_from_file)); if (fb == NULL) { PyErr_NoMemory(); return NULL; @@ -143,10 +143,10 @@ stream_python_file(PyObject *obj, const char *encoding) fb->chunk = NULL; fb->encoding = encoding; - strm = (stream *) malloc(sizeof(stream)); + strm = (stream *)PyMem_MALLOC(sizeof(stream)); if (strm == NULL) { PyErr_NoMemory(); - free(fb); + PyMem_FREE(fb); return NULL; } @@ -197,8 +197,8 @@ it_del(stream *strm) Py_XDECREF(it->iterator); Py_XDECREF(it->line); - free(it); - free(strm); + PyMem_FREE(it); + PyMem_FREE(strm); return 0; } @@ -235,7 +235,7 @@ stream_python_iterable(PyObject *obj, const char *encoding) python_lines_from_iterator *it; stream *strm; - it = (python_lines_from_iterator *)malloc(sizeof(*it)); + it = (python_lines_from_iterator *)PyMem_MALLOC(sizeof(*it)); if (it == NULL) { PyErr_NoMemory(); return NULL; @@ -245,10 +245,10 @@ stream_python_iterable(PyObject *obj, const char *encoding) it->line = NULL; it->encoding = encoding; - strm = (stream *) malloc(sizeof(stream)); + strm = (stream *)PyMem_MALLOC(sizeof(stream)); if (strm == NULL) { PyErr_NoMemory(); - free(it); + PyMem_FREE(it); return NULL; } if (!PyIter_Check(obj)) { From 4ca4c1a32f9cecddddbb6a02e9f56be4ccac190f Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 12:50:58 -0600 Subject: [PATCH 25/70] MAINT: Fixup include guards and use NPY_NO_EXPORT (or static) throughout --- .../src/multiarray/textreading/conversions.c | 18 +++++++------- .../src/multiarray/textreading/conversions.h | 24 +++++++++---------- .../src/multiarray/textreading/field_types.c | 4 ++-- .../src/multiarray/textreading/field_types.h | 10 ++++---- .../core/src/multiarray/textreading/growth.h | 6 ++--- .../multiarray/textreading/parser_config.h | 6 ++--- .../src/multiarray/textreading/readtext.h | 6 ++--- numpy/core/src/multiarray/textreading/rows.c | 4 ++-- numpy/core/src/multiarray/textreading/rows.h | 8 +++---- .../src/multiarray/textreading/str_to_int.c | 4 ++-- .../src/multiarray/textreading/str_to_int.h | 8 +++---- .../core/src/multiarray/textreading/stream.h | 6 ++--- .../multiarray/textreading/stream_pyobject.c | 4 ++-- .../multiarray/textreading/stream_pyobject.h | 10 ++++---- .../src/multiarray/textreading/tokenize.c.src | 6 ++--- .../src/multiarray/textreading/tokenize.h | 13 +++++----- 16 files changed, 68 insertions(+), 69 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index 26f68d4cfda7..04060baa1ea1 100644 
--- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -14,7 +14,7 @@ /* * Coercion to boolean is done via integer right now. */ -int +NPY_NO_EXPORT int to_bool(PyArray_Descr *NPY_UNUSED(descr), const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(pconfig)) @@ -97,7 +97,7 @@ double_from_ucs4( } -int +NPY_NO_EXPORT int to_float(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(pconfig)) @@ -120,7 +120,7 @@ to_float(PyArray_Descr *descr, } -int +NPY_NO_EXPORT int to_double(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(pconfig)) @@ -205,7 +205,7 @@ to_complex_int( } -int +NPY_NO_EXPORT int to_cfloat(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig) @@ -229,7 +229,7 @@ to_cfloat(PyArray_Descr *descr, } -int +NPY_NO_EXPORT int to_cdouble(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig) @@ -255,7 +255,7 @@ to_cdouble(PyArray_Descr *descr, /* * String and unicode conversion functions. */ -int +NPY_NO_EXPORT int to_string(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(unused)) @@ -284,7 +284,7 @@ to_string(PyArray_Descr *descr, } -int +NPY_NO_EXPORT int to_unicode(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(unused)) @@ -334,7 +334,7 @@ call_converter_function( } -int +NPY_NO_EXPORT int to_generic_with_converter(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *config, PyObject *func) @@ -359,7 +359,7 @@ to_generic_with_converter(PyArray_Descr *descr, } -int +NPY_NO_EXPORT int to_generic(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *config) diff --git a/numpy/core/src/multiarray/textreading/conversions.h b/numpy/core/src/multiarray/textreading/conversions.h index 6308c10d4248..222eea4e7160 100644 --- a/numpy/core/src/multiarray/textreading/conversions.h +++ b/numpy/core/src/multiarray/textreading/conversions.h @@ -1,5 +1,5 @@ -#ifndef CONVERSIONS_H -#define CONVERSIONS_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ #include @@ -9,49 +9,49 @@ #include "textreading/parser_config.h" -int +NPY_NO_EXPORT int to_bool(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig); -int +NPY_NO_EXPORT int to_float(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig); -int +NPY_NO_EXPORT int to_double(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig); -int +NPY_NO_EXPORT int to_cfloat(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig); -int +NPY_NO_EXPORT int to_cdouble(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig); -int +NPY_NO_EXPORT int to_string(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *unused); -int +NPY_NO_EXPORT int to_unicode(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *unused); -int +NPY_NO_EXPORT int to_generic_with_converter(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, 
char *dataptr, parser_config *unused, PyObject *func); -int +NPY_NO_EXPORT int to_generic(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *pconfig); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_CONVERSIONS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c index 914c8e4d8c25..e1dbd2984bfd 100644 --- a/numpy/core/src/multiarray/textreading/field_types.c +++ b/numpy/core/src/multiarray/textreading/field_types.c @@ -10,7 +10,7 @@ #include "textreading/growth.h" -void +NPY_NO_EXPORT void field_types_xclear(int num_field_types, field_type *ft) { assert(num_field_types >= 0); if (ft == NULL) { @@ -176,7 +176,7 @@ field_type_grow_recursive(PyArray_Descr *descr, * we copy the itemsize, but the main thing is that we check for custom * converters. */ -npy_intp +NPY_NO_EXPORT npy_intp field_types_create(PyArray_Descr *descr, field_type **ft) { if (descr->subarray != NULL) { diff --git a/numpy/core/src/multiarray/textreading/field_types.h b/numpy/core/src/multiarray/textreading/field_types.h index e76ffd6d3fde..f26e00a5e921 100644 --- a/numpy/core/src/multiarray/textreading/field_types.h +++ b/numpy/core/src/multiarray/textreading/field_types.h @@ -1,6 +1,6 @@ -#ifndef _FIELD_TYPES_H_ -#define _FIELD_TYPES_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ #include #include @@ -58,10 +58,10 @@ typedef struct _field_type { } field_type; -void +NPY_NO_EXPORT void field_types_xclear(int num_field_types, field_type *ft); -npy_intp +NPY_NO_EXPORT npy_intp field_types_create(PyArray_Descr *descr, field_type **ft); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_FIELD_TYPES_H_ */ diff --git a/numpy/core/src/multiarray/textreading/growth.h b/numpy/core/src/multiarray/textreading/growth.h index d1b005e381db..237b77ad3ad6 100644 --- a/numpy/core/src/multiarray/textreading/growth.h +++ b/numpy/core/src/multiarray/textreading/growth.h @@ -1,7 +1,7 @@ -#ifndef _NPY_GROWTH_H -#define _NPY_GROWTH_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ NPY_NO_EXPORT npy_intp grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize); -#endif /*_NPY_GROWTH_H */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_GROWTH_H_ */ diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h index a07c81234974..b6e7feec0666 100644 --- a/numpy/core/src/multiarray/textreading/parser_config.h +++ b/numpy/core/src/multiarray/textreading/parser_config.h @@ -1,6 +1,6 @@ -#ifndef _PARSER_CONFIG_H_ -#define _PARSER_CONFIG_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ #include @@ -68,4 +68,4 @@ typedef struct { } parser_config; -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_PARSER_CONFIG_H_ */ diff --git a/numpy/core/src/multiarray/textreading/readtext.h b/numpy/core/src/multiarray/textreading/readtext.h index 8c470736827a..5cf48c555b4f 100644 --- a/numpy/core/src/multiarray/textreading/readtext.h +++ b/numpy/core/src/multiarray/textreading/readtext.h @@ -1,7 +1,7 @@ -#ifndef READTEXT_H_ -#define READTEXT_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ NPY_NO_EXPORT PyObject * _load_from_filelike(PyObject *self, PyObject 
*args, PyObject *kwargs); -#endif /* READTEXT_H_ */ +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_READTEXT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 6af5936c0be1..d3e4e462e538 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -29,7 +29,7 @@ /* * Create the array of converter functions from the Python converters. */ -PyObject ** +static PyObject ** create_conv_funcs( PyObject *converters, int num_fields, int32_t *usecols) { @@ -153,7 +153,7 @@ create_conv_funcs( * @returns Returns the result as an array object or NULL on error. The result * is always a new reference (even when `data_array` was passed in). */ -PyArrayObject * +NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, npy_intp max_rows, int num_field_types, field_type *field_types, parser_config *pconfig, int num_usecols, int *usecols, diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h index 773e0f8e0636..342af0a4b70d 100644 --- a/numpy/core/src/multiarray/textreading/rows.h +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -1,6 +1,6 @@ -#ifndef _ROWS_H_ -#define _ROWS_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ #define PY_SSIZE_T_CLEAN #include @@ -11,7 +11,7 @@ #include "textreading/parser_config.h" -PyArrayObject * +NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, npy_intp nrows, int num_field_types, field_type *field_types, parser_config *pconfig, int num_usecols, int *usecols, @@ -19,4 +19,4 @@ read_rows(stream *s, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_ROWS_H_ */ diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c index b0f0f1d5805b..2efb431db197 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.c +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -11,7 +11,7 @@ NPY_NO_EXPORT PyArray_Descr *double_descr = NULL; #define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ - int \ + NPY_NO_EXPORT int \ to_##intw(PyArray_Descr *descr, \ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ parser_config *pconfig) \ @@ -33,7 +33,7 @@ NPY_NO_EXPORT PyArray_Descr *double_descr = NULL; } #define DECLARE_TO_UINT(uintw, UINT_MAX) \ - int \ + NPY_NO_EXPORT int \ to_##uintw(PyArray_Descr *descr, \ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ parser_config *pconfig) \ diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h index ee1718fb35a9..e310e1ca2c18 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.h +++ b/numpy/core/src/multiarray/textreading/str_to_int.h @@ -1,5 +1,5 @@ -#ifndef STR_TO_INT_H -#define STR_TO_INT_H +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ #define NPY_NO_DEPRECATED_API NPY_API_VERSION #define _MULTIARRAYMODULE @@ -156,7 +156,7 @@ str_to_uint64( #define DECLARE_TO_INT_PROTOTYPE(intw) \ - int \ + NPY_NO_EXPORT int \ to_##intw(PyArray_Descr *descr, \ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ parser_config *pconfig); @@ -171,4 +171,4 @@ DECLARE_TO_INT_PROTOTYPE(uint16) DECLARE_TO_INT_PROTOTYPE(uint32) DECLARE_TO_INT_PROTOTYPE(uint64) -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STR_TO_INT_H_ */ diff 
--git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h index b2fb1e1bf698..064dcd9cce8e 100644 --- a/numpy/core/src/multiarray/textreading/stream.h +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -1,5 +1,5 @@ -#ifndef _STREAM_H_ -#define _STREAM_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ #include @@ -25,4 +25,4 @@ typedef struct _stream { ((s)->stream_nextbuf((s)->stream_data, start, end, kind)) #define stream_close(s) ((s)->stream_close((s))) -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */ diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c index 0e050e90cebb..85f027221bed 100644 --- a/numpy/core/src/multiarray/textreading/stream_pyobject.c +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -125,7 +125,7 @@ fb_del(stream *strm) } -stream * +NPY_NO_EXPORT stream * stream_python_file(PyObject *obj, const char *encoding) { python_chunks_from_file *fb; @@ -229,7 +229,7 @@ it_nextbuf(python_lines_from_iterator *it, char **start, char **end, int *kind) } -stream * +NPY_NO_EXPORT stream * stream_python_iterable(PyObject *obj, const char *encoding) { python_lines_from_iterator *it; diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.h b/numpy/core/src/multiarray/textreading/stream_pyobject.h index 93357e352cb4..45c11dd951a7 100644 --- a/numpy/core/src/multiarray/textreading/stream_pyobject.h +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.h @@ -1,16 +1,16 @@ -#ifndef _STREAM_PYTHON_FILE_BY_LINE -#define _STREAM_PYTHON_FILE_BY_LINE +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ #define PY_SSIZE_T_CLEAN #include #include "textreading/stream.h" -stream * +NPY_NO_EXPORT stream * stream_python_file(PyObject *obj, const char *encoding); -stream * +NPY_NO_EXPORT stream * stream_python_iterable(PyObject *obj, const char *encoding); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_PYOBJECT_H_ */ diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index 68387a022583..dd957b1d5d31 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -305,7 +305,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) * light-weight loops rather than a single heavy one, to allow e.g. quickly * scanning for the end of a field. */ -int +NPY_NO_EXPORT int tokenize(stream *s, tokenizer_state *ts, parser_config *const config) { assert(ts->fields_size >= 2); @@ -414,7 +414,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) } -void +NPY_NO_EXPORT void tokenizer_clear(tokenizer_state *ts) { PyMem_FREE(ts->field_buffer); @@ -432,7 +432,7 @@ tokenizer_clear(tokenizer_state *ts) * variables into the tokenizer. This would improve the cache locality during * tokenizing. 
*/ -int +NPY_NO_EXPORT int tokenizer_init(tokenizer_state *ts, parser_config *config) { /* State and buf_state could be moved into tokenize if we go by row */ diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h index ec25a04282f0..02aa8a8d81a8 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.h +++ b/numpy/core/src/multiarray/textreading/tokenize.h @@ -1,6 +1,6 @@ -#ifndef _TOKENIZE_H_ -#define _TOKENIZE_H_ +#ifndef NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ +#define NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ #include #include "numpy/ndarraytypes.h" @@ -26,7 +26,6 @@ typedef enum { } tokenizer_parsing_state; - typedef struct { size_t offset; bool quoted; @@ -66,14 +65,14 @@ typedef struct { } tokenizer_state; -void +NPY_NO_EXPORT void tokenizer_clear(tokenizer_state *ts); -int +NPY_NO_EXPORT int tokenizer_init(tokenizer_state *ts, parser_config *config); -int +NPY_NO_EXPORT int tokenize(stream *s, tokenizer_state *ts, parser_config *const config); -#endif +#endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_TOKENIZE_H_ */ From 10b04d65bfb403d38f6d5c0de40e26ba1849d10d Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 13:20:22 -0600 Subject: [PATCH 26/70] MAINT: Add sanity check to ensure usecols is correct. --- .../src/multiarray/textreading/readtext.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index b7330d8712c2..fcf5056e2e4a 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -172,6 +172,25 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), return NULL; } } + /* + * TODO: It would be nicer to move usecol parsing to C, but we don't have + * quite the right helper in NumPy yet so using a 1D, 32bit, + * contiguous array. (and ensure this is true) + * NOTE: This should never fail as the public API ensures the conditions + * are met. + */ + if (usecols != Py_None) { + if (!PyArray_CheckExact(usecols) + || PyArray_NDIM((PyArrayObject *)usecols) != 1 + || !PyArray_ISCARRAY((PyArrayObject *)usecols) + || PyArray_DESCR((PyArrayObject *)usecols)->kind != 'i' + || PyArray_DESCR((PyArrayObject *)usecols)->elsize != 4 + || PyArray_ISBYTESWAPPED((PyArrayObject *)usecols)) { + PyErr_SetString(PyExc_RuntimeError, + "Internally a bad value was passed for usecols."); + return NULL; + } + } stream *s; if (filelike) { From 530c954316680f234b6926d0188843bca56de90d Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 11 Jan 2022 10:45:28 -0800 Subject: [PATCH 27/70] Add UserWarning when reading no data. --- numpy/lib/npyio.py | 7 +++++++ numpy/lib/tests/test_io.py | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index b5723dee59e2..3b7b3bb30c87 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1062,6 +1062,13 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) + if arr.shape[0] == 0: + warnings.warn( + f'loadtxt: input contained no data: "{fname}"', + category=UserWarning, + stacklevel=2 + ) + if unpack: # Handle unpack like np.loadtxt. # XXX Check interaction with ndmin! 
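
For reference, a minimal sketch of how the warning added in the hunk above
surfaces to a user (an illustration assuming this patch is applied; the empty
`StringIO` is just a stand-in for any input without data)::

    import warnings
    from io import StringIO
    import numpy as np

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        arr = np.loadtxt(StringIO(""))  # empty input, nothing to parse
    assert arr.shape == (0,)
    assert "input contained no data" in str(caught[0].message)
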
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 7cc1b7ede960..4a5cc1ad8b96 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3242,3 +3242,26 @@ def test_loadtxt_consecutive_quotechar_escaped(): expected = np.array('Hello, my name is "Monty"!', dtype="U40") res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') assert_equal(res, expected) + + +@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) +@pytest.mark.parametrize("ndmin", (0, 1, 2)) +def test_loadtxt_warn_on_no_data(data, ndmin): + """Check that a UserWarning is emitted when no data is read from input.""" + txt = TextIO(data) + with pytest.warns(UserWarning, match="input contained no data"): + np.loadtxt(txt, ndmin=ndmin) + + with NamedTemporaryFile(mode="w") as fh: + fh.write(data) + fh.seek(0) + with pytest.warns(UserWarning, match="input contained no data"): + np.loadtxt(txt, ndmin=ndmin) + + +@pytest.mark.parametrize("skiprows", (2, 3)) +def test_loadtxt_warn_on_skipped_data(skiprows): + data = "1 2 3\n4 5 6" + txt = TextIO(data) + with pytest.warns(UserWarning, match="input contained no data"): + np.loadtxt(txt, skiprows=skiprows) From 3ca9f5a2a252e020a44a355f4fc8114d91ea3423 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 11 Jan 2022 12:05:00 -0800 Subject: [PATCH 28/70] Add warning on empty file + tests. --- numpy/lib/npyio.py | 13 +++++++------ numpy/lib/tests/test_io.py | 6 ++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 3b7b3bb30c87..6e660dce118d 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1062,12 +1062,13 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) - if arr.shape[0] == 0: - warnings.warn( - f'loadtxt: input contained no data: "{fname}"', - category=UserWarning, - stacklevel=2 - ) + if arr.shape: + if arr.shape[0] == 0: + warnings.warn( + f'loadtxt: input contained no data: "{fname}"', + category=UserWarning, + stacklevel=2 + ) if unpack: # Handle unpack like np.loadtxt. diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 4a5cc1ad8b96..b4ca5b74bee6 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -919,8 +919,7 @@ def test_str_dtype(self): assert_array_equal(x, a) def test_empty_file(self): - with suppress_warnings() as sup: - sup.filter(message="loadtxt: Empty input file:") + with pytest.warns(UserWarning, match="input contained no data"): c = TextIO() x = np.loadtxt(c) assert_equal(x.shape, (0,)) @@ -1098,8 +1097,7 @@ def test_ndmin_keyword(self): assert_(x.shape == (3,)) # Test ndmin kw with empty file. 
-        with suppress_warnings() as sup:
-            sup.filter(message="loadtxt: Empty input file:")
+        with pytest.warns(UserWarning, match="input contained no data"):
             f = TextIO()
             assert_(np.loadtxt(f, ndmin=2).shape == (0, 1,))
             assert_(np.loadtxt(f, ndmin=1).shape == (0,))

From e1f7ad16518f95b6c5b560a03375b4329c8136ff Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Tue, 11 Jan 2022 15:46:21 -0600
Subject: [PATCH 29/70] BUG: Fix complex parser and add tests for whitespace
 and failure paths

---
 .../src/multiarray/textreading/conversions.c  | 52 ++++++++++++-------
 numpy/lib/tests/test_io.py                    | 27 ++++++++++
 2 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c
index 04060baa1ea1..8c685ea64f2e 100644
--- a/numpy/core/src/multiarray/textreading/conversions.c
+++ b/numpy/core/src/multiarray/textreading/conversions.c
@@ -46,11 +46,13 @@ to_bool(PyArray_Descr *NPY_UNUSED(descr),
 static NPY_INLINE int
 double_from_ucs4(
         const Py_UCS4 *str, const Py_UCS4 *end,
-        bool skip_trailing_whitespace, double *result, const Py_UCS4 **p_end)
+        bool strip_whitespace, double *result, const Py_UCS4 **p_end)
 {
     /* skip leading whitespace */
-    while (Py_UNICODE_ISSPACE(*str)) {
-        str++;
+    if (strip_whitespace) {
+        while (Py_UNICODE_ISSPACE(*str)) {
+            str++;
+        }
     }
     if (str == end) {
         return -1;  /* empty or only whitespace: not a floating point number */
@@ -69,7 +71,9 @@ double_from_ucs4(
     char *c = ascii;
     for (; str < end; str++, c++) {
         if (NPY_UNLIKELY(*str >= 128)) {
-            break;  /* the following cannot be a number anymore */
+            /* Character cannot be used, ignore for end calculation and stop */
+            end = str;
+            break;
         }
         *c = (char)(*str);
     }
@@ -86,7 +90,7 @@ double_from_ucs4(
         return -1;
     }
 
-    if (skip_trailing_whitespace) {
+    if (strip_whitespace) {
         /* and then skip any remaining whitespace: */
         while (Py_UNICODE_ISSPACE(*end)) {
             end++;
@@ -158,6 +162,10 @@ to_complex_int(
     if (allow_parens && (*item == '(')) {
         unmatched_opening_paren = true;
         ++item;
+        /* Allow whitespace within the parentheses: "( 1j)" */
+        while (Py_UNICODE_ISSPACE(*item)) {
+            ++item;
+        }
     }
     if (double_from_ucs4(item, token_end, false, p_real, &p_end) < 0) {
         return false;
@@ -168,23 +176,15 @@ to_complex_int(
         return !unmatched_opening_paren;
     }
     if (*p_end == imaginary_unit) {
-        // Pure imaginary part only (e.g "1.5j")
+        /* Only an imaginary part (e.g "1.5j") */
         *p_imag = *p_real;
         *p_real = 0.0;
         ++p_end;
-        if (unmatched_opening_paren && (*p_end == ')')) {
-            ++p_end;
-            unmatched_opening_paren = false;
-        }
-    }
-    else if (unmatched_opening_paren && (*p_end == ')')) {
-        *p_imag = 0.0;
-        ++p_end;
-        unmatched_opening_paren = false;
     }
-    else {
+    else if (*p_end == '+' || *p_end == '-') {
+        /* Imaginary part still to parse */
         if (*p_end == '+') {
-            ++p_end;
+            ++p_end;  /* Advance to support +- (and ++) */
         }
         if (double_from_ucs4(p_end, token_end, false, p_imag, &p_end) < 0) {
             return false;
@@ -193,11 +193,25 @@ to_complex_int(
             return false;
         }
         ++p_end;
-        if (unmatched_opening_paren && (*p_end == ')')) {
+    }
+    else {
+        *p_imag = 0;
+    }
+
+    if (unmatched_opening_paren) {
+        /* Allow whitespace inside brackets as in "(1+2j )" or "( 1j )" */
+        while (Py_UNICODE_ISSPACE(*p_end)) {
+            ++p_end;
+        }
+        if (*p_end == ')') {
             ++p_end;
-            unmatched_opening_paren = false;
+        }
+        else {
+            /* parenthesis was not closed */
+            return false;
         }
     }
+
     while (Py_UNICODE_ISSPACE(*p_end)) {
         ++p_end;
     }
diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 
b4ca5b74bee6..5ba852e3ddd1 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3263,3 +3263,30 @@ def test_loadtxt_warn_on_skipped_data(skiprows): txt = TextIO(data) with pytest.warns(UserWarning, match="input contained no data"): np.loadtxt(txt, skiprows=skiprows) + + +@pytest.mark.parametrize("dtype", + np.typecodes["AllInteger"] + "efdFD" + "?") +def test_loadtxt_unicode_whitespace_stripping(dtype): + # Test that all numeric types (and bool) strip whitespace correctly + # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted. + # Currently, skip float128 as it did not always support this and has no + # "custom" parsing: + txt = StringIO(' 3 ,"\u202F2\n"') + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, np.array([3, 2]).astype(dtype)) + +@pytest.mark.parametrize("dtype", "FD") +def test_loadtxt_unicode_whitespace_stripping_complex(dtype): + # Complex has a few extra cases since it has two components and parentheses + line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n" + data = [line, line.replace(" ", "\u202F")] + res = np.loadtxt(data, dtype=dtype, delimiter=',') + assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2)) + +@pytest.mark.parametrize("dtype", "FD") +@pytest.mark.parametrize("field", + ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"]) +def test_loadtxt_bad_complex(dtype, field): + with pytest.raises(ValueError): + np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") From 5692292feed0ed2246c652b57d9c1b24ee59b237 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 16:54:07 -0600 Subject: [PATCH 30/70] BUG,TST: Add test for huge-float buffer path and ensure error return If a memory error happens, we should at least not crash the interpreter --- numpy/core/src/multiarray/textreading/conversions.c | 4 ++++ numpy/lib/tests/test_io.py | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index 8c685ea64f2e..0fb4f05fb670 100644 --- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -66,6 +66,10 @@ double_from_ucs4( size_t str_len = end - str; if (str_len > 128) { heap_buf = PyMem_MALLOC(str_len); + if (heap_buf == NULL) { + PyErr_NoMemory(); + return -1; + } ascii = heap_buf; } char *c = ascii; diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 5ba852e3ddd1..e90971bef855 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -2986,6 +2986,15 @@ def test_loadtxt_read_huge_row(): assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) +@pytest.mark.parametrize("dtype", "edfgFDG") +def test_loadtxt_huge_float(dtype): + # Covers a non-optimized path that is rarely taken: + field = "0" * 1000 + ".123456789" + dtype = np.dtype(dtype) + value = np.loadtxt([field], dtype=dtype)[()] + assert value == dtype.type("0.123456789") + + @pytest.mark.parametrize( ("given_dtype", "expected_dtype"), [ From fac91342789a8e5c8600535e5fc032b6d9285915 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 17:07:36 -0600 Subject: [PATCH 31/70] TST: Add test to cover copyswap (byte-swap and unaligned) --- numpy/lib/tests/test_io.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index e90971bef855..548f5e5b4e99 100644 --- a/numpy/lib/tests/test_io.py +++ 
b/numpy/lib/tests/test_io.py
@@ -3273,6 +3273,19 @@ def test_loadtxt_warn_on_skipped_data(skiprows):
     with pytest.warns(UserWarning, match="input contained no data"):
         np.loadtxt(txt, skiprows=skiprows)
 
+@pytest.mark.parametrize("dtype",
+        np.typecodes["AllInteger"] + np.typecodes["AllFloat"])
+@pytest.mark.parametrize("swap", [True, False])
+def test_loadtxt_byteswapping_and_unaligned(dtype, swap):
+    data = ["x,1\n"]  # no need for complicated data
+    dtype = np.dtype(dtype)
+    if swap:
+        dtype = dtype.newbyteorder()
+    full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False)
+    # The above ensures that the interesting "b" field is unaligned:
+    assert full_dt.fields["b"][1] == 1
+    res = np.loadtxt(data, dtype=full_dt, delimiter=",")
+    assert res["b"] == 1
 
 @pytest.mark.parametrize("dtype",
                          np.typecodes["AllInteger"] + "efdFD" + "?")

From e0e3a72b88d8f52aedb882c75a7df4215a35d466 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Tue, 11 Jan 2022 17:11:20 -0600
Subject: [PATCH 32/70] DOC: See if adding a newline fixes release note
 rendering

---
 doc/release/upcoming_changes/20580.compatibility.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/release/upcoming_changes/20580.compatibility.rst b/doc/release/upcoming_changes/20580.compatibility.rst
index 740764357e70..b8306eae3df7 100644
--- a/doc/release/upcoming_changes/20580.compatibility.rst
+++ b/doc/release/upcoming_changes/20580.compatibility.rst
@@ -15,6 +15,7 @@ If the old behaviour is required, ``itertools.islice`` may be used::
 While generally much faster and improved, `numpy.loadtxt` may now fail to
 convert certain strings to numbers that were previously successfully read.
 The most important cases for this are:
+
 * Parsing floating point values such as ``1.0`` into integers will now fail
 * Parsing hexadecimal floats such as ``0x3p3`` will fail
 * An ``_`` was previously accepted as a thousands delimiter ``100_000``.
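
To illustrate the stricter number parsing summarized in the release note
above, a small Python sketch (inputs are illustrative; per the note, each of
these was previously read successfully and now fails)::

    import numpy as np
    from io import StringIO

    # Floats such as "1.0" no longer coerce silently to integer dtypes:
    np.loadtxt(StringIO("1.0"), dtype=np.int64)        # now raises ValueError

    # Hexadecimal floats and "_" thousands separators are rejected too:
    np.loadtxt(StringIO("0x3p3"), dtype=np.float64)    # now raises ValueError
    np.loadtxt(StringIO("100_000"), dtype=np.float64)  # now raises ValueError
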
From e4d0e60d2b8383fe995c518b3eef0007b3e48ab4 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 18:35:27 -0600 Subject: [PATCH 33/70] BUG: Fix some issues found by a valgrind run --- numpy/core/src/multiarray/textreading/conversions.c | 2 +- numpy/core/src/multiarray/textreading/field_types.c | 1 + numpy/core/src/multiarray/textreading/rows.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index 0fb4f05fb670..6d68e961d696 100644 --- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -63,7 +63,7 @@ double_from_ucs4( char *heap_buf = NULL; char *ascii = stack_buf; - size_t str_len = end - str; + size_t str_len = end - str + 1; if (str_len > 128) { heap_buf = PyMem_MALLOC(str_len); if (heap_buf == NULL) { diff --git a/numpy/core/src/multiarray/textreading/field_types.c b/numpy/core/src/multiarray/textreading/field_types.c index e1dbd2984bfd..0722efd57b00 100644 --- a/numpy/core/src/multiarray/textreading/field_types.c +++ b/numpy/core/src/multiarray/textreading/field_types.c @@ -137,6 +137,7 @@ field_type_grow_recursive(PyArray_Descr *descr, field_types_xclear(num_field_types, *ft); return -1; } + Py_DECREF(tup); num_field_types = field_type_grow_recursive( field_descr, num_field_types, ft, ft_size, field_offset + offset); diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index d3e4e462e538..d5004613c796 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -54,7 +54,7 @@ create_conv_funcs( PyErr_SetString(PyExc_TypeError, "converters must be a dictionary mapping columns to converter " "functions or a single callable."); - return NULL; + goto error; } PyObject *key, *value; From 334470edb1e4e4ea1bc87773ef6d0c6fd510486a Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 11 Jan 2022 21:03:08 -0600 Subject: [PATCH 34/70] BUG: Fix growing when NPY_RELAXED_STRIDES_DEBUG=1 is used --- numpy/core/src/multiarray/textreading/rows.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index d5004613c796..08bc80cc4064 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -269,6 +269,12 @@ read_rows(stream *s, */ data_array = (PyArrayObject *)PyArray_SimpleNewFromDescr( ndim, result_shape, out_descr); +#ifdef NPY_RELAXED_STRIDES_DEBUG + /* Incompatible with NPY_RELAXED_STRIDES_DEBUG due to growing */ + if (result_shape[0] == 1) { + PyArray_STRIDES(data_array)[0] = row_size; + } +#endif /* NPY_RELAXED_STRIDES_DEBUG */ if (data_array == NULL) { goto error; } From e2d9f6b8f34b45657773b42f1c1334e075b443b3 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 12 Jan 2022 22:36:29 -0600 Subject: [PATCH 35/70] MAINT: Move usecol handling to C and support more than integer cols Of course to actually use that many columns you need A LOT of memory right now. Each field stores at least a UCS4 NUL character, but the field is padded enough to require 16 bytes. We always parse a full row, so that requires 20 bytes per field... (i.e. 32 GiB RAM is not enough to test this :)). 
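
As a back-of-envelope check of the memory figure quoted above (a sketch only;
the ~20 bytes per field is the tokenizer cost stated in this message, with 16
bytes being the padded UCS4 storage, not something measured here)::

    # One fully tokenized row of 2**31 fields, i.e. just past the old
    # 32-bit column limit, at ~20 bytes of tokenizer storage per field:
    fields = 2**31
    bytes_per_field = 20
    print(fields * bytes_per_field / 2**30)  # -> 40.0 GiB, so 32 GiB is not enough
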
--- .../src/multiarray/textreading/readtext.c | 80 +++++++++++-------- numpy/core/src/multiarray/textreading/rows.c | 13 +-- numpy/core/src/multiarray/textreading/rows.h | 4 +- numpy/lib/npyio.py | 21 +---- numpy/lib/tests/test_io.py | 15 +++- 5 files changed, 74 insertions(+), 59 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index fcf5056e2e4a..869be4b02624 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -8,6 +8,7 @@ #define _MULTIARRAYMODULE #include "numpy/arrayobject.h" #include "npy_argparse.h" +#include "common.h" #include "conversion_utils.h" #include "textreading/parser_config.h" @@ -34,14 +35,13 @@ // number of columns in the file must match the number of fields in `dtype`. // static PyObject * -_readtext_from_stream(stream *s, parser_config *pc, - PyObject *usecols, Py_ssize_t skiprows, Py_ssize_t max_rows, - PyObject *converters, PyObject *dtype) +_readtext_from_stream(stream *s, + parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[], + Py_ssize_t skiprows, Py_ssize_t max_rows, + PyObject *converters, PyObject *dtype) { PyArrayObject *arr = NULL; PyArray_Descr *out_dtype = NULL; - int32_t *cols; - int ncols; field_type *ft = NULL; /* @@ -52,24 +52,24 @@ _readtext_from_stream(stream *s, parser_config *pc, out_dtype = (PyArray_Descr *)dtype; Py_INCREF(out_dtype); - npy_intp num_fields = field_types_create(out_dtype, &ft); + Py_ssize_t num_fields = field_types_create(out_dtype, &ft); if (num_fields < 0) { goto finish; } bool homogeneous = num_fields == 1 && ft[0].descr == out_dtype; - if (usecols == Py_None) { - ncols = num_fields; - cols = NULL; - } - else { - ncols = PyArray_SIZE((PyArrayObject *)usecols); - cols = PyArray_DATA((PyArrayObject *)usecols); + if (!homogeneous && usecols != NULL && num_usecols != num_fields) { + PyErr_Format(PyExc_TypeError, + "If a structured dtype is used, the number of columns in " + "`usecols` must match the effective number of fields. " + "But %zd usecols were given and the number of fields is %zd.", + num_usecols, num_fields); + goto finish; } arr = read_rows( s, max_rows, num_fields, ft, pc, - ncols, cols, skiprows, converters, + num_usecols, usecols, skiprows, converters, NULL, out_dtype, homogeneous); if (arr == NULL) { goto finish; @@ -107,7 +107,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), PyObject *file; Py_ssize_t skiprows = 0; Py_ssize_t max_rows = -1; - PyObject *usecols = Py_None; + PyObject *usecols_obj = Py_None; PyObject *converters = Py_None; PyObject *dtype = Py_None; @@ -136,7 +136,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), "|comment", &parse_control_character, &pc.comment, "|quote", &parse_control_character, &pc.quote, "|imaginary_unit", &parse_control_character, &pc.imaginary_unit, - "|usecols", NULL, &usecols, + "|usecols", NULL, &usecols_obj, "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows, "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows, "|converters", NULL, &converters, @@ -172,24 +172,38 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), return NULL; } } + /* - * TODO: It would be nicer to move usecol parsing to C, but we don't have - * quite the right helper in NumPy yet so using a 1D, 32bit, - * contiguous array. (and ensure this is true) - * NOTE: This should never fail as the public API ensures the conditions - * are met. + * Parse usecols, the rest of NumPy has no clear helper for this, so do + * it here manually. 
*/ - if (usecols != Py_None) { - if (!PyArray_CheckExact(usecols) - || PyArray_NDIM((PyArrayObject *)usecols) != 1 - || !PyArray_ISCARRAY((PyArrayObject *)usecols) - || PyArray_DESCR((PyArrayObject *)usecols)->kind != 'i' - || PyArray_DESCR((PyArrayObject *)usecols)->elsize != 4 - || PyArray_ISBYTESWAPPED((PyArrayObject *)usecols)) { - PyErr_SetString(PyExc_RuntimeError, - "Internally a bad value was passed for usecols."); + Py_ssize_t num_usecols = -1; + Py_ssize_t *usecols = NULL; + if (usecols_obj != Py_None) { + num_usecols = PySequence_Length(usecols_obj); + if (num_usecols < 0) { return NULL; } + /* Calloc just to not worry about overflow */ + usecols = PyMem_Calloc(num_usecols, sizeof(Py_ssize_t)); + for (Py_ssize_t i = 0; i < num_usecols; i++) { + PyObject *tmp = PySequence_GetItem(usecols_obj, i); + if (tmp == NULL) { + return NULL; + } + usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); + if (error_converting(usecols[i])) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Format(PyExc_TypeError, + "usecols must be an int or a sequence of ints but " + "it contains at least one element of type '%s'", + tmp->ob_type->tp_name); + } + Py_DECREF(tmp); + return NULL; + } + Py_DECREF(tmp); + } } stream *s; @@ -201,12 +215,14 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), } if (s == NULL) { PyErr_Format(PyExc_RuntimeError, "Unable to access the file."); + PyMem_FREE(usecols); return NULL; } - arr = _readtext_from_stream(s, &pc, usecols, skiprows, max_rows, - converters, dtype); + arr = _readtext_from_stream( + s, &pc, num_usecols, usecols, skiprows, max_rows, converters, dtype); stream_close(s); + PyMem_FREE(usecols); return arr; } diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 08bc80cc4064..8c95ba537b02 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -31,7 +31,7 @@ */ static PyObject ** create_conv_funcs( - PyObject *converters, int num_fields, int32_t *usecols) + PyObject *converters, int num_fields, Py_ssize_t *usecols) { PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *)); if (conv_funcs == NULL) { @@ -155,8 +155,8 @@ create_conv_funcs( */ NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, - npy_intp max_rows, int num_field_types, field_type *field_types, - parser_config *pconfig, int num_usecols, int *usecols, + npy_intp max_rows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, Py_ssize_t skiplines, PyObject *converters, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous) @@ -187,6 +187,7 @@ read_rows(stream *s, int actual_num_fields = -1; if (usecols != NULL) { actual_num_fields = num_usecols; + assert(num_field_types == num_usecols); } else if (!homogeneous) { actual_num_fields = num_field_types; @@ -330,9 +331,9 @@ read_rows(stream *s, } } - for (int i = 0; i < actual_num_fields; ++i) { - int f; /* The field, either 0 (if homogeneous) or i. */ - int col; /* The column as read, remapped by usecols */ + for (Py_ssize_t i = 0; i < actual_num_fields; ++i) { + Py_ssize_t f; /* The field, either 0 (if homogeneous) or i. 
*/ + Py_ssize_t col; /* The column as read, remapped by usecols */ char *item_ptr; if (homogeneous) { f = 0; diff --git a/numpy/core/src/multiarray/textreading/rows.h b/numpy/core/src/multiarray/textreading/rows.h index 342af0a4b70d..20eb9e186a19 100644 --- a/numpy/core/src/multiarray/textreading/rows.h +++ b/numpy/core/src/multiarray/textreading/rows.h @@ -13,8 +13,8 @@ NPY_NO_EXPORT PyArrayObject * read_rows(stream *s, - npy_intp nrows, int num_field_types, field_type *field_types, - parser_config *pconfig, int num_usecols, int *usecols, + npy_intp nrows, Py_ssize_t num_field_types, field_type *field_types, + parser_config *pconfig, Py_ssize_t num_usecols, Py_ssize_t *usecols, Py_ssize_t skiplines, PyObject *converters, PyArrayObject *data_array, PyArray_Descr *out_descr, bool homogeneous); diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 6e660dce118d..f15f94580673 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -909,25 +909,12 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', dtype = np.dtype(object) if usecols is not None: - # Allow usecols to be a single int or a sequence of ints + # Allow usecols to be a single int or a sequence of ints, the C-code + # handles the rest try: - usecols_as_list = list(usecols) + usecols = list(usecols) except TypeError: - usecols_as_list = [usecols] - for col_idx in usecols_as_list: - try: - operator.index(col_idx) - except TypeError: - # Some unit tests for numpy.loadtxt require that the - # error message matches this format. - raise TypeError( - "usecols must be an int or a sequence of ints but " - "it contains at least one element of type %s" % - type(col_idx), - ) from None - # Fall back to existing code - usecols = np.array([operator.index(i) for i in usecols_as_list], - dtype=np.int32) + usecols = [usecols] _ensure_ndmin_ndarray_check_param(ndmin) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 548f5e5b4e99..6539a3d36ac1 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -871,16 +871,27 @@ def __index__(self): bogus_idx = 1.5 assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=bogus_idx ) assert_raises_regex( TypeError, - '^usecols must be.*%s' % type(bogus_idx), + '^usecols must be.*%s' % type(bogus_idx).__name__, np.loadtxt, c, usecols=[0, bogus_idx, 0] ) + def test_bad_usecols(self): + with pytest.raises(OverflowError): + np.loadtxt(["1\n"], usecols=[2**64], delimiter=",") + with pytest.raises((ValueError, OverflowError)): + # Overflow error on 32bit platforms + np.loadtxt(["1\n"], usecols=[2**62], delimiter=",") + with pytest.raises(TypeError, + match="If a structured dtype .*. But 1 usecols were given and " + "the number of fields is 3."): + np.loadtxt(["1,1\n"], dtype="i,(2)i", usecols=[0], delimiter=",") + def test_fancy_dtype(self): c = TextIO() c.write('1,2,3.0\n4,5,6.0\n') From c000c1e67477a6bfbc23326ed19af4177f5a80e9 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 13 Jan 2022 14:26:27 -0600 Subject: [PATCH 36/70] BUG: Make sure num-fields is intp/ssize_t compatible In theory (if homogeneous) we actually should support more than 2**31 columns. This should fix that. 
Also cap overallocation scheme, so that we don't waste quite so much
memory in these extreme cases
---
 numpy/core/src/multiarray/textreading/growth.c | 8 ++++++++
 numpy/core/src/multiarray/textreading/rows.c   | 10 +++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/growth.c b/numpy/core/src/multiarray/textreading/growth.c
index 2afd3f82ce98..49a09d57217c 100644
--- a/numpy/core/src/multiarray/textreading/growth.c
+++ b/numpy/core/src/multiarray/textreading/growth.c
@@ -6,6 +6,10 @@
 
 /*
  * Helper function taking the size input and growing it (based on min_grow).
+ * The current scheme is a minimum growth and a general growth by 25%
+ * overallocation. This is then capped at 2**20 elements, as that propels us
+ * in the range of large page sizes (so it is presumably more than enough).
+ *
  * It further multiplies it with `itemsize` and ensures that all results fit
  * into an `npy_intp`.
  * Returns -1 if any overflow occurred or the result would not fit.
@@ -22,6 +26,10 @@ grow_size_and_multiply(npy_intp *size, npy_intp min_grow, npy_intp itemsize) {
         new_size += min_grow;
     }
     else {
+        if (growth > 1 << 20) {
+            /* limit growth to order of MiB (even hugepages are not larger) */
+            growth = 1 << 20;
+        }
         new_size += growth + min_grow - 1;
         new_size &= ~min_grow;
 
diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 8c95ba537b02..37a1bd67f899 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -31,7 +31,7 @@
  */
 static PyObject **
 create_conv_funcs(
-        PyObject *converters, int num_fields, Py_ssize_t *usecols)
+        PyObject *converters, Py_ssize_t num_fields, const Py_ssize_t *usecols)
 {
     PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
     if (conv_funcs == NULL) {
@@ -44,7 +44,7 @@ create_conv_funcs(
     }
     else if (PyCallable_Check(converters)) {
         /* a single converter used for all columns individually */
-        for (int i = 0; i < num_fields; i++) {
+        for (Py_ssize_t i = 0; i < num_fields; i++) {
             Py_INCREF(converters);
             conv_funcs[i] = converters;
         }
@@ -77,7 +77,7 @@ create_conv_funcs(
          * converters does not.  (This is a feature, since it allows
          * us to correctly normalize converters to result column here.)
          */
-        int i = 0;
+        Py_ssize_t i = 0;
        for (; i < num_fields; i++) {
            if (column == usecols[i]) {
                column = i;
@@ -111,7 +111,7 @@ create_conv_funcs(
     return conv_funcs;
 
   error:
-    for (int i = 0; i < num_fields; i++) {
+    for (Py_ssize_t i = 0; i < num_fields; i++) {
         Py_XDECREF(conv_funcs[i]);
     }
     PyMem_FREE(conv_funcs);
@@ -184,7 +184,7 @@ read_rows(stream *s,
     }
 
     /* Set the actual number of fields if it is already known, otherwise -1 */
-    int actual_num_fields = -1;
+    Py_ssize_t actual_num_fields = -1;
     if (usecols != NULL) {
         actual_num_fields = num_usecols;
         assert(num_field_types == num_usecols);

From cc2c5827af19f81f839ba1a2f88ed17a46a54bad Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 13 Jan 2022 16:25:24 -0600
Subject: [PATCH 37/70] BUG: Ensure current num fields holds enough space for
 ultra-wide columns

A bit tricky, the code used to assume we are limited to <32bits due to
the dtype limitations.  But we are not for homogeneous arrays, so let's
try to get it to work.
(Right now it seems to fail, although even if it succeeds it will need
a huge amount of memory)
---
 numpy/core/src/multiarray/textreading/rows.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 37a1bd67f899..0bd4397ada3a 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -162,7 +162,7 @@ read_rows(stream *s,
         bool homogeneous)
 {
     char *data_ptr = NULL;
-    int current_num_fields;
+    Py_ssize_t current_num_fields;
     npy_intp row_size = out_descr->elsize;
     PyObject **conv_funcs = NULL;
 
From da00bf4c398cbcca95ed7fbc69e7f9a7764a856c Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Thu, 13 Jan 2022 17:03:18 -0600
Subject: [PATCH 38/70] ENH: Give warning for empty-lines not counting towards
 max-rows

This also slightly cleans up the empty-line handling: previously we
sometimes had extra empty lines that effectively never mattered.

---
 numpy/core/src/multiarray/textreading/rows.c  | 28 +++++++++++++++++--
 .../core/src/multiarray/textreading/stream.h  |  3 ++
 .../src/multiarray/textreading/tokenize.c.src | 19 ++++++-------
 numpy/lib/tests/test_io.py                    | 24 ++++++++++++++++
 4 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 0bd4397ada3a..f7eed1855314 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -176,6 +176,8 @@ read_rows(stream *s,
     Py_XINCREF(data_array);
     size_t rows_per_block = 1;  /* will be increased depending on row size */
     npy_intp data_allocated_rows = 0;
+    /* We give a warning if max_rows is used and an empty line is encountered */
+    bool give_empty_row_warning = max_rows >= 0;
 
     int ts_result = 0;
     tokenizer_state ts;
@@ -193,7 +195,7 @@ read_rows(stream *s,
         actual_num_fields = num_field_types;
     }
 
-    for (; skiplines > 0; skiplines--) {
+    for (Py_ssize_t i = 0; i < skiplines; i++) {
         ts.state = TOKENIZE_GOTO_LINE_END;
         ts_result = tokenize(s, &ts, pconfig);
         if (ts_result < 0) {
@@ -213,7 +215,29 @@ read_rows(stream *s,
         }
         current_num_fields = ts.num_fields;
         field_info *fields = ts.fields;
-        if (ts.num_fields == 0) {
+        if (NPY_UNLIKELY(ts.num_fields == 0)) {
+            /*
+             * Deprecated NumPy 1.23, 2022-01-13 (not really a deprecation,
+             * but similar policy should apply to removing the warning again)
+             */
+            /* Tokenizer may give a final "empty line" even if there is none */
+            if (give_empty_row_warning && ts_result == 0) {
+                give_empty_row_warning = false;
+                if (PyErr_WarnFormat(PyExc_UserWarning, 3,
+                        "Input line %zd contained no data and will not be "
+                        "counted towards `max_rows=%zd`. This differs from "
+                        "the behaviour in NumPy <=1.22 which counted lines "
+                        "rather than rows. If desired, the previous behaviour "
+                        "can be achieved by using `itertools.islice`.\n"
+                        "Please see the 1.23 release notes for an example on "
+                        "how to do this. If you wish to ignore this warning, "
+                        "use `warnings.filterwarnings`. 
This warning is " + "expected to be removed in the future and is given " + "only once per `loadtxt` call.", + row_count + skiplines + 1, max_rows) < 0) { + goto error; + } + } continue; /* Ignore empty line */ } diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h index 064dcd9cce8e..d38abdd52948 100644 --- a/numpy/core/src/multiarray/textreading/stream.h +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -7,6 +7,9 @@ * When getting the next line, we hope that the buffer provider can already * give some information about the newlines, because for Python iterables * we definitely expect to get line-by-line buffers. + * + * BUFFER_IS_FILEEND must be returned when the end of the file is reached and + * must NOT be returned together with a valid (non-empty) buffer. */ #define BUFFER_MAY_CONTAIN_NEWLINE 0 #define BUFFER_IS_LINEND 1 diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index dd957b1d5d31..f5db64af6d1b 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -254,8 +254,6 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) if (ts->buf_state != BUFFER_MAY_CONTAIN_NEWLINE) { pos = stop; /* advance to next buffer */ ts->state = TOKENIZE_LINE_END; - /* Ensure we don't think we have an empty line left to parse: */ - ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; break; } for (; pos < stop; pos++) { @@ -339,10 +337,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) * a quoted field the and '\n' character is not included * in the string. `FileLike.readline()` does ensure it * is included. - * - * Ensure we don't think we have an empty line left to parse: */ - ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; goto finish; } /* fetch new data */ @@ -388,11 +383,15 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) } finish: - if (NPY_UNLIKELY(ts->pos != ts->end && ts->buf_state == BUFFER_IS_LINEND)) { - PyErr_SetString(PyExc_ValueError, - "Found an unquoted embedded newline within a single line of " - "input. This is currently not supported."); - return -1; + if (ts->buf_state == BUFFER_IS_LINEND) { + /* This line is "finished", make sure we don't touch it again: */ + ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; + if (NPY_UNLIKELY(ts->pos < ts->end)) { + PyErr_SetString(PyExc_ValueError, + "Found an unquoted embedded newline within a single line of " + "input. 
This is currently not supported."); + return -1; + } } /* Finish the last field */ diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 6539a3d36ac1..fd27fd6716e7 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -1204,6 +1204,30 @@ def test_max_rows_larger(self): a = np.array([[1, 2, 3, 5], [4, 5, 7, 8], [2, 1, 4, 5]], int) assert_array_equal(x, a) + @pytest.mark.parametrize(["skip", "data"], [ + (1, ["ignored\n", "1,2\n", "\n", "3,4\n"]), + # "Bad" lines that do not end in newlines: + (1, ["ignored", "1,2", "", "3,4"]), + (1, StringIO("ignored\n1,2\n\n3,4")), + # Same as above, but do not skip any lines: + (0, ["-1,0\n", "1,2\n", "\n", "3,4\n"]), + (0, ["-1,0", "1,2", "", "3,4"]), + (0, StringIO("-1,0\n1,2\n\n3,4")),]) + def test_max_rows_empty_lines(self, skip, data): + with pytest.warns(UserWarning, + match=f"Input line 3.*max_rows={3-skip}"): + res = np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",", + max_rows=3-skip) + assert_array_equal(res, [[-1, 0], [1, 2], [3, 4]][skip:]) + + if isinstance(data, StringIO): + data.seek(0) + + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + with pytest.raises(UserWarning): + np.loadtxt(data, dtype=int, skiprows=skip, delimiter=",", + max_rows=3-skip) class Testfromregex: def test_record(self): From 08fa5ce7b6fb4d4a01923931f88d78592fc69165 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 13 Jan 2022 18:59:59 -0600 Subject: [PATCH 39/70] MAINT: Small cleanup, use FINLINE for int parsers It seems the boolean converter using the integer parser caused it to not always be inlined as desired, so just use FINLINE. --- numpy/core/src/multiarray/textreading/str_to_int.c | 3 --- numpy/core/src/multiarray/textreading/str_to_int.h | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c index 2efb431db197..f7e09574c8d1 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.c +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -7,9 +7,6 @@ #include "textreading/parser_config.h" -NPY_NO_EXPORT PyArray_Descr *double_descr = NULL; - - #define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ NPY_NO_EXPORT int \ to_##intw(PyArray_Descr *descr, \ diff --git a/numpy/core/src/multiarray/textreading/str_to_int.h b/numpy/core/src/multiarray/textreading/str_to_int.h index e310e1ca2c18..a0a89a0ef7de 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.h +++ b/numpy/core/src/multiarray/textreading/str_to_int.h @@ -16,7 +16,7 @@ * * The actual functions are defined using macro templating below. 
*/ -static NPY_INLINE int +NPY_FINLINE int str_to_int64( const Py_UCS4 *p_item, const Py_UCS4 *p_end, int64_t int_min, int64_t int_max, int64_t *result) @@ -96,7 +96,7 @@ str_to_int64( } -static NPY_INLINE int +NPY_FINLINE int str_to_uint64( const Py_UCS4 *p_item, const Py_UCS4 *p_end, uint64_t uint_max, uint64_t *result) From 73940d6b08a5b3690799bfdd68bb514ee1445b16 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 13 Jan 2022 20:06:16 -0600 Subject: [PATCH 40/70] MAINT,TST,BUG: Simplify streamer init, fix issues, and add tests --- .../src/multiarray/textreading/readtext.c | 1 - .../core/src/multiarray/textreading/stream.h | 14 +++- .../multiarray/textreading/stream_pyobject.c | 73 ++++++------------- numpy/lib/tests/test_io.py | 45 ++++++++++++ 4 files changed, 79 insertions(+), 54 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 869be4b02624..151bf894c754 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -214,7 +214,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), s = stream_python_iterable(file, encoding); } if (s == NULL) { - PyErr_Format(PyExc_RuntimeError, "Unable to access the file."); PyMem_FREE(usecols); return NULL; } diff --git a/numpy/core/src/multiarray/textreading/stream.h b/numpy/core/src/multiarray/textreading/stream.h index d38abdd52948..59bd1407466b 100644 --- a/numpy/core/src/multiarray/textreading/stream.h +++ b/numpy/core/src/multiarray/textreading/stream.h @@ -15,8 +15,18 @@ #define BUFFER_IS_LINEND 1 #define BUFFER_IS_FILEEND 2 +/* + * Base struct for streams. We currently have two, a chunked reader for + * filelikes and a line-by-line for any iterable. + * As of writing, the chunked reader was only used for filelikes not already + * opened. That is to preserve the amount read in case of an error exactly. + * If we drop this, we could read it more often (but not when `max_rows` is + * used). + * + * The "streams" can extend this struct to store their own data (so it is + * a very lightweight "object"). + */ typedef struct _stream { - void *stream_data; int (*stream_nextbuf)(void *sdata, char **start, char **end, int *kind); // Note that the first argument to stream_close is the stream pointer // itself, not the stream_data pointer. @@ -25,7 +35,7 @@ typedef struct _stream { #define stream_nextbuf(s, start, end, kind) \ - ((s)->stream_nextbuf((s)->stream_data, start, end, kind)) + ((s)->stream_nextbuf((s), start, end, kind)) #define stream_close(s) ((s)->stream_close((s))) #endif /* NUMPY_CORE_SRC_MULTIARRAY_TEXTREADING_STREAM_H_ */ diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c index 85f027221bed..1c07395407af 100644 --- a/numpy/core/src/multiarray/textreading/stream_pyobject.c +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -19,6 +19,7 @@ typedef struct { + stream stream; /* The Python file object being read. 
*/ PyObject *file; @@ -111,14 +112,13 @@ fb_nextbuf(python_chunks_from_file *fb, char **start, char **end, int *kind) static int fb_del(stream *strm) { - python_chunks_from_file *fb = (python_chunks_from_file *)strm->stream_data; + python_chunks_from_file *fb = (python_chunks_from_file *)strm; Py_XDECREF(fb->file); Py_XDECREF(fb->read); Py_XDECREF(fb->chunksize); Py_XDECREF(fb->chunk); - PyMem_FREE(fb); PyMem_FREE(strm); return 0; @@ -129,29 +129,19 @@ NPY_NO_EXPORT stream * stream_python_file(PyObject *obj, const char *encoding) { python_chunks_from_file *fb; - stream *strm; - fb = (python_chunks_from_file *)PyMem_MALLOC(sizeof(python_chunks_from_file)); + fb = (python_chunks_from_file *)PyMem_Calloc(1, sizeof(python_chunks_from_file)); if (fb == NULL) { PyErr_NoMemory(); return NULL; } - fb->file = NULL; - fb->read = NULL; - fb->chunksize = NULL; - fb->chunk = NULL; - fb->encoding = encoding; - - strm = (stream *)PyMem_MALLOC(sizeof(stream)); - if (strm == NULL) { - PyErr_NoMemory(); - PyMem_FREE(fb); - return NULL; - } + fb->stream.stream_nextbuf = (void *)&fb_nextbuf; + fb->stream.stream_close = &fb_del; + fb->encoding = encoding; + Py_INCREF(obj); fb->file = obj; - Py_INCREF(fb->file); fb->read = PyObject_GetAttrString(obj, "read"); if (fb->read == NULL) { @@ -162,14 +152,10 @@ stream_python_file(PyObject *obj, const char *encoding) goto fail; } - strm->stream_data = (void *)fb; - strm->stream_nextbuf = (void *)&fb_nextbuf; - strm->stream_close = &fb_del; - - return strm; + return (stream *)fb; fail: - fb_del(strm); + fb_del((stream *)fb); return NULL; } @@ -178,6 +164,7 @@ stream_python_file(PyObject *obj, const char *encoding) * Stream from a Python iterable by interpreting each item as a line in a file */ typedef struct { + stream stream; /* The Python file object being read. 
*/ PyObject *iterator; @@ -192,14 +179,12 @@ typedef struct { static int it_del(stream *strm) { - python_lines_from_iterator *it = (python_lines_from_iterator *)strm->stream_data; + python_lines_from_iterator *it = (python_lines_from_iterator *)strm; Py_XDECREF(it->iterator); Py_XDECREF(it->line); - PyMem_FREE(it); PyMem_FREE(strm); - return 0; } @@ -233,39 +218,25 @@ NPY_NO_EXPORT stream * stream_python_iterable(PyObject *obj, const char *encoding) { python_lines_from_iterator *it; - stream *strm; - it = (python_lines_from_iterator *)PyMem_MALLOC(sizeof(*it)); - if (it == NULL) { - PyErr_NoMemory(); + if (!PyIter_Check(obj)) { + PyErr_SetString(PyExc_TypeError, + "error reading from object, expected an iterable."); return NULL; } - it->iterator = NULL; - it->line = NULL; - it->encoding = encoding; - - strm = (stream *)PyMem_MALLOC(sizeof(stream)); - if (strm == NULL) { + it = (python_lines_from_iterator *)PyMem_Calloc(1, sizeof(*it)); + if (it == NULL) { PyErr_NoMemory(); - PyMem_FREE(it); return NULL; } - if (!PyIter_Check(obj)) { - PyErr_SetString(PyExc_TypeError, - "error reading from object, expected an iterable."); - goto fail; - } - Py_INCREF(obj); - it->iterator = obj; - strm->stream_data = (void *)it; - strm->stream_nextbuf = (void *)&it_nextbuf; - strm->stream_close = &it_del; + it->stream.stream_nextbuf = (void *)&it_nextbuf; + it->stream.stream_close = &it_del; - return strm; + it->encoding = encoding; + Py_INCREF(obj); + it->iterator = obj; -fail: - it_del(strm); - return NULL; + return (stream *)it; } diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index fd27fd6716e7..7ec44045f5ba 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3347,3 +3347,48 @@ def test_loadtxt_unicode_whitespace_stripping_complex(dtype): def test_loadtxt_bad_complex(dtype, field): with pytest.raises(ValueError): np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") + + +def test_loadtxt_iterator_fails_getting_next_line(): + class BadSequence: + def __len__(self): + return 100 + + def __getitem__(self, item): + if item == 50: + raise RuntimeError("Bad things happened!") + return f"{item}, {item+1}" + + with pytest.raises(RuntimeError, match="Bad things happened!"): + np.loadtxt(BadSequence(), dtype=int, delimiter=",") + + +class TestCReaderUnitTests: + # These are internal tests for path that should not be possible to hit + # unless things go very very wrong somewhere. + def test_not_an_filelike(self): + with pytest.raises(AttributeError, match=".*read"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=True) + + def test_filelike_read_fails(self): + # Can only be reached if loadtxt opens the file, so it is hard to do + # via the public interface (although maybe not impossible considering + # the current "DataClass" backing). 
+ class BadFileLike: + counter = 0 + def read(self, size): + self.counter += 1 + if self.counter > 20: + raise RuntimeError("Bad bad bad!") + return "1,2,3\n" + + with pytest.raises(RuntimeError, match="Bad bad bad!"): + np.core._multiarray_umath._load_from_filelike( + BadFileLike(), dtype=np.dtype("i"), filelike=True) + + def test_not_an_iter(self): + with pytest.raises(TypeError, + match="error reading from object, expected an iterable"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=False) From d2473c052eb70849575246f99fde1a055087fb61 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 13 Jan 2022 20:30:33 -0600 Subject: [PATCH 41/70] TST,BUG: Additional bad-file-like test, add missing error path free And remove one silly leftover struct member that was unused --- numpy/core/src/multiarray/textreading/readtext.c | 4 +++- .../src/multiarray/textreading/stream_pyobject.c | 3 --- numpy/lib/tests/test_io.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 151bf894c754..678b3be7c9d2 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -189,6 +189,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), for (Py_ssize_t i = 0; i < num_usecols; i++) { PyObject *tmp = PySequence_GetItem(usecols_obj, i); if (tmp == NULL) { + PyMem_FREE(usecols); return NULL; } usecols[i] = PyNumber_AsSsize_t(tmp, PyExc_OverflowError); @@ -197,9 +198,10 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), PyErr_Format(PyExc_TypeError, "usecols must be an int or a sequence of ints but " "it contains at least one element of type '%s'", - tmp->ob_type->tp_name); + Py_TYPE(tmp)->tp_name); } Py_DECREF(tmp); + PyMem_FREE(usecols); return NULL; } Py_DECREF(tmp); diff --git a/numpy/core/src/multiarray/textreading/stream_pyobject.c b/numpy/core/src/multiarray/textreading/stream_pyobject.c index 1c07395407af..6f84ff01ddbb 100644 --- a/numpy/core/src/multiarray/textreading/stream_pyobject.c +++ b/numpy/core/src/multiarray/textreading/stream_pyobject.c @@ -28,9 +28,6 @@ typedef struct { /* Amount to read each time we call `obj.read()` */ PyObject *chunksize; - /* file position when the file_buffer was created. */ - off_t initial_file_pos; - /* Python str object holding the line most recently read from the file. */ PyObject *chunk; diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 7ec44045f5ba..cf55c97bf4bc 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3387,6 +3387,20 @@ def read(self, size): np.core._multiarray_umath._load_from_filelike( BadFileLike(), dtype=np.dtype("i"), filelike=True) + def test_filelike_bad_read(self): + # Can only be reached if loadtxt opens the file, so it is hard to do + # via the public interface (although maybe not impossible considering + # the current "DataClass" backing). + class BadFileLike: + counter = 0 + def read(self, size): + return 1234 # not a string! 
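(The non-string check being set up here also guards the public iterable path, where every item the reader pulls must be a string. A minimal sketch of the user-visible behavior, assuming only NumPy with this reader and pytest:

    import numpy as np
    import pytest

    def gen():
        yield "1,2"   # fine: a text line
        yield 1234    # not a string: rejected by the reader

    with pytest.raises(TypeError,
                       match="non-string returned while reading data"):
        np.loadtxt(gen(), dtype="i,i", delimiter=",")
)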
+ + with pytest.raises(TypeError, + match="non-string returned while reading data"): + np.core._multiarray_umath._load_from_filelike( + BadFileLike(), dtype=np.dtype("i"), filelike=True) + def test_not_an_iter(self): with pytest.raises(TypeError, match="error reading from object, expected an iterable"): From 245af22c0c0459664da2f30502025abf16c3ceca Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 13 Jan 2022 22:44:21 -0600 Subject: [PATCH 42/70] TST,MAINT: New tests, byteswap cleanups and fixed assert --- .../src/multiarray/textreading/conversions.c | 22 +++++++++---- numpy/core/src/multiarray/textreading/rows.c | 4 ++- .../src/multiarray/textreading/str_to_int.c | 31 +++++++++++-------- numpy/lib/tests/test_io.py | 31 +++++++++++++++---- 4 files changed, 62 insertions(+), 26 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index 6d68e961d696..2570e643d899 100644 --- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -5,6 +5,10 @@ #include #include +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "lowlevel_strided_loops.h" + #include "conversions.h" #include "str_to_int.h" @@ -122,7 +126,7 @@ to_float(PyArray_Descr *descr, float val = (float)double_val; memcpy(dataptr, &val, sizeof(float)); if (!PyArray_ISNBO(descr->byteorder)) { - descr->f->copyswap(dataptr, dataptr, 1, NULL); + npy_bswap4_unaligned(dataptr); } return 0; } @@ -144,7 +148,7 @@ to_double(PyArray_Descr *descr, memcpy(dataptr, &val, sizeof(double)); if (!PyArray_ISNBO(descr->byteorder)) { - descr->f->copyswap(dataptr, dataptr, 1, NULL); + npy_bswap8_unaligned(dataptr); } return 0; } @@ -241,7 +245,8 @@ to_cfloat(PyArray_Descr *descr, npy_complex64 val = {(float)real, (float)imag}; memcpy(dataptr, &val, sizeof(npy_complex64)); if (!PyArray_ISNBO(descr->byteorder)) { - descr->f->copyswap(dataptr, dataptr, 1, NULL); + npy_bswap4_unaligned(dataptr); + npy_bswap4_unaligned(dataptr + 4); } return 0; } @@ -264,7 +269,8 @@ to_cdouble(PyArray_Descr *descr, npy_complex128 val = {real, imag}; memcpy(dataptr, &val, sizeof(npy_complex128)); if (!PyArray_ISNBO(descr->byteorder)) { - descr->f->copyswap(dataptr, dataptr, 1, NULL); + npy_bswap8_unaligned(dataptr); + npy_bswap8_unaligned(dataptr + 8); } return 0; } @@ -319,7 +325,11 @@ to_unicode(PyArray_Descr *descr, } if (!PyArray_ISNBO(descr->byteorder)) { - descr->f->copyswap(dataptr, dataptr, 1, NULL); + /* manual byteswap, unicode requires the array to be passed... 
*/ + for (int i = 0; i < descr->elsize; i++) { + npy_bswap4_unaligned(dataptr); + dataptr += 4; + } } return 0; } @@ -383,4 +393,4 @@ to_generic(PyArray_Descr *descr, parser_config *config) { return to_generic_with_converter(descr, str, end, dataptr, config, NULL); -} \ No newline at end of file +} diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index f7eed1855314..7681acae3025 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -176,6 +176,7 @@ read_rows(stream *s, Py_XINCREF(data_array); size_t rows_per_block = 1; /* will be increased depending on row size */ npy_intp data_allocated_rows = 0; + /* We give a warning if max_rows is used and an empty line is encountered */ bool give_empty_row_warning = max_rows >= 0; @@ -188,10 +189,11 @@ read_rows(stream *s, /* Set the actual number of fields if it is already known, otherwise -1 */ Py_ssize_t actual_num_fields = -1; if (usecols != NULL) { + assert(homogeneous || num_field_types == num_usecols); actual_num_fields = num_usecols; - assert(num_field_types == num_usecols); } else if (!homogeneous) { + assert(usecols == NULL || num_field_types == num_usecols); actual_num_fields = num_field_types; } diff --git a/numpy/core/src/multiarray/textreading/str_to_int.c b/numpy/core/src/multiarray/textreading/str_to_int.c index f7e09574c8d1..11b03e31c1a7 100644 --- a/numpy/core/src/multiarray/textreading/str_to_int.c +++ b/numpy/core/src/multiarray/textreading/str_to_int.c @@ -1,13 +1,16 @@ #include +#define NPY_NO_DEPRECATED_API NPY_API_VERSION +#define _MULTIARRAYMODULE +#include "lowlevel_strided_loops.h" + #include #include "textreading/str_to_int.h" -#include "textreading/conversions.h" #include "textreading/parser_config.h" -#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX) \ +#define DECLARE_TO_INT(intw, INT_MIN, INT_MAX, byteswap_unaligned) \ NPY_NO_EXPORT int \ to_##intw(PyArray_Descr *descr, \ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ @@ -24,12 +27,12 @@ } \ memcpy(dataptr, &x, sizeof(x)); \ if (!PyArray_ISNBO(descr->byteorder)) { \ - descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + byteswap_unaligned(dataptr); \ } \ return 0; \ } -#define DECLARE_TO_UINT(uintw, UINT_MAX) \ +#define DECLARE_TO_UINT(uintw, UINT_MAX, byteswap_unaligned) \ NPY_NO_EXPORT int \ to_##uintw(PyArray_Descr *descr, \ const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, \ @@ -46,17 +49,19 @@ } \ memcpy(dataptr, &x, sizeof(x)); \ if (!PyArray_ISNBO(descr->byteorder)) { \ - descr->f->copyswap(dataptr, dataptr, 1, NULL); \ + byteswap_unaligned(dataptr); \ } \ return 0; \ } -DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX) -DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX) -DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX) -DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX) +#define byteswap_nothing(ptr) + +DECLARE_TO_INT(int8, INT8_MIN, INT8_MAX, byteswap_nothing) +DECLARE_TO_INT(int16, INT16_MIN, INT16_MAX, npy_bswap2_unaligned) +DECLARE_TO_INT(int32, INT32_MIN, INT32_MAX, npy_bswap4_unaligned) +DECLARE_TO_INT(int64, INT64_MIN, INT64_MAX, npy_bswap8_unaligned) -DECLARE_TO_UINT(uint8, UINT8_MAX) -DECLARE_TO_UINT(uint16, UINT16_MAX) -DECLARE_TO_UINT(uint32, UINT32_MAX) -DECLARE_TO_UINT(uint64, UINT64_MAX) +DECLARE_TO_UINT(uint8, UINT8_MAX, byteswap_nothing) +DECLARE_TO_UINT(uint16, UINT16_MAX, npy_bswap2_unaligned) +DECLARE_TO_UINT(uint32, UINT32_MAX, npy_bswap4_unaligned) +DECLARE_TO_UINT(uint64, UINT64_MAX, npy_bswap8_unaligned) diff --git a/numpy/lib/tests/test_io.py 
b/numpy/lib/tests/test_io.py index cf55c97bf4bc..a28e24d96bbf 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3288,18 +3288,27 @@ def test_loadtxt_consecutive_quotechar_escaped(): @pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) @pytest.mark.parametrize("ndmin", (0, 1, 2)) -def test_loadtxt_warn_on_no_data(data, ndmin): +@pytest.mark.parametrize("usecols", [None, (1, 2, 3)]) +def test_loadtxt_warn_on_no_data(data, ndmin, usecols): """Check that a UserWarning is emitted when no data is read from input.""" + if usecols is not None: + expected_shape = (0, 3) + elif ndmin == 2: + expected_shape = (0, 1) # guess a single column?! + else: + expected_shape = (0,) + txt = TextIO(data) with pytest.warns(UserWarning, match="input contained no data"): - np.loadtxt(txt, ndmin=ndmin) + res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) + assert res.shape == expected_shape with NamedTemporaryFile(mode="w") as fh: fh.write(data) fh.seek(0) with pytest.warns(UserWarning, match="input contained no data"): - np.loadtxt(txt, ndmin=ndmin) - + res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) + assert res.shape == expected_shape @pytest.mark.parametrize("skiprows", (2, 3)) def test_loadtxt_warn_on_skipped_data(skiprows): @@ -3309,7 +3318,7 @@ def test_loadtxt_warn_on_skipped_data(skiprows): np.loadtxt(txt, skiprows=skiprows) @pytest.mark.parametrize("dtype", - np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"]) @pytest.mark.parametrize("swap", [True, False]) def test_loadtxt_byteswapping_and_unaligned(dtype, swap): data = ["x,1\n"] # no need for complicated data @@ -3320,7 +3329,7 @@ def test_loadtxt_byteswapping_and_unaligned(dtype, swap): # The above ensures that the interesting "b" field is unaligned: assert full_dt.fields["b"][1] == 1 res = np.loadtxt(data, dtype=full_dt, delimiter=",") - assert res["b"] == 1 + assert res["b"] == dtype.type(1) @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"] + "efdFD" + "?") @@ -3406,3 +3415,13 @@ def test_not_an_iter(self): match="error reading from object, expected an iterable"): np.core._multiarray_umath._load_from_filelike( object(), dtype=np.dtype("i"), filelike=False) + + def test_bad_type(self): + with pytest.raises(TypeError, match="internal error: dtype must"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype="i", filelike=False) + + def test_bad_encoding(self): + with pytest.raises(TypeError, match="encoding must be a unicode"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=False, encoding=123) From cc67c19185ac21aecea7abf0c7e820b472e95b25 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 09:38:42 -0600 Subject: [PATCH 43/70] TST: Improve test coverage, replace impossible error with assert --- .../src/multiarray/textreading/tokenize.c.src | 11 ++--- numpy/lib/tests/test_io.py | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index f5db64af6d1b..75d0d673317b 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -348,17 +348,12 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) } if (ts->buf_state == BUFFER_IS_FILEEND) { finished_reading_file = 1; - ts->pos = ts->end; /* should be guaranteed, but make sure. 
*/ + ts->pos = ts->end; /* stream should ensure this. */ goto finish; } else if (ts->pos == ts->end) { - if (ts->buf_state != BUFFER_IS_LINEND) { - PyErr_SetString(PyExc_RuntimeError, - "Reader returned an empty buffer, " - "but did not indicate file or line end."); - return -1; - } - /* Otherwise, we are OK with this and assume an empty line. */ + /* This must be an empty line (and it must be indicated!). */ + assert(ts->buf_state == BUFFER_IS_LINEND); goto finish; } } diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index a28e24d96bbf..7f3f3e1f2e3d 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3081,6 +3081,21 @@ def test_loadtxt_bool(): assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_loadtxt_integer_signs(dtype): + dtype = np.dtype(dtype) + assert np.loadtxt(["+2"], dtype=dtype) == 2 + if dtype.kind == "u": + with pytest.raises(ValueError): + np.loadtxt(["-1\n"], dtype=dtype) + else: + assert np.loadtxt(["-2\n"], dtype=dtype) == -2 + + for sign in ["++", "+-", "--", "-+"]: + with pytest.raises(ValueError): + np.loadtxt([f"{sign}2\n"], dtype=dtype) + + @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) def test_loadtxt_implicit_cast_float_to_int_fails(dtype): txt = TextIO("1.0, 2.1, 3.7\n4, 5, 6") @@ -3358,6 +3373,40 @@ def test_loadtxt_bad_complex(dtype, field): np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") +@pytest.mark.parametrize("data", [ + ["1,2\n", "2\n,3\n"], + ["1,2\n", "2\r,3\n"]]) +def test_loadtxt_bad_newline_in_iterator(data): + # In NumPy <=1.22 this was accepted, because newlines were completely + # ignored when the input was an iterable. This could be changed, but right + # now, we raise an error. + with pytest.raises(ValueError, + match="Found an unquoted embedded newline within a single line"): + np.loadtxt(data, delimiter=",") + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2,3\r\n"], # a universal newline + ["1,2\n", "'2\n',3\n"], # a quoted newline + ["1,2\n", "'2\r',3\n"], + ["1,2\n", "'2\r\n',3\n"], +]) +def test_loadtxt_good_newline_in_iterator(data): + # The quoted newlines will be untransformed here, but are just whitespace. + res = np.loadtxt(data, delimiter=",", quotechar="'") + assert_array_equal(res, [[1., 2.], [2., 3.]]) + + +@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"]) +def test_unviersal_newlines_quoted(newline): + # Check that universal newline support within the tokenizer is not applied + # to quoted fields. (note that lines must end in newline or quoted + # fields will not include a newline at all) + data = ['1,"2\n"\n', '3,"4\n', '1"\n'] + data = [row.replace("\n", newline) for row in data] + res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"') + assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']]) + + def test_loadtxt_iterator_fails_getting_next_line(): class BadSequence: def __len__(self): From 6e67e17475004035d76f8b51c315bedd1cb2809f Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 10:26:03 -0600 Subject: [PATCH 44/70] MAINT: Remove unused/unnecessary allow-embedded-newlines option No other parser has this option, so I think we really do not need it. 
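With the flag gone, embedded newlines in quoted fields are simply always supported, which is what the tests added above already assume. A short sketch of the retained behavior, using only the public API:

    import numpy as np

    # The quoted "2\n" field spans a line break; quoted whitespace is
    # harmless for numeric fields, so this parses as [[1, 2], [2, 3]]:
    data = ["1,2\n", "'2\n',3\n"]
    res = np.loadtxt(data, delimiter=",", quotechar="'")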
--- .../src/multiarray/textreading/parser_config.h | 10 ---------- numpy/core/src/multiarray/textreading/readtext.c | 1 - .../src/multiarray/textreading/tokenize.c.src | 15 +-------------- 3 files changed, 1 insertion(+), 25 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/parser_config.h b/numpy/core/src/multiarray/textreading/parser_config.h index b6e7feec0666..00e911667f05 100644 --- a/numpy/core/src/multiarray/textreading/parser_config.h +++ b/numpy/core/src/multiarray/textreading/parser_config.h @@ -40,16 +40,6 @@ typedef struct { */ bool delimiter_is_whitespace; - /* - * A boolean value (0 or 1). If 1, quoted fields may span - * more than one line. For example, the following - * 100, 200, "FOO - * BAR" - * is one "row", containing three fields: 100, 200 and "FOO\nBAR". - * If 0, the parser considers an unclosed quote to be an error. (XXX Check!) - */ - bool allow_embedded_newline; - /* * The imaginary unit character. Default is `j`. */ diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 678b3be7c9d2..c1b174c99c56 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -119,7 +119,6 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod), .comment = '#', .quote = '"', .imaginary_unit = 'j', - .allow_embedded_newline = true, .delimiter_is_whitespace = false, .ignore_leading_whitespace = false, .python_byte_converters = false, diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index 75d0d673317b..68dd2ce933c1 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -210,20 +210,7 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) case TOKENIZE_QUOTED: chunk_start = pos; for (; pos < stop; pos++) { - if (!config->allow_embedded_newline) { - if (*pos == '\r') { - ts->state = TOKENIZE_EAT_CRLF; - break; - } - else if (*pos == '\n') { - ts->state = TOKENIZE_LINE_END; - break; - } - } - else if (*pos != config->quote) { - /* inside the field, nothing to do. */ - } - else { + if (*pos == config->quote) { ts->state = TOKENIZE_QUOTED_CHECK_DOUBLE_QUOTE; break; } From d58d3612cb3417c3516992b45ac04f7cbf1209a0 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 11:55:45 -0600 Subject: [PATCH 45/70] TST: Add test for hard/impossible to reach universal-newline support paths --- numpy/lib/tests/test_io.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 7f3f3e1f2e3d..5f66e0b6aa47 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -3474,3 +3474,18 @@ def test_bad_encoding(self): with pytest.raises(TypeError, match="encoding must be a unicode"): np.core._multiarray_umath._load_from_filelike( object(), dtype=np.dtype("i"), filelike=False, encoding=123) + + @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"]) + def test_manual_universal_newlines(self, newline): + # This is currently not available to users, because we should always + # open files with universal newlines enabled `newlines=None`. + # (And reading from an iterator uses slightly different code paths.) + # We have no real support for `newline="\r"` or `newline="\n" as the + # user cannot specify those options. 
+        data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline),
+                        newline="")
+
+        res = np.core._multiarray_umath._load_from_filelike(
+            data, dtype=np.dtype("U10"), filelike=True,
+            quote='"', comment="#", skiplines=1)
+        assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])

From eb68e8709c76f466fae8f5229cea3ee1afbad754 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Fri, 14 Jan 2022 11:56:06 -0600
Subject: [PATCH 46/70] MAINT: Use skiplines rather than skiprows internally
 throughout

Skiplines is simply the clearer name, since "rows" makes a lot of sense
for output rows (which implies, for example, that a line is not empty)
---
 numpy/core/src/multiarray/textreading/readtext.c | 10 +++++-----
 numpy/lib/npyio.py                               | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index c1b174c99c56..5d3613736136 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -37,7 +37,7 @@ static PyObject *
 _readtext_from_stream(stream *s,
         parser_config *pc, Py_ssize_t num_usecols, Py_ssize_t usecols[],
-        Py_ssize_t skiprows, Py_ssize_t max_rows,
+        Py_ssize_t skiplines, Py_ssize_t max_rows,
         PyObject *converters, PyObject *dtype)
 {
     PyArrayObject *arr = NULL;
@@ -69,7 +69,7 @@ _readtext_from_stream(stream *s,
     arr = read_rows(
             s, max_rows, num_fields, ft, pc,
-            num_usecols, usecols, skiprows, converters,
+            num_usecols, usecols, skiplines, converters,
             NULL, out_dtype, homogeneous);
     if (arr == NULL) {
         goto finish;
@@ -105,7 +105,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
 {
     PyObject *file;
-    Py_ssize_t skiprows = 0;
+    Py_ssize_t skiplines = 0;
     Py_ssize_t max_rows = -1;
     PyObject *usecols_obj = Py_None;
     PyObject *converters = Py_None;
@@ -136,7 +136,7 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
             "|quote", &parse_control_character, &pc.quote,
             "|imaginary_unit", &parse_control_character, &pc.imaginary_unit,
             "|usecols", NULL, &usecols_obj,
-            "|skiprows", &PyArray_IntpFromPyIntConverter, &skiprows,
+            "|skiplines", &PyArray_IntpFromPyIntConverter, &skiplines,
             "|max_rows", &PyArray_IntpFromPyIntConverter, &max_rows,
             "|converters", NULL, &converters,
             "|dtype", NULL, &dtype,
@@ -220,7 +220,7 @@
     }
     arr = _readtext_from_stream(
-            s, &pc, num_usecols, usecols, skiprows, max_rows, converters, dtype);
+            s, &pc, num_usecols, usecols, skiplines, max_rows, converters, dtype);
     stream_close(s);
     PyMem_FREE(usecols);
     return arr;
diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index f15f94580673..a23150832321 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -799,7 +799,7 @@ def _preprocess_comments(iterable, comments, encoding):
 def _read(fname, *, delimiter=',', comment='#', quote='"',
-          imaginary_unit='j', usecols=None, skiprows=0,
+          imaginary_unit='j', usecols=None, skiplines=0,
           max_rows=None, converters=None, ndmin=None, unpack=False,
           dtype=np.float64, encoding="bytes"):
     r"""
@@ -829,7 +829,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
         A one-dimensional array of integer column numbers.  These are the
         columns from the file to be included in the array.  If this value
         is not given, all the columns are used.
-    skiprows : int, optional
+    skiplines : int, optional
        Number of lines to skip before interpreting the data in the
        file.
max_rows : int, optional Maximum number of rows of data to read. Default is to read the @@ -953,7 +953,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', if len(imaginary_unit) != 1: raise ValueError('len(imaginary_unit) must be 1.') - _check_nonneg_int(skiprows) + _check_nonneg_int(skiplines) if max_rows is not None: _check_nonneg_int(max_rows) else: @@ -994,7 +994,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', arr = _load_from_filelike( data, delimiter=delimiter, comment=comment, quote=quote, imaginary_unit=imaginary_unit, - usecols=usecols, skiprows=skiprows, max_rows=max_rows, + usecols=usecols, skiplines=skiplines, max_rows=max_rows, converters=converters, dtype=dtype, encoding=encoding, filelike=filelike, byte_converters=byte_converters) @@ -1021,7 +1021,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', next_arr = _load_from_filelike( data, delimiter=delimiter, comment=comment, quote=quote, imaginary_unit=imaginary_unit, - usecols=usecols, skiprows=skiprows, max_rows=max_rows, + usecols=usecols, skiplines=skiplines, max_rows=max_rows, converters=converters, dtype=dtype, encoding=encoding, filelike=filelike, byte_converters=byte_converters, @@ -1238,7 +1238,7 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, x.decode('latin1') if isinstance(x, bytes) else x for x in comment] arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, - converters=converters, skiprows=skiprows, usecols=usecols, + converters=converters, skiplines=skiprows, usecols=usecols, unpack=unpack, ndmin=ndmin, encoding=encoding, max_rows=max_rows, quote=quotechar) From 46269314148bdd6787d1a8d27d240590044b0edd Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 15:37:47 -0600 Subject: [PATCH 47/70] MAINT: Very minor style cleanups (mostly) I.e. no use of goto where it is not necessary. 
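A note on the rename above: the public `np.loadtxt` keyword remains `skiprows`; only the private `_read` helper and the C reader speak `skiplines`. A minimal sketch of the unchanged user-facing spelling:

    import numpy as np
    from io import StringIO

    data = StringIO("col_a col_b\n1 2\n3 4\n")
    # `skiprows` is forwarded internally as `skiplines`:
    arr = np.loadtxt(data, skiprows=1)   # -> [[1., 2.], [3., 4.]]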
--- numpy/core/src/multiarray/textreading/rows.c | 16 ++++++---------- .../src/multiarray/textreading/tokenize.c.src | 19 +++++++++++++------ .../src/multiarray/textreading/tokenize.h | 4 ++-- 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 7681acae3025..8003ff1dab21 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -388,23 +388,19 @@ read_rows(stream *s, } } - bool err = 0; + int parser_res; Py_UCS4 *str = ts.field_buffer + fields[col].offset; Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1; if (conv_funcs[i] == NULL) { - if (field_types[f].set_from_ucs4(field_types[f].descr, - str, end, item_ptr, pconfig) < 0) { - err = true; - } + parser_res = field_types[f].set_from_ucs4(field_types[f].descr, + str, end, item_ptr, pconfig); } else { - if (to_generic_with_converter(field_types[f].descr, - str, end, item_ptr, pconfig, conv_funcs[i]) < 0) { - err = true; - } + parser_res = to_generic_with_converter(field_types[f].descr, + str, end, item_ptr, pconfig, conv_funcs[i]); } - if (NPY_UNLIKELY(err)) { + if (NPY_UNLIKELY(parser_res < 0)) { PyObject *exc, *val, *tb; PyErr_Fetch(&exc, &val, &tb); diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index 68dd2ce933c1..752eee2dd75e 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -304,6 +304,10 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) /* Add the first field */ while (1) { + /* + * This loop adds new fields to the result (to make up a full row) + * until the row ends (typically a line end or the file end) + */ if (ts->state == TOKENIZE_INIT) { /* Start a new field */ if (add_field(ts) < 0) { @@ -325,7 +329,7 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) * in the string. `FileLike.readline()` does ensure it * is included. */ - goto finish; + break; } /* fetch new data */ ts->buf_state = stream_nextbuf(s, @@ -336,12 +340,12 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) if (ts->buf_state == BUFFER_IS_FILEEND) { finished_reading_file = 1; ts->pos = ts->end; /* stream should ensure this. */ - goto finish; + break; } else if (ts->pos == ts->end) { /* This must be an empty line (and it must be indicated!). */ assert(ts->buf_state == BUFFER_IS_LINEND); - goto finish; + break; } } int status; @@ -360,11 +364,13 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) } if (ts->state == TOKENIZE_LINE_END) { - goto finish; + break; } } - finish: + /* + * We have finished tokenizing a full row into fields, finalize result + */ if (ts->buf_state == BUFFER_IS_LINEND) { /* This line is "finished", make sure we don't touch it again: */ ts->buf_state = BUFFER_MAY_CONTAIN_NEWLINE; @@ -376,11 +382,12 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) } } - /* Finish the last field */ + /* Finish the last field (we "append" one to store the last ones length) */ if (add_field(ts) < 0) { return -1; } ts->num_fields -= 1; + /* * If have one field, but that field is completely empty, this is an * empty line, and we just ignore it. 
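As a rough pure-Python analogue of the loop documented above (a sketch only; the real tokenizer is a C state machine over chunked buffers that also handles doubled-quote escapes), fields are appended until the row ends, with quoted fields swallowing delimiters:

    def split_row(line, delimiter=",", quote="'"):
        fields, field, in_quotes = [], [], False
        for char in line.rstrip("\r\n"):
            if in_quotes:
                if char == quote:
                    in_quotes = False      # closing quote ends the span
                else:
                    field.append(char)
            elif char == quote:
                in_quotes = True
            elif char == delimiter:
                fields.append("".join(field))   # field done, start next
                field = []
            else:
                field.append(char)
        fields.append("".join(field))           # final field of the row
        return fields

    assert split_row("1,'2,x',3") == ["1", "2,x", "3"]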
diff --git a/numpy/core/src/multiarray/textreading/tokenize.h b/numpy/core/src/multiarray/textreading/tokenize.h
index 02aa8a8d81a8..fa10bb9b0142 100644
--- a/numpy/core/src/multiarray/textreading/tokenize.h
+++ b/numpy/core/src/multiarray/textreading/tokenize.h
@@ -38,7 +38,6 @@ typedef struct {
     tokenizer_parsing_state unquoted_state;
     int unicode_kind;
     int buf_state;
-    npy_intp num_fields;

     /* the buffer we are currently working on */
     char *pos;
     char *end;
@@ -60,8 +59,9 @@ typedef struct {
      *
      * The tokenizer assumes at least one field is allocated.
      */
-    field_info *fields;
+    npy_intp num_fields;
     npy_intp fields_size;
+    field_info *fields;
 } tokenizer_state;

From 0cb6bdcf2a28e8a3a74a302d0807cd054a15925f Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Fri, 14 Jan 2022 15:54:10 -0600
Subject: [PATCH 48/70] MAINT: Only allocate converters if necessary

This actually *slows* things down very mildly, but it just seems a bit
cleaner to me, so let's do it anyway...
---
 numpy/core/src/multiarray/textreading/rows.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c
index 8003ff1dab21..2ca97606066f 100644
--- a/numpy/core/src/multiarray/textreading/rows.c
+++ b/numpy/core/src/multiarray/textreading/rows.c
@@ -33,16 +33,15 @@ static PyObject **
 create_conv_funcs(
         PyObject *converters, Py_ssize_t num_fields, const Py_ssize_t *usecols)
 {
+    assert(converters != Py_None);
+
     PyObject **conv_funcs = PyMem_Calloc(num_fields, sizeof(PyObject *));
     if (conv_funcs == NULL) {
         PyErr_NoMemory();
         return NULL;
     }
-    if (converters == Py_None) {
-        return conv_funcs;
-    }
-    else if (PyCallable_Check(converters)) {
+    if (PyCallable_Check(converters)) {
         /* a single converter used for all columns individually */
         for (Py_ssize_t i = 0; i < num_fields; i++) {
             Py_INCREF(converters);
@@ -251,10 +250,12 @@ read_rows(stream *s,
         actual_num_fields = current_num_fields;
     }

-    conv_funcs = create_conv_funcs(
-            converters, actual_num_fields, usecols);
-    if (conv_funcs == NULL) {
-        goto error;
+    if (converters != Py_None) {
+        conv_funcs = create_conv_funcs(
+                converters, actual_num_fields, usecols);
+        if (conv_funcs == NULL) {
+            goto error;
+        }
     }

     /* Note that result_shape[1] is only used if homogeneous is true */

From 90c71f0a8a84d9f17243e28e01527b5fd1ecdbb9 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Fri, 14 Jan 2022 19:04:16 -0600
Subject: [PATCH 49/70] TST: Move most new loadtxt tests to its own file

This also adds two basic new tests around files/strings containing the
\0 character (proving that we handle that gracefully).
Also adds tests for:
* the `_` thousands delimiter (should fail, but doesn't for float128
  right now)
* Failure modes when the number of rows changes (negative specifically)

Many of these tests came originally from Warren Weckesser, and others
were added by Ross Barnowski:

Co-authored-by: Warren Weckesser
Co-authored-by: Ross Barnowski
---
 numpy/lib/tests/test_io.py      | 767 -----------------------------
 numpy/lib/tests/test_loadtxt.py | 836 ++++++++++++++++++++++++++++++++
 2 files changed, 836 insertions(+), 767 deletions(-)
 create mode 100644 numpy/lib/tests/test_loadtxt.py

diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py
index 5f66e0b6aa47..f142972b2024 100644
--- a/numpy/lib/tests/test_io.py
+++ b/numpy/lib/tests/test_io.py
@@ -2722,770 +2722,3 @@ def test_load_refcount():
     with assert_no_gc_cycles():
         x = np.loadtxt(TextIO("0 1 2 3"), dtype=dt)
         assert_equal(x, np.array([((0, 1), (2, 3))], dtype=dt))
-
-
-def test_loadtxt_scientific_notation():
-    """Test that both 'e' and 'E' are parsed correctly."""
-    data = TextIO(
-        (
-            "1.0e-1,2.0E1,3.0\n"
-            "4.0e-2,5.0E-1,6.0\n"
-            "7.0e-3,8.0E1,9.0\n"
-            "0.0e-4,1.0E-1,2.0"
-        )
-    )
-    expected = np.array(
-        [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]]
-    )
-    assert_array_equal(np.loadtxt(data, delimiter=","), expected)
-
-
-@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"])
-def test_loadtxt_comment_multiple_chars(comment):
-    content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n"
-    txt = TextIO(content.replace("#", comment))
-    a = np.loadtxt(txt, delimiter=",", comments=comment)
-    assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]])
-
-
-@pytest.fixture
-def mixed_types_structured():
-    """
-    Fixture providing hetergeneous input data with a structured dtype, along
-    with the associated structured array.
- """ - data = TextIO( - ( - "1000;2.4;alpha;-34\n" - "2000;3.1;beta;29\n" - "3500;9.9;gamma;120\n" - "4090;8.1;delta;0\n" - "5001;4.4;epsilon;-99\n" - "6543;7.8;omega;-1\n" - ) - ) - dtype = np.dtype( - [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] - ) - expected = np.array( - [ - (1000, 2.4, "alpha", -34), - (2000, 3.1, "beta", 29), - (3500, 9.9, "gamma", 120), - (4090, 8.1, "delta", 0), - (5001, 4.4, "epsilon", -99), - (6543, 7.8, "omega", -1) - ], - dtype=dtype - ) - return data, dtype, expected - - -@pytest.mark.parametrize('skiprows', [0, 1, 2, 3]) -def test_loadtxt_structured_dtype_and_skiprows_no_empty_lines( - skiprows, mixed_types_structured - ): - data, dtype, expected = mixed_types_structured - a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows) - assert_array_equal(a, expected[skiprows:]) - - -def test_loadtxt_unpack_structured(mixed_types_structured): - data, dtype, expected = mixed_types_structured - - a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True) - assert_array_equal(a, expected["f0"]) - assert_array_equal(b, expected["f1"]) - assert_array_equal(c, expected["f2"]) - assert_array_equal(d, expected["f3"]) - - -def test_loadtxt_structured_dtype_with_shape(): - dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)]) - data = TextIO("0,1,2,3\n6,7,8,9\n") - expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype) - assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected) - - -def test_loadtxt_structured_dtype_with_multi_shape(): - dtype = np.dtype([("a", "u1", (2, 2))]) - data = TextIO("0 1 2 3\n") - expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype) - assert_array_equal(np.loadtxt(data, dtype=dtype), expected) - - -def test_loadtxt_nested_structured_subarray(): - # Test from gh-16678 - point = np.dtype([('x', float), ('y', float)]) - dt = np.dtype([('code', int), ('points', point, (2,))]) - data = TextIO("100,1,2,3,4\n200,5,6,7,8\n") - expected = np.array( - [ - (100, [(1., 2.), (3., 4.)]), - (200, [(5., 6.), (7., 8.)]), - ], - dtype=dt - ) - assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) - - -def test_loadtxt_structured_dtype_offsets(): - # An aligned structured dtype will have additional padding - dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) - data = TextIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") - expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) - assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) - - -@pytest.mark.parametrize("param", ("skiprows", "max_rows")) -def test_loadtxt_exception_negative_row_limits(param): - """skiprows and max_rows should raise for negative parameters.""" - with pytest.raises(ValueError, match="argument must be nonnegative"): - np.loadtxt("foo.bar", **{param: -3}) - - -@pytest.mark.parametrize("param", ("skiprows", "max_rows")) -def test_loadtxt_exception_noninteger_row_limits(param): - with pytest.raises(TypeError, match="argument must be an integer"): - np.loadtxt("foo.bar", **{param: 1.0}) - - -@pytest.mark.parametrize( - "data, shape", - [ - ("1 2 3 4 5\n", (1, 5)), # Single row - ("1\n2\n3\n4\n5\n", (5, 1)), # Single column - ] -) -def test_loadtxt_ndmin_single_row_or_col(data, shape): - arr = np.array([1, 2, 3, 4, 5]) - arr2d = arr.reshape(shape) - - assert_array_equal(np.loadtxt(TextIO(data), dtype=int), arr) - assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=0), arr) - assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=1), arr) - 
assert_array_equal(np.loadtxt(TextIO(data), dtype=int, ndmin=2), arr2d) - - -@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) -def test_loadtxt_bad_ndmin(badval): - with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): - np.loadtxt("foo.bar", ndmin=badval) - - -@pytest.mark.parametrize( - "ws", - ( - "\t", # tab - "\u2003", # em - "\u00A0", # non-break - "\u3000", # ideographic space - ) -) -def test_loadtxt_blank_lines_spaces_delimit(ws): - txt = StringIO( - f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1" - ) - # NOTE: It is unclear that the ` # comment` should succeed. Except - # for delimiter=None, which should use any whitespace (and maybe - # should just be implemented closer to Python - expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) - assert_equal( - np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected - ) - - -def test_loadtxt_blank_lines_normal_delimiter(): - txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1') - expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]]) - assert_equal( - np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected - ) - - -@pytest.mark.parametrize("dtype", (float, object)) -def test_loadtxt_maxrows_no_blank_lines(dtype): - txt = TextIO("1.5,2.5\n3.0,4.0\n5.5,6.0") - res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2) - assert_equal(res.dtype, dtype) - assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) - - -@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) -def test_loadtxt_exception_message_bad_values(dtype): - txt = TextIO("1,2\n3,XXX\n5,6") - msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2" - with pytest.raises(ValueError, match=msg): - np.loadtxt(txt, dtype=dtype, delimiter=",") - - -def test_loadtxt_converters_negative_indices(): - txt = TextIO('1.5,2.5\n3.0,XXX\n5.5,6.0') - conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} - expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]]) - res = np.loadtxt( - txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None - ) - assert_equal(res, expected) - - -def test_loadtxt_converters_negative_indices_with_usecols(): - txt = TextIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n') - conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)} - expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]]) - res = np.loadtxt( - txt, - dtype=np.float64, - delimiter=",", - converters=conv, - usecols=[0, -1], - encoding=None, - ) - assert_equal(res, expected) - - -def test_loadtxt_ragged_usecols(): - # usecols, and negative ones, work even with varying number of columns. - txt = TextIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") - expected = np.array([[0, 0], [0, 0], [0, 0]]) - res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2]) - assert_equal(res, expected) - - -def test_loadtxt_empty_usecols(): - txt = TextIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n") - res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[]) - assert res.shape == (3,) - assert res.dtype == np.dtype([]) - - -@pytest.mark.parametrize("c1", ["a", "の", "🫕"]) -@pytest.mark.parametrize("c2", ["a", "の", "🫕"]) -def test_loadtxt_large_unicode_characters(c1, c2): - # c1 and c2 span ascii, 16bit and 32bit range. 
- txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") - res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") - expected = np.array( - [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], - dtype=np.dtype('U12') - ) - assert_equal(res, expected) - - -def test_loadtxt_unicode_with_converter(): - txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") - conv = {0: lambda s: s.upper()} - res = np.loadtxt( - txt, - dtype=np.dtype("U12"), - converters=conv, - delimiter=",", - encoding=None - ) - expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) - assert_equal(res, expected) - - -def test_loadtxt_converter_with_structured_dtype(): - txt = TextIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') - dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) - conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} - res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) - expected = np.array( - [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt - ) - assert_equal(res, expected) - - -def test_loadtxt_converter_with_unicode_dtype(): - """ - With the default 'bytes' encoding, tokens are encoded prior to being passed - to the converter. This means that the output of the converter may be bytes - instead of unicode as expected by `read_rows`. - - This test checks that outputs from the above scenario are properly decoded - prior to parsing by `read_rows`. - """ - txt = StringIO('abc,def\nrst,xyz') - conv = bytes.upper - res = np.loadtxt(txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") - expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) - assert_equal(res, expected) - - -def test_loadtxt_read_huge_row(): - row = "1.5, 2.5," * 50000 - row = row[:-1] + "\n" - txt = TextIO(row * 2) - res = np.loadtxt(txt, delimiter=",", dtype=float) - assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) - - -@pytest.mark.parametrize("dtype", "edfgFDG") -def test_loadtxt_huge_float(dtype): - # Covers a non-optimized path that is rarely taken: - field = "0" * 1000 + ".123456789" - dtype = np.dtype(dtype) - value = np.loadtxt([field], dtype=dtype)[()] - assert value == dtype.type("0.123456789") - - -@pytest.mark.parametrize( - ("given_dtype", "expected_dtype"), - [ - ("S", np.dtype("S5")), - ("U", np.dtype("U5")), - ], -) -def test_loadtxt_string_no_length_given(given_dtype, expected_dtype): - """ - The given dtype is just 'S' or 'U' with no length. In these cases, the - length of the resulting dtype is determined by the longest string found - in the file. - """ - txt = TextIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") - res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") - expected = np.array( - [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype - ) - assert_equal(res, expected) - assert_equal(res.dtype, expected_dtype) - - -def test_loadtxt_float_conversion(): - """ - Some tests that the conversion to float64 works as accurately as the Python - built-in `float` function. In a naive version of the float parser, these - strings resulted in values that were off by an ULP or two. 
- """ - strings = [ - '0.9999999999999999', - '9876543210.123456', - '5.43215432154321e+300', - '0.901', - '0.333', - ] - txt = TextIO('\n'.join(strings)) - res = np.loadtxt(txt) - expected = np.array([float(s) for s in strings]) - assert_equal(res, expected) - - -def test_loadtxt_bool(): - # Simple test for bool via integer - txt = TextIO("1, 0\n10, -1") - res = np.loadtxt(txt, dtype=bool, delimiter=",") - assert res.dtype == bool - assert_array_equal(res, [[True, False], [True, True]]) - # Make sure we use only 1 and 0 on the byte level: - assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) - - -@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) -def test_loadtxt_integer_signs(dtype): - dtype = np.dtype(dtype) - assert np.loadtxt(["+2"], dtype=dtype) == 2 - if dtype.kind == "u": - with pytest.raises(ValueError): - np.loadtxt(["-1\n"], dtype=dtype) - else: - assert np.loadtxt(["-2\n"], dtype=dtype) == -2 - - for sign in ["++", "+-", "--", "-+"]: - with pytest.raises(ValueError): - np.loadtxt([f"{sign}2\n"], dtype=dtype) - - -@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) -def test_loadtxt_implicit_cast_float_to_int_fails(dtype): - txt = TextIO("1.0, 2.1, 3.7\n4, 5, 6") - with pytest.raises(ValueError): - np.loadtxt(txt, dtype=dtype, delimiter=",") - -@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) -@pytest.mark.parametrize("with_parens", (False, True)) -def test_loadtxt_complex_parsing(dtype, with_parens): - s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" - if not with_parens: - s = s.replace("(", "").replace(")", "") - - res = np.loadtxt(TextIO(s), dtype=dtype, delimiter=",") - expected = np.array( - [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype - ) - assert_equal(res, expected) - - -def test_loadtxt_read_from_generator(): - def gen(): - for i in range(4): - yield f"{i},{2*i},{i**2}" - - res = np.loadtxt(gen(), dtype=int, delimiter=",") - expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) - assert_equal(res, expected) - - -def test_loadtxt_read_from_generator_multitype(): - def gen(): - for i in range(3): - yield f"{i} {i / 4}" - - res = np.loadtxt(gen(), dtype="i, d", delimiter=" ") - expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d") - assert_equal(res, expected) - - -def test_loadtxt_read_from_bad_generator(): - def gen(): - for entry in ["1,2", b"3, 5", 12738]: - yield entry - - with pytest.raises( - TypeError, match=r"non-string returned while reading data" - ): - np.loadtxt(gen(), dtype="i, i", delimiter=",") - - -@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") -def test_loadtxt_object_cleanup_on_read_error(): - sentinel = object() - - already_read = 0 - def conv(x): - nonlocal already_read - if already_read > 4999: - raise ValueError("failed half-way through!") - already_read += 1 - return sentinel - - txt = TextIO("x\n" * 10000) - - with pytest.raises(ValueError, match="at row 5000, column 1"): - np.loadtxt(txt, dtype=object, converters={0: conv}) - - assert sys.getrefcount(sentinel) == 2 - - -def test_loadtxt_character_not_bytes_compatible(): - """Test exception when a character cannot be encoded as 'S'.""" - data = StringIO("–") # == \u2013 - with pytest.raises(ValueError): - np.loadtxt(data, dtype="S5") - - -@pytest.mark.parametrize("conv", (0, [float], "")) -def test_loadtxt_invalid_converter(conv): - msg = ( - "converters must be a dictionary mapping columns to converter " - "functions or a single callable." 
- ) - with pytest.raises(TypeError, match=msg): - np.loadtxt(TextIO("1 2\n3 4"), converters=conv) - - -def test_loadtxt_converters_dict_raises_non_integer_key(): - with pytest.raises(TypeError, match="keys of the converters dict"): - np.loadtxt(TextIO("1 2\n3 4"), converters={"a": int}) - with pytest.raises(TypeError, match="keys of the converters dict"): - np.loadtxt(TextIO("1 2\n3 4"), converters={"a": int}, usecols=0) - - -@pytest.mark.parametrize("bad_col_ind", (3, -3)) -def test_loadtxt_converters_dict_raises_non_col_key(bad_col_ind): - data = TextIO("1 2\n3 4") - with pytest.raises(ValueError, match="converter specified for column"): - np.loadtxt(data, converters={bad_col_ind: int}) - - -def test_loadtxt_converters_dict_raises_val_not_callable(): - with pytest.raises( - TypeError, match="values of the converters dictionary must be callable" - ): - np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) - - -@pytest.mark.parametrize("q", ('"', "'", "`")) -def test_loadtxt_quoted_field(q): - txt = TextIO( - f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n" - ) - dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) - expected = np.array( - [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype - ) - - res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) - assert_array_equal(res, expected) - - -def test_loadtxt_quote_support_default(): - """Support for quoted fields is disabled by default.""" - txt = TextIO('"lat,long", 45, 30\n') - dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)]) - - with pytest.raises(ValueError, match="the number of columns changed"): - np.loadtxt(txt, dtype=dtype, delimiter=",") - - # Enable quoting support with non-None value for quotechar param - txt.seek(0) - expected = np.array([("lat,long", 45., 30.)], dtype=dtype) - - res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') - assert_array_equal(res, expected) - - -def test_loadtxt_quotechar_multichar_error(): - txt = StringIO("1,2\n3,4") - msg = r".*must be a single unicode character or None" - with pytest.raises(TypeError, match=msg): - np.loadtxt(txt, delimiter=",", quotechar="''") - - -def test_loadtxt_comment_multichar_error_with_quote(): - txt = StringIO("1,2\n3,4") - msg = ( - "when multiple comments or a multi-character comment is given, " - "quotes are not supported." 
- ) - with pytest.raises(ValueError, match=msg): - np.loadtxt(txt, delimiter=",", comments="123", quotechar='"') - with pytest.raises(ValueError, match=msg): - np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"') - - # A single character string in a tuple is unpacked though: - res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") - assert_equal(res, [[1, 2], [3, 4]]) - - -def test_loadtxt_structured_dtype_with_quotes(): - data = TextIO( - ( - "1000;2.4;'alpha';-34\n" - "2000;3.1;'beta';29\n" - "3500;9.9;'gamma';120\n" - "4090;8.1;'delta';0\n" - "5001;4.4;'epsilon';-99\n" - "6543;7.8;'omega';-1\n" - ) - ) - dtype = np.dtype( - [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] - ) - expected = np.array( - [ - (1000, 2.4, "alpha", -34), - (2000, 3.1, "beta", 29), - (3500, 9.9, "gamma", 120), - (4090, 8.1, "delta", 0), - (5001, 4.4, "epsilon", -99), - (6543, 7.8, "omega", -1) - ], - dtype=dtype - ) - res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") - assert_array_equal(res, expected) - - -def test_loadtxt_quoted_field_is_not_empty(): - txt = StringIO('1\n\n"4"\n""') - expected = np.array(["1", "4", ""], dtype="U1") - res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') - assert_equal(res, expected) - - -def test_loadtxt_consecutive_quotechar_escaped(): - txt = TextIO('"Hello, my name is ""Monty""!"') - expected = np.array('Hello, my name is "Monty"!', dtype="U40") - res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') - assert_equal(res, expected) - - -@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) -@pytest.mark.parametrize("ndmin", (0, 1, 2)) -@pytest.mark.parametrize("usecols", [None, (1, 2, 3)]) -def test_loadtxt_warn_on_no_data(data, ndmin, usecols): - """Check that a UserWarning is emitted when no data is read from input.""" - if usecols is not None: - expected_shape = (0, 3) - elif ndmin == 2: - expected_shape = (0, 1) # guess a single column?! 
- else: - expected_shape = (0,) - - txt = TextIO(data) - with pytest.warns(UserWarning, match="input contained no data"): - res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) - assert res.shape == expected_shape - - with NamedTemporaryFile(mode="w") as fh: - fh.write(data) - fh.seek(0) - with pytest.warns(UserWarning, match="input contained no data"): - res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols) - assert res.shape == expected_shape - -@pytest.mark.parametrize("skiprows", (2, 3)) -def test_loadtxt_warn_on_skipped_data(skiprows): - data = "1 2 3\n4 5 6" - txt = TextIO(data) - with pytest.warns(UserWarning, match="input contained no data"): - np.loadtxt(txt, skiprows=skiprows) - -@pytest.mark.parametrize("dtype", - list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"]) -@pytest.mark.parametrize("swap", [True, False]) -def test_loadtxt_byteswapping_and_unaligned(dtype, swap): - data = ["x,1\n"] # no need for complicated data - dtype = np.dtype(dtype) - if swap: - dtype = dtype.newbyteorder() - full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False) - # The above ensures that the interesting "b" field is unaligned: - assert full_dt.fields["b"][1] == 1 - res = np.loadtxt(data, dtype=full_dt, delimiter=",") - assert res["b"] == dtype.type(1) - -@pytest.mark.parametrize("dtype", - np.typecodes["AllInteger"] + "efdFD" + "?") -def test_loadtxt_unicode_whitespace_stripping(dtype): - # Test that all numeric types (and bool) strip whitespace correctly - # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted. - # Currently, skip float128 as it did not always support this and has no - # "custom" parsing: - txt = StringIO(' 3 ,"\u202F2\n"') - res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') - assert_array_equal(res, np.array([3, 2]).astype(dtype)) - -@pytest.mark.parametrize("dtype", "FD") -def test_loadtxt_unicode_whitespace_stripping_complex(dtype): - # Complex has a few extra cases since it has two components and parentheses - line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n" - data = [line, line.replace(" ", "\u202F")] - res = np.loadtxt(data, dtype=dtype, delimiter=',') - assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2)) - -@pytest.mark.parametrize("dtype", "FD") -@pytest.mark.parametrize("field", - ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"]) -def test_loadtxt_bad_complex(dtype, field): - with pytest.raises(ValueError): - np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") - - -@pytest.mark.parametrize("data", [ - ["1,2\n", "2\n,3\n"], - ["1,2\n", "2\r,3\n"]]) -def test_loadtxt_bad_newline_in_iterator(data): - # In NumPy <=1.22 this was accepted, because newlines were completely - # ignored when the input was an iterable. This could be changed, but right - # now, we raise an error. - with pytest.raises(ValueError, - match="Found an unquoted embedded newline within a single line"): - np.loadtxt(data, delimiter=",") - -@pytest.mark.parametrize("data", [ - ["1,2\n", "2,3\r\n"], # a universal newline - ["1,2\n", "'2\n',3\n"], # a quoted newline - ["1,2\n", "'2\r',3\n"], - ["1,2\n", "'2\r\n',3\n"], -]) -def test_loadtxt_good_newline_in_iterator(data): - # The quoted newlines will be untransformed here, but are just whitespace. 
- res = np.loadtxt(data, delimiter=",", quotechar="'") - assert_array_equal(res, [[1., 2.], [2., 3.]]) - - -@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"]) -def test_unviersal_newlines_quoted(newline): - # Check that universal newline support within the tokenizer is not applied - # to quoted fields. (note that lines must end in newline or quoted - # fields will not include a newline at all) - data = ['1,"2\n"\n', '3,"4\n', '1"\n'] - data = [row.replace("\n", newline) for row in data] - res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"') - assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']]) - - -def test_loadtxt_iterator_fails_getting_next_line(): - class BadSequence: - def __len__(self): - return 100 - - def __getitem__(self, item): - if item == 50: - raise RuntimeError("Bad things happened!") - return f"{item}, {item+1}" - - with pytest.raises(RuntimeError, match="Bad things happened!"): - np.loadtxt(BadSequence(), dtype=int, delimiter=",") - - -class TestCReaderUnitTests: - # These are internal tests for path that should not be possible to hit - # unless things go very very wrong somewhere. - def test_not_an_filelike(self): - with pytest.raises(AttributeError, match=".*read"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype=np.dtype("i"), filelike=True) - - def test_filelike_read_fails(self): - # Can only be reached if loadtxt opens the file, so it is hard to do - # via the public interface (although maybe not impossible considering - # the current "DataClass" backing). - class BadFileLike: - counter = 0 - def read(self, size): - self.counter += 1 - if self.counter > 20: - raise RuntimeError("Bad bad bad!") - return "1,2,3\n" - - with pytest.raises(RuntimeError, match="Bad bad bad!"): - np.core._multiarray_umath._load_from_filelike( - BadFileLike(), dtype=np.dtype("i"), filelike=True) - - def test_filelike_bad_read(self): - # Can only be reached if loadtxt opens the file, so it is hard to do - # via the public interface (although maybe not impossible considering - # the current "DataClass" backing). - class BadFileLike: - counter = 0 - def read(self, size): - return 1234 # not a string! - - with pytest.raises(TypeError, - match="non-string returned while reading data"): - np.core._multiarray_umath._load_from_filelike( - BadFileLike(), dtype=np.dtype("i"), filelike=True) - - def test_not_an_iter(self): - with pytest.raises(TypeError, - match="error reading from object, expected an iterable"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype=np.dtype("i"), filelike=False) - - def test_bad_type(self): - with pytest.raises(TypeError, match="internal error: dtype must"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype="i", filelike=False) - - def test_bad_encoding(self): - with pytest.raises(TypeError, match="encoding must be a unicode"): - np.core._multiarray_umath._load_from_filelike( - object(), dtype=np.dtype("i"), filelike=False, encoding=123) - - @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"]) - def test_manual_universal_newlines(self, newline): - # This is currently not available to users, because we should always - # open files with universal newlines enabled `newlines=None`. - # (And reading from an iterator uses slightly different code paths.) - # We have no real support for `newline="\r"` or `newline="\n" as the - # user cannot specify those options. 
-        data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline),
-                        newline="")
-
-        res = np.core._multiarray_umath._load_from_filelike(
-            data, dtype=np.dtype("U10"), filelike=True,
-            quote='"', comment="#", skiplines=1)
-        assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "])
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
new file mode 100644
index 000000000000..b8fd9a79686c
--- /dev/null
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -0,0 +1,836 @@
+"""
+Tests specific to `np.loadtxt` added during the move of loadtxt to be backed
+by C code.
+These tests complement those found in `test_io.py`.
+"""
+
+import sys
+import pytest
+from tempfile import NamedTemporaryFile
+from io import StringIO
+
+import numpy as np
+from numpy.ma.testutils import assert_equal
+from numpy.testing import assert_array_equal, HAS_REFCOUNT
+
+
+def test_scientific_notation():
+    """Test that both 'e' and 'E' are parsed correctly."""
+    data = StringIO(
+        (
+            "1.0e-1,2.0E1,3.0\n"
+            "4.0e-2,5.0E-1,6.0\n"
+            "7.0e-3,8.0E1,9.0\n"
+            "0.0e-4,1.0E-1,2.0"
+        )
+    )
+    expected = np.array(
+        [[0.1, 20., 3.0], [0.04, 0.5, 6], [0.007, 80., 9], [0, 0.1, 2]]
+    )
+    assert_array_equal(np.loadtxt(data, delimiter=","), expected)
+
+
+@pytest.mark.parametrize("comment", ["..", "//", "@-", "this is a comment:"])
+def test_comment_multiple_chars(comment):
+    content = "# IGNORE\n1.5, 2.5# ABC\n3.0,4.0# XXX\n5.5,6.0\n"
+    txt = StringIO(content.replace("#", comment))
+    a = np.loadtxt(txt, delimiter=",", comments=comment)
+    assert_equal(a, [[1.5, 2.5], [3.0, 4.0], [5.5, 6.0]])
+
+
+@pytest.fixture
+def mixed_types_structured():
+    """
+    Fixture providing heterogeneous input data with a structured dtype, along
+    with the associated structured array.
+    """
+    data = StringIO(
+        (
+            "1000;2.4;alpha;-34\n"
+            "2000;3.1;beta;29\n"
+            "3500;9.9;gamma;120\n"
+            "4090;8.1;delta;0\n"
+            "5001;4.4;epsilon;-99\n"
+            "6543;7.8;omega;-1\n"
+        )
+    )
+    dtype = np.dtype(
+        [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)]
+    )
+    expected = np.array(
+        [
+            (1000, 2.4, "alpha", -34),
+            (2000, 3.1, "beta", 29),
+            (3500, 9.9, "gamma", 120),
+            (4090, 8.1, "delta", 0),
+            (5001, 4.4, "epsilon", -99),
+            (6543, 7.8, "omega", -1)
+        ],
+        dtype=dtype
+    )
+    return data, dtype, expected
+
+
+@pytest.mark.parametrize('skiprows', [0, 1, 2, 3])
+def test_structured_dtype_and_skiprows_no_empty_lines(
+        skiprows, mixed_types_structured):
+    data, dtype, expected = mixed_types_structured
+    a = np.loadtxt(data, dtype=dtype, delimiter=";", skiprows=skiprows)
+    assert_array_equal(a, expected[skiprows:])
+
+
+def test_unpack_structured(mixed_types_structured):
+    data, dtype, expected = mixed_types_structured
+
+    a, b, c, d = np.loadtxt(data, dtype=dtype, delimiter=";", unpack=True)
+    assert_array_equal(a, expected["f0"])
+    assert_array_equal(b, expected["f1"])
+    assert_array_equal(c, expected["f2"])
+    assert_array_equal(d, expected["f3"])
+
+
+def test_structured_dtype_with_shape():
+    dtype = np.dtype([("a", "u1", 2), ("b", "u1", 2)])
+    data = StringIO("0,1,2,3\n6,7,8,9\n")
+    expected = np.array([((0, 1), (2, 3)), ((6, 7), (8, 9))], dtype=dtype)
+    assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dtype), expected)
+
+
+def test_structured_dtype_with_multi_shape():
+    dtype = np.dtype([("a", "u1", (2, 2))])
+    data = StringIO("0 1 2 3\n")
+    expected = np.array([(((0, 1), (2, 3)),)], dtype=dtype)
+    assert_array_equal(np.loadtxt(data, dtype=dtype), expected)
+
+
+def test_nested_structured_subarray():
+    # Test from gh-16678
+    point =
np.dtype([('x', float), ('y', float)]) + dt = np.dtype([('code', int), ('points', point, (2,))]) + data = StringIO("100,1,2,3,4\n200,5,6,7,8\n") + expected = np.array( + [ + (100, [(1., 2.), (3., 4.)]), + (200, [(5., 6.), (7., 8.)]), + ], + dtype=dt + ) + assert_array_equal(np.loadtxt(data, dtype=dt, delimiter=","), expected) + + +def test_structured_dtype_offsets(): + # An aligned structured dtype will have additional padding + dt = np.dtype("i1, i4, i1, i4, i1, i4", align=True) + data = StringIO("1,2,3,4,5,6\n7,8,9,10,11,12\n") + expected = np.array([(1, 2, 3, 4, 5, 6), (7, 8, 9, 10, 11, 12)], dtype=dt) + assert_array_equal(np.loadtxt(data, delimiter=",", dtype=dt), expected) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_negative_row_limits(param): + """skiprows and max_rows should raise for negative parameters.""" + with pytest.raises(ValueError, match="argument must be nonnegative"): + np.loadtxt("foo.bar", **{param: -3}) + + +@pytest.mark.parametrize("param", ("skiprows", "max_rows")) +def test_exception_noninteger_row_limits(param): + with pytest.raises(TypeError, match="argument must be an integer"): + np.loadtxt("foo.bar", **{param: 1.0}) + + +@pytest.mark.parametrize( + "data, shape", + [ + ("1 2 3 4 5\n", (1, 5)), # Single row + ("1\n2\n3\n4\n5\n", (5, 1)), # Single column + ] +) +def test_ndmin_single_row_or_col(data, shape): + arr = np.array([1, 2, 3, 4, 5]) + arr2d = arr.reshape(shape) + + assert_array_equal(np.loadtxt(StringIO(data), dtype=int), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=0), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=1), arr) + assert_array_equal(np.loadtxt(StringIO(data), dtype=int, ndmin=2), arr2d) + + +@pytest.mark.parametrize("badval", [-1, 3, None, "plate of shrimp"]) +def test_bad_ndmin(badval): + with pytest.raises(ValueError, match="Illegal value of ndmin keyword"): + np.loadtxt("foo.bar", ndmin=badval) + + +@pytest.mark.parametrize( + "ws", + ( + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_blank_lines_spaces_delimit(ws): + txt = StringIO( + f"1 2{ws}30\n\n4 5 60\n {ws} \n7 8 {ws} 90\n # comment\n3 2 1" + ) + # NOTE: It is unclear that the ` # comment` should succeed. 
Except
+    # for delimiter=None, which should use any whitespace (and maybe
+    # should just be implemented closer to Python's `str.split()`).
+    expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]])
+    assert_equal(
+        np.loadtxt(txt, dtype=int, delimiter=None, comments="#"), expected
+    )
+
+
+def test_blank_lines_normal_delimiter():
+    txt = StringIO('1,2,30\n\n4,5,60\n\n7,8,90\n# comment\n3,2,1')
+    expected = np.array([[1, 2, 30], [4, 5, 60], [7, 8, 90], [3, 2, 1]])
+    assert_equal(
+        np.loadtxt(txt, dtype=int, delimiter=',', comments="#"), expected
+    )
+
+
+@pytest.mark.parametrize("dtype", (float, object))
+def test_maxrows_no_blank_lines(dtype):
+    txt = StringIO("1.5,2.5\n3.0,4.0\n5.5,6.0")
+    res = np.loadtxt(txt, dtype=dtype, delimiter=",", max_rows=2)
+    assert_equal(res.dtype, dtype)
+    assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype))
+
+
+@pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2")))
+def test_exception_message_bad_values(dtype):
+    txt = StringIO("1,2\n3,XXX\n5,6")
+    msg = f"could not convert string 'XXX' to {dtype} at row 1, column 2"
+    with pytest.raises(ValueError, match=msg):
+        np.loadtxt(txt, dtype=dtype, delimiter=",")
+
+
+def test_converters_negative_indices():
+    txt = StringIO('1.5,2.5\n3.0,XXX\n5.5,6.0')
+    conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)}
+    expected = np.array([[1.5, 2.5], [3.0, np.nan], [5.5, 6.0]])
+    res = np.loadtxt(
+        txt, dtype=np.float64, delimiter=",", converters=conv, encoding=None
+    )
+    assert_equal(res, expected)
+
+
+def test_converters_negative_indices_with_usecols():
+    txt = StringIO('1.5,2.5,3.5\n3.0,4.0,XXX\n5.5,6.0,7.5\n')
+    conv = {-1: lambda s: np.nan if s == 'XXX' else float(s)}
+    expected = np.array([[1.5, 3.5], [3.0, np.nan], [5.5, 7.5]])
+    res = np.loadtxt(
+        txt,
+        dtype=np.float64,
+        delimiter=",",
+        converters=conv,
+        usecols=[0, -1],
+        encoding=None,
+    )
+    assert_equal(res, expected)
+
+
+def test_ragged_usecols():
+    # usecols, and negative ones, work even with varying number of columns.
+    txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n")
+    expected = np.array([[0, 0], [0, 0], [0, 0]])
+    res = np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2])
+    assert_equal(res, expected)
+
+    txt = StringIO("0,0,XXX\n0\n0,XXX,XXX,0,XXX\n")
+    with pytest.raises(ValueError,
+            match="invalid column index -2 at row 1 with 2 columns"):
+        # There is no -2 column in the second row:
+        np.loadtxt(txt, dtype=float, delimiter=",", usecols=[0, -2])
+
+
+def test_empty_usecols():
+    txt = StringIO("0,0,XXX\n0,XXX,0,XXX\n0,XXX,XXX,0,XXX\n")
+    res = np.loadtxt(txt, dtype=np.dtype([]), delimiter=",", usecols=[])
+    assert res.shape == (3,)
+    assert res.dtype == np.dtype([])
+
+
+@pytest.mark.parametrize("c1", ["a", "の", "🫕"])
+@pytest.mark.parametrize("c2", ["a", "の", "🫕"])
+def test_large_unicode_characters(c1, c2):
+    # c1 and c2 span ascii, 16bit and 32bit range.
+ txt = StringIO(f"a,{c1},c,1.0\ne,{c2},2.0,g") + res = np.loadtxt(txt, dtype=np.dtype('U12'), delimiter=",") + expected = np.array( + [f"a,{c1},c,1.0".split(","), f"e,{c2},2.0,g".split(",")], + dtype=np.dtype('U12') + ) + assert_equal(res, expected) + + +def test_unicode_with_converter(): + txt = StringIO("cat,dog\nαβγ,δεζ\nabc,def\n") + conv = {0: lambda s: s.upper()} + res = np.loadtxt( + txt, + dtype=np.dtype("U12"), + converters=conv, + delimiter=",", + encoding=None + ) + expected = np.array([['CAT', 'dog'], ['ΑΒΓ', 'δεζ'], ['ABC', 'def']]) + assert_equal(res, expected) + + +def test_converter_with_structured_dtype(): + txt = StringIO('1.5,2.5,Abc\n3.0,4.0,dEf\n5.5,6.0,ghI\n') + dt = np.dtype([('m', np.int32), ('r', np.float32), ('code', 'U8')]) + conv = {0: lambda s: int(10*float(s)), -1: lambda s: s.upper()} + res = np.loadtxt(txt, dtype=dt, delimiter=",", converters=conv) + expected = np.array( + [(15, 2.5, 'ABC'), (30, 4.0, 'DEF'), (55, 6.0, 'GHI')], dtype=dt + ) + assert_equal(res, expected) + + +def test_converter_with_unicode_dtype(): + """ + With the default 'bytes' encoding, tokens are encoded prior to being passed + to the converter. This means that the output of the converter may be bytes + instead of unicode as expected by `read_rows`. + + This test checks that outputs from the above scenario are properly decoded + prior to parsing by `read_rows`. + """ + txt = StringIO('abc,def\nrst,xyz') + conv = bytes.upper + res = np.loadtxt(txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") + expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) + assert_equal(res, expected) + + +def test_read_huge_row(): + row = "1.5, 2.5," * 50000 + row = row[:-1] + "\n" + txt = StringIO(row * 2) + res = np.loadtxt(txt, delimiter=",", dtype=float) + assert_equal(res, np.tile([1.5, 2.5], (2, 50000))) + + +@pytest.mark.parametrize("dtype", "edfgFDG") +def test_huge_float(dtype): + # Covers a non-optimized path that is rarely taken: + field = "0" * 1000 + ".123456789" + dtype = np.dtype(dtype) + value = np.loadtxt([field], dtype=dtype)[()] + assert value == dtype.type("0.123456789") + + +@pytest.mark.parametrize( + ("given_dtype", "expected_dtype"), + [ + ("S", np.dtype("S5")), + ("U", np.dtype("U5")), + ], +) +def test_string_no_length_given(given_dtype, expected_dtype): + """ + The given dtype is just 'S' or 'U' with no length. In these cases, the + length of the resulting dtype is determined by the longest string found + in the file. + """ + txt = StringIO("AAA,5-1\nBBBBB,0-3\nC,4-9\n") + res = np.loadtxt(txt, dtype=given_dtype, delimiter=",") + expected = np.array( + [['AAA', '5-1'], ['BBBBB', '0-3'], ['C', '4-9']], dtype=expected_dtype + ) + assert_equal(res, expected) + assert_equal(res.dtype, expected_dtype) + + +def test_float_conversion(): + """ + Some tests that the conversion to float64 works as accurately as the Python + built-in `float` function. In a naive version of the float parser, these + strings resulted in values that were off by an ULP or two. 
+ """ + strings = [ + '0.9999999999999999', + '9876543210.123456', + '5.43215432154321e+300', + '0.901', + '0.333', + ] + txt = StringIO('\n'.join(strings)) + res = np.loadtxt(txt) + expected = np.array([float(s) for s in strings]) + assert_equal(res, expected) + + +def test_bool(): + # Simple test for bool via integer + txt = StringIO("1, 0\n10, -1") + res = np.loadtxt(txt, dtype=bool, delimiter=",") + assert res.dtype == bool + assert_array_equal(res, [[True, False], [True, True]]) + # Make sure we use only 1 and 0 on the byte level: + assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) + + +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_integer_signs(dtype): + dtype = np.dtype(dtype) + assert np.loadtxt(["+2"], dtype=dtype) == 2 + if dtype.kind == "u": + with pytest.raises(ValueError): + np.loadtxt(["-1\n"], dtype=dtype) + else: + assert np.loadtxt(["-2\n"], dtype=dtype) == -2 + + for sign in ["++", "+-", "--", "-+"]: + with pytest.raises(ValueError): + np.loadtxt([f"{sign}2\n"], dtype=dtype) + + +@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) +def test_implicit_cast_float_to_int_fails(dtype): + txt = StringIO("1.0, 2.1, 3.7\n4, 5, 6") + with pytest.raises(ValueError): + np.loadtxt(txt, dtype=dtype, delimiter=",") + +@pytest.mark.parametrize("dtype", (np.complex64, np.complex128)) +@pytest.mark.parametrize("with_parens", (False, True)) +def test_complex_parsing(dtype, with_parens): + s = "(1.0-2.5j),3.75,(7+-5.0j)\n(4),(-19e2j),(0)" + if not with_parens: + s = s.replace("(", "").replace(")", "") + + res = np.loadtxt(StringIO(s), dtype=dtype, delimiter=",") + expected = np.array( + [[1.0-2.5j, 3.75, 7-5j], [4.0, -1900j, 0]], dtype=dtype + ) + assert_equal(res, expected) + + +def test_read_from_generator(): + def gen(): + for i in range(4): + yield f"{i},{2*i},{i**2}" + + res = np.loadtxt(gen(), dtype=int, delimiter=",") + expected = np.array([[0, 0, 0], [1, 2, 1], [2, 4, 4], [3, 6, 9]]) + assert_equal(res, expected) + + +def test_read_from_generator_multitype(): + def gen(): + for i in range(3): + yield f"{i} {i / 4}" + + res = np.loadtxt(gen(), dtype="i, d", delimiter=" ") + expected = np.array([(0, 0.0), (1, 0.25), (2, 0.5)], dtype="i, d") + assert_equal(res, expected) + + +def test_read_from_bad_generator(): + def gen(): + for entry in ["1,2", b"3, 5", 12738]: + yield entry + + with pytest.raises( + TypeError, match=r"non-string returned while reading data" + ): + np.loadtxt(gen(), dtype="i, i", delimiter=",") + + +@pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") +def test_object_cleanup_on_read_error(): + sentinel = object() + + already_read = 0 + def conv(x): + nonlocal already_read + if already_read > 4999: + raise ValueError("failed half-way through!") + already_read += 1 + return sentinel + + txt = StringIO("x\n" * 10000) + + with pytest.raises(ValueError, match="at row 5000, column 1"): + np.loadtxt(txt, dtype=object, converters={0: conv}) + + assert sys.getrefcount(sentinel) == 2 + + +def test_character_not_bytes_compatible(): + """Test exception when a character cannot be encoded as 'S'.""" + data = StringIO("–") # == \u2013 + with pytest.raises(ValueError): + np.loadtxt(data, dtype="S5") + + +@pytest.mark.parametrize("conv", (0, [float], "")) +def test_invalid_converter(conv): + msg = ( + "converters must be a dictionary mapping columns to converter " + "functions or a single callable." 
+ ) + with pytest.raises(TypeError, match=msg): + np.loadtxt(StringIO("1 2\n3 4"), converters=conv) + + +def test_converters_dict_raises_non_integer_key(): + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}) + with pytest.raises(TypeError, match="keys of the converters dict"): + np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int}, usecols=0) + + +@pytest.mark.parametrize("bad_col_ind", (3, -3)) +def test_converters_dict_raises_non_col_key(bad_col_ind): + data = StringIO("1 2\n3 4") + with pytest.raises(ValueError, match="converter specified for column"): + np.loadtxt(data, converters={bad_col_ind: int}) + + +def test_converters_dict_raises_val_not_callable(): + with pytest.raises( + TypeError, match="values of the converters dictionary must be callable" + ): + np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) + + +@pytest.mark.parametrize("q", ('"', "'", "`")) +def test_quoted_field(q): + txt = StringIO( + f"{q}alpha, x{q}, 2.5\n{q}beta, y{q}, 4.5\n{q}gamma, z{q}, 5.0\n" + ) + dtype = np.dtype([('f0', 'U8'), ('f1', np.float64)]) + expected = np.array( + [("alpha, x", 2.5), ("beta, y", 4.5), ("gamma, z", 5.0)], dtype=dtype + ) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar=q) + assert_array_equal(res, expected) + + +def test_quote_support_default(): + """Support for quoted fields is disabled by default.""" + txt = StringIO('"lat,long", 45, 30\n') + dtype = np.dtype([('f0', 'U24'), ('f1', np.float64), ('f2', np.float64)]) + + with pytest.raises(ValueError, match="the number of columns changed"): + np.loadtxt(txt, dtype=dtype, delimiter=",") + + # Enable quoting support with non-None value for quotechar param + txt.seek(0) + expected = np.array([("lat,long", 45., 30.)], dtype=dtype) + + res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"') + assert_array_equal(res, expected) + + +def test_quotechar_multichar_error(): + txt = StringIO("1,2\n3,4") + msg = r".*must be a single unicode character or None" + with pytest.raises(TypeError, match=msg): + np.loadtxt(txt, delimiter=",", quotechar="''") + + +def test_comment_multichar_error_with_quote(): + txt = StringIO("1,2\n3,4") + msg = ( + "when multiple comments or a multi-character comment is given, " + "quotes are not supported." 
+ ) + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments="123", quotechar='"') + with pytest.raises(ValueError, match=msg): + np.loadtxt(txt, delimiter=",", comments=["#", "%"], quotechar='"') + + # A single character string in a tuple is unpacked though: + res = np.loadtxt(txt, delimiter=",", comments=("#",), quotechar="'") + assert_equal(res, [[1, 2], [3, 4]]) + + +def test_structured_dtype_with_quotes(): + data = StringIO( + ( + "1000;2.4;'alpha';-34\n" + "2000;3.1;'beta';29\n" + "3500;9.9;'gamma';120\n" + "4090;8.1;'delta';0\n" + "5001;4.4;'epsilon';-99\n" + "6543;7.8;'omega';-1\n" + ) + ) + dtype = np.dtype( + [('f0', np.uint16), ('f1', np.float64), ('f2', 'S7'), ('f3', np.int8)] + ) + expected = np.array( + [ + (1000, 2.4, "alpha", -34), + (2000, 3.1, "beta", 29), + (3500, 9.9, "gamma", 120), + (4090, 8.1, "delta", 0), + (5001, 4.4, "epsilon", -99), + (6543, 7.8, "omega", -1) + ], + dtype=dtype + ) + res = np.loadtxt(data, dtype=dtype, delimiter=";", quotechar="'") + assert_array_equal(res, expected) + + +def test_quoted_field_is_not_empty(): + txt = StringIO('1\n\n"4"\n""') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + +def test_quoted_field_is_not_empty_nonstrict(): + # Same as test_quoted_field_is_not_empty but check that we are not strict + # about missing closing quote (this is the `csv.reader` default also) + txt = StringIO('1\n\n"4"\n"') + expected = np.array(["1", "4", ""], dtype="U1") + res = np.loadtxt(txt, delimiter=",", dtype="U1", quotechar='"') + assert_equal(res, expected) + +def test_consecutive_quotechar_escaped(): + txt = StringIO('"Hello, my name is ""Monty""!"') + expected = np.array('Hello, my name is "Monty"!', dtype="U40") + res = np.loadtxt(txt, dtype="U40", delimiter=",", quotechar='"') + assert_equal(res, expected) + + +@pytest.mark.parametrize("data", ("", "\n\n\n", "# 1 2 3\n# 4 5 6\n")) +@pytest.mark.parametrize("ndmin", (0, 1, 2)) +@pytest.mark.parametrize("usecols", [None, (1, 2, 3)]) +def test_warn_on_no_data(data, ndmin, usecols): + """Check that a UserWarning is emitted when no data is read from input.""" + if usecols is not None: + expected_shape = (0, 3) + elif ndmin == 2: + expected_shape = (0, 1) # guess a single column?! 
+    else:
+        expected_shape = (0,)
+
+    txt = StringIO(data)
+    with pytest.warns(UserWarning, match="input contained no data"):
+        res = np.loadtxt(txt, ndmin=ndmin, usecols=usecols)
+    assert res.shape == expected_shape
+
+    with NamedTemporaryFile(mode="w") as fh:
+        fh.write(data)
+        fh.seek(0)
+        with pytest.warns(UserWarning, match="input contained no data"):
+            res = np.loadtxt(fh.name, ndmin=ndmin, usecols=usecols)
+        assert res.shape == expected_shape
+
+@pytest.mark.parametrize("skiprows", (2, 3))
+def test_warn_on_skipped_data(skiprows):
+    data = "1 2 3\n4 5 6"
+    txt = StringIO(data)
+    with pytest.warns(UserWarning, match="input contained no data"):
+        np.loadtxt(txt, skiprows=skiprows)
+
+
+@pytest.mark.parametrize("dtype",
+        list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"])
+@pytest.mark.parametrize("swap", [True, False])
+def test_byteswapping_and_unaligned(dtype, swap):
+    data = ["x,1\n"]  # no need for complicated data
+    dtype = np.dtype(dtype)
+    if swap:
+        dtype = dtype.newbyteorder()
+    full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False)
+    # The above ensures that the interesting "b" field is unaligned:
+    assert full_dt.fields["b"][1] == 1
+    res = np.loadtxt(data, dtype=full_dt, delimiter=",")
+    assert res["b"] == dtype.type(1)
+
+
+@pytest.mark.parametrize("dtype",
+        np.typecodes["AllInteger"] + "efdFD" + "?")
+def test_unicode_whitespace_stripping(dtype):
+    # Test that all numeric types (and bool) strip whitespace correctly
+    # \u202F is a narrow no-break space, `\n` is just a whitespace if quoted.
+    # Currently, skip float128 as it did not always support this and has no
+    # "custom" parsing:
+    txt = StringIO(' 3 ,"\u202F2\n"')
+    res = np.loadtxt(txt, dtype=dtype, delimiter=",", quotechar='"')
+    assert_array_equal(res, np.array([3, 2]).astype(dtype))
+
+
+@pytest.mark.parametrize("dtype", "FD")
+def test_unicode_whitespace_stripping_complex(dtype):
+    # Complex has a few extra cases since it has two components and parentheses
+    line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n"
+    data = [line, line.replace(" ", "\u202F")]
+    res = np.loadtxt(data, dtype=dtype, delimiter=',')
+    assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2))
+
+
+@pytest.mark.parametrize("dtype", "FD")
+@pytest.mark.parametrize("field",
+        ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"])
+def test_bad_complex(dtype, field):
+    with pytest.raises(ValueError):
+        np.loadtxt([field + "\n"], dtype=dtype, delimiter=",")
+
+
+@pytest.mark.parametrize("dtype",
+        np.typecodes["AllInteger"] + "efgdFDG" + "?")
+def test_nul_character_error(dtype):
+    # Test that a \0 character is correctly recognized as an error even if
+    # what comes before is valid (not everything gets parsed internally).
+    if dtype.lower() == "g":
+        pytest.xfail("longdouble/clongdouble assignment may misbehave.")
+    with pytest.raises(ValueError):
+        np.loadtxt(["1\000"], dtype=dtype, delimiter=",", quotechar='"')
+
+
+@pytest.mark.parametrize("dtype",
+        np.typecodes["AllInteger"] + "efgdFDG" + "?")
+def test_no_thousands_support(dtype):
+    # Mainly to document behaviour, Python supports thousands like 1_1.
+    # (e and G may end up using different conversion and support it, this is
+    # a bug but happens...)
+ if dtype == "e": + pytest.skip("half assignment currently uses Python float converter") + if dtype in "eG": + pytest.xfail("clongdouble assignment is buggy (uses `complex` always).") + + assert int("1_1") == float("1_1") == complex("1_1") == 11 + with pytest.raises(ValueError): + np.loadtxt(["1_1\n"], dtype=dtype) + + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2\n,3\n"], + ["1,2\n", "2\r,3\n"]]) +def test_bad_newline_in_iterator(data): + # In NumPy <=1.22 this was accepted, because newlines were completely + # ignored when the input was an iterable. This could be changed, but right + # now, we raise an error. + with pytest.raises(ValueError, + match="Found an unquoted embedded newline within a single line"): + np.loadtxt(data, delimiter=",") + + +@pytest.mark.parametrize("data", [ + ["1,2\n", "2,3\r\n"], # a universal newline + ["1,2\n", "'2\n',3\n"], # a quoted newline + ["1,2\n", "'2\r',3\n"], + ["1,2\n", "'2\r\n',3\n"], +]) +def test_good_newline_in_iterator(data): + # The quoted newlines will be untransformed here, but are just whitespace. + res = np.loadtxt(data, delimiter=",", quotechar="'") + assert_array_equal(res, [[1., 2.], [2., 3.]]) + + +@pytest.mark.parametrize("newline", ["\n", "\r", "\r\n"]) +def test_universal_newlines_quoted(newline): + # Check that universal newline support within the tokenizer is not applied + # to quoted fields. (note that lines must end in newline or quoted + # fields will not include a newline at all) + data = ['1,"2\n"\n', '3,"4\n', '1"\n'] + data = [row.replace("\n", newline) for row in data] + res = np.loadtxt(data, dtype=object, delimiter=",", quotechar='"') + assert_array_equal(res, [['1', f'2{newline}'], ['3', f'4{newline}1']]) + + +def test_null_character(): + # Basic tests to check that the NUL character is not special: + res = np.loadtxt(["1\0002\0003\n", "4\0005\0006"], delimiter="\000") + assert_array_equal(res, [[1, 2, 3], [4, 5, 6]]) + + # Also not as part of a field (avoid unicode/arrays as unicode strips \0) + res = np.loadtxt(["1\000,2\000,3\n", "4\000,5\000,6"], + delimiter=",", dtype=object) + assert res.tolist() == [["1\000", "2\000", "3"], ["4\000", "5\000", "6"]] + + +def test_iterator_fails_getting_next_line(): + class BadSequence: + def __len__(self): + return 100 + + def __getitem__(self, item): + if item == 50: + raise RuntimeError("Bad things happened!") + return f"{item}, {item+1}" + + with pytest.raises(RuntimeError, match="Bad things happened!"): + np.loadtxt(BadSequence(), dtype=int, delimiter=",") + + +class TestCReaderUnitTests: + # These are internal tests for path that should not be possible to hit + # unless things go very very wrong somewhere. + def test_not_an_filelike(self): + with pytest.raises(AttributeError, match=".*read"): + np.core._multiarray_umath._load_from_filelike( + object(), dtype=np.dtype("i"), filelike=True) + + def test_filelike_read_fails(self): + # Can only be reached if loadtxt opens the file, so it is hard to do + # via the public interface (although maybe not impossible considering + # the current "DataClass" backing). 
+        class BadFileLike:
+            counter = 0
+            def read(self, size):
+                self.counter += 1
+                if self.counter > 20:
+                    raise RuntimeError("Bad bad bad!")
+                return "1,2,3\n"
+
+        with pytest.raises(RuntimeError, match="Bad bad bad!"):
+            np.core._multiarray_umath._load_from_filelike(
+                BadFileLike(), dtype=np.dtype("i"), filelike=True)
+
+    def test_filelike_bad_read(self):
+        # Can only be reached if loadtxt opens the file, so it is hard to do
+        # via the public interface (although maybe not impossible considering
+        # the current "DataClass" backing).
+        class BadFileLike:
+            counter = 0
+            def read(self, size):
+                return 1234  # not a string!
+
+        with pytest.raises(TypeError,
+                match="non-string returned while reading data"):
+            np.core._multiarray_umath._load_from_filelike(
+                BadFileLike(), dtype=np.dtype("i"), filelike=True)
+
+    def test_not_an_iter(self):
+        with pytest.raises(TypeError,
+                match="error reading from object, expected an iterable"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype=np.dtype("i"), filelike=False)
+
+    def test_bad_type(self):
+        with pytest.raises(TypeError, match="internal error: dtype must"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype="i", filelike=False)
+
+    def test_bad_encoding(self):
+        with pytest.raises(TypeError, match="encoding must be a unicode"):
+            np.core._multiarray_umath._load_from_filelike(
+                object(), dtype=np.dtype("i"), filelike=False, encoding=123)
+
+    @pytest.mark.parametrize("newline", ["\r", "\n", "\r\n"])
+    def test_manual_universal_newlines(self, newline):
+        # This is currently not available to users, because we should always
+        # open files with universal newlines enabled `newline=None`.
+        # (And reading from an iterator uses slightly different code paths.)
+        # We have no real support for `newline="\r"` or `newline="\n"` as the
+        # user cannot specify those options.
+ data = StringIO('0\n1\n"2\n"\n3\n4 #\n'.replace("\n", newline), + newline="") + + res = np.core._multiarray_umath._load_from_filelike( + data, dtype=np.dtype("U10"), filelike=True, + quote='"', comment="#", skiplines=1) + assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "]) From 1e6b72b42292e62c1c86e4f77e30324e43aaa218 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 19:30:14 -0600 Subject: [PATCH 50/70] TST,STY: Add small additional tests for converters/usecols Also fix style a bit to silence linter (hopefully), removes some black style, but I am not too opinionated about that :) --- numpy/lib/tests/test_io.py | 2 +- numpy/lib/tests/test_loadtxt.py | 45 +++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index f142972b2024..37404cb4175b 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -1212,7 +1212,7 @@ def test_max_rows_larger(self): # Same as above, but do not skip any lines: (0, ["-1,0\n", "1,2\n", "\n", "3,4\n"]), (0, ["-1,0", "1,2", "", "3,4"]), - (0, StringIO("-1,0\n1,2\n\n3,4")),]) + (0, StringIO("-1,0\n1,2\n\n3,4"))]) def test_max_rows_empty_lines(self, skip, data): with pytest.warns(UserWarning, match=f"Input line 3.*max_rows={3-skip}"): diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index b8fd9a79686c..8e3adfeff4ed 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -232,6 +232,10 @@ def test_converters_negative_indices_with_usecols(): ) assert_equal(res, expected) + # Second test with variable number of rows: + res = np.loadtxt(StringIO('''0,1,2\n0,1,2,3,4'''), delimiter=",", + usecols=[0, -1], converters={-1: (lambda x: -1)}) + assert_array_equal(res, [[0, -1], [0, -1]]) def test_ragged_usecols(): # usecols, and negative ones, work even with varying number of columns. @@ -294,16 +298,17 @@ def test_converter_with_structured_dtype(): def test_converter_with_unicode_dtype(): """ - With the default 'bytes' encoding, tokens are encoded prior to being passed - to the converter. This means that the output of the converter may be bytes - instead of unicode as expected by `read_rows`. + With the default 'bytes' encoding, tokens are encoded prior to being + passed to the converter. This means that the output of the converter may + be bytes instead of unicode as expected by `read_rows`. This test checks that outputs from the above scenario are properly decoded prior to parsing by `read_rows`. """ txt = StringIO('abc,def\nrst,xyz') conv = bytes.upper - res = np.loadtxt(txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") + res = np.loadtxt( + txt, dtype=np.dtype("U3"), converters=conv, delimiter=",") expected = np.array([['ABC', 'DEF'], ['RST', 'XYZ']]) assert_equal(res, expected) @@ -349,9 +354,9 @@ def test_string_no_length_given(given_dtype, expected_dtype): def test_float_conversion(): """ - Some tests that the conversion to float64 works as accurately as the Python - built-in `float` function. In a naive version of the float parser, these - strings resulted in values that were off by an ULP or two. + Some tests that the conversion to float64 works as accurately as the + Python built-in `float` function. In a naive version of the float parser, + these strings resulted in values that were off by an ULP or two. 
""" strings = [ '0.9999999999999999', @@ -437,16 +442,15 @@ def gen(): yield entry with pytest.raises( - TypeError, match=r"non-string returned while reading data" - ): + TypeError, match=r"non-string returned while reading data"): np.loadtxt(gen(), dtype="i, i", delimiter=",") @pytest.mark.skipif(not HAS_REFCOUNT, reason="Python lacks refcounts") def test_object_cleanup_on_read_error(): sentinel = object() - already_read = 0 + def conv(x): nonlocal already_read if already_read > 4999: @@ -494,9 +498,8 @@ def test_converters_dict_raises_non_col_key(bad_col_ind): def test_converters_dict_raises_val_not_callable(): - with pytest.raises( - TypeError, match="values of the converters dictionary must be callable" - ): + with pytest.raises(TypeError, + match="values of the converters dictionary must be callable"): np.loadtxt(StringIO("1 2\n3 4"), converters={0: 1}) @@ -664,7 +667,8 @@ def test_unicode_whitespace_stripping(dtype): @pytest.mark.parametrize("dtype", "FD") def test_unicode_whitespace_stripping_complex(dtype): - # Complex has a few extra cases since it has two components and parentheses + # Complex has a few extra cases since it has two components and + # parentheses line = " 1 , 2+3j , ( 4+5j ), ( 6+-7j ) , 8j , ( 9j ) \n" data = [line, line.replace(" ", "\u202F")] res = np.loadtxt(data, dtype=dtype, delimiter=',') @@ -699,7 +703,7 @@ def test_no_thousands_support(dtype): if dtype == "e": pytest.skip("half assignment currently uses Python float converter") if dtype in "eG": - pytest.xfail("clongdouble assignment is buggy (uses `complex` always).") + pytest.xfail("clongdouble assignment is buggy (uses `complex`?).") assert int("1_1") == float("1_1") == complex("1_1") == 11 with pytest.raises(ValueError): @@ -713,8 +717,8 @@ def test_bad_newline_in_iterator(data): # In NumPy <=1.22 this was accepted, because newlines were completely # ignored when the input was an iterable. This could be changed, but right # now, we raise an error. - with pytest.raises(ValueError, - match="Found an unquoted embedded newline within a single line"): + msg = "Found an unquoted embedded newline within a single line" + with pytest.raises(ValueError, match=msg): np.loadtxt(data, delimiter=",") @@ -780,6 +784,7 @@ def test_filelike_read_fails(self): # the current "DataClass" backing). class BadFileLike: counter = 0 + def read(self, size): self.counter += 1 if self.counter > 20: @@ -794,19 +799,21 @@ def test_filelike_bad_read(self): # Can only be reached if loadtxt opens the file, so it is hard to do # via the public interface (although maybe not impossible considering # the current "DataClass" backing). + class BadFileLike: counter = 0 + def read(self, size): return 1234 # not a string! 
with pytest.raises(TypeError, - match="non-string returned while reading data"): + match="non-string returned while reading data"): np.core._multiarray_umath._load_from_filelike( BadFileLike(), dtype=np.dtype("i"), filelike=True) def test_not_an_iter(self): with pytest.raises(TypeError, - match="error reading from object, expected an iterable"): + match="error reading from object, expected an iterable"): np.core._multiarray_umath._load_from_filelike( object(), dtype=np.dtype("i"), filelike=False) From 14cd1bb2b114076c5ef15c5572317107e3a25e71 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 23:01:39 -0600 Subject: [PATCH 51/70] DOC: Remove outdated loadtxt TODOs from code --- numpy/core/src/multiarray/textreading/tokenize.c.src | 9 +++------ numpy/lib/npyio.py | 8 +++++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index 752eee2dd75e..153ce6bfc131 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -322,12 +322,9 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) /* * Finished line, do not read anymore (also do not eat \n). * If we are in a quoted field and the "line" does not end with - * a newline, the quoted field will be missing it right now. - * (i.e. `np.loadtxt(['"a', 'b"'], dtype="S2")` reads "ab") - * TODO: We should possibly insert a '\n' character when inside - * a quoted field the and '\n' character is not included - * in the string. `FileLike.readline()` does ensure it - * is included. + * a newline, the quoted field will not have it either. + * I.e. `np.loadtxt(['"a', 'b"'], dtype="S2", quotechar='"')` + * reads "ab". This matches `next(csv.reader(['"a', 'b"']))`. */ break; } diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index a23150832321..3ad5d5ff86ed 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -965,7 +965,6 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', try: if isinstance(fname, os.PathLike): fname = os.fspath(fname) - # TODO: loadtxt actually uses `file + ''` to decide this?! if isinstance(fname, str): fh = np.lib._datasource.open(fname, 'rt', encoding=encoding) if encoding is None: @@ -1047,6 +1046,10 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', else: arr = np.concatenate(chunks, axis=0) + # NOTE: ndmin works as advertised for structured dtypes, but normally + # these would return a 1D result plus the structured dimension, + # so ndmin=2 adds a third dimension even when no squeezing occurs. + # A `squeeze=False` could be a better solution (pandas uses squeeze). arr = _ensure_ndmin_ndarray(arr, ndmin=ndmin) if arr.shape: @@ -1058,8 +1061,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', ) if unpack: - # Handle unpack like np.loadtxt. - # XXX Check interaction with ndmin! + # Unpack structured dtypes if requested: dt = arr.dtype if dt.names is not None: # For structured arrays, return an array for each field. 
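(As an illustrative, doctest-style sketch of the quoted-field behavior the
updated tokenizer comment documents -- an aside, not part of the diff above:

    >>> import numpy as np
    >>> np.loadtxt(['"a', 'b"'], dtype="S2", quotechar='"')
    array(b'ab', dtype='|S2')

which matches the csv module, where `next(csv.reader(['"a', 'b"']))` also
yields `['ab']`.)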
From 4f3b3d2e225d960ef6e8fdb63efc5c75b02b3cbb Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 14 Jan 2022 23:03:35 -0600 Subject: [PATCH 52/70] BUG: Fix loadtxt no data warning stacklevel --- numpy/lib/npyio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 3ad5d5ff86ed..d22d835060c7 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1057,7 +1057,7 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', warnings.warn( f'loadtxt: input contained no data: "{fname}"', category=UserWarning, - stacklevel=2 + stacklevel=3 ) if unpack: From 3e0d4329e27eca7c63fd894020091bfae5091560 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Sat, 15 Jan 2022 11:56:57 -0600 Subject: [PATCH 53/70] TST,BUG: Fortify byteswapping tests and make a small fix I had a lingering feeling I should double check this, turns out that feeling was right ;). (Fixes up the tokenizer doc a bit.) --- .../src/multiarray/textreading/conversions.c | 9 ++++---- .../src/multiarray/textreading/tokenize.c.src | 23 +++++++++++-------- numpy/lib/tests/test_loadtxt.py | 19 ++++++++++++--- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/conversions.c b/numpy/core/src/multiarray/textreading/conversions.c index 2570e643d899..11f4210f7283 100644 --- a/numpy/core/src/multiarray/textreading/conversions.c +++ b/numpy/core/src/multiarray/textreading/conversions.c @@ -313,20 +313,19 @@ to_unicode(PyArray_Descr *descr, const Py_UCS4 *str, const Py_UCS4 *end, char *dataptr, parser_config *NPY_UNUSED(unused)) { - size_t length = descr->elsize / 4; + int length = descr->elsize / 4; - if (length <= (size_t)(end - str)) { + if (length <= end - str) { memcpy(dataptr, str, length * 4); } else { size_t given_len = end - str; memcpy(dataptr, str, given_len * 4); - memset(dataptr + given_len * 4, '\0', (length -given_len) * 4); + memset(dataptr + given_len * 4, '\0', (length - given_len) * 4); } if (!PyArray_ISNBO(descr->byteorder)) { - /* manual byteswap, unicode requires the array to be passed... */ - for (int i = 0; i < descr->elsize; i++) { + for (int i = 0; i < length; i++) { npy_bswap4_unaligned(dataptr); dataptr += 4; } diff --git a/numpy/core/src/multiarray/textreading/tokenize.c.src b/numpy/core/src/multiarray/textreading/tokenize.c.src index 153ce6bfc131..6ddba334529c 100644 --- a/numpy/core/src/multiarray/textreading/tokenize.c.src +++ b/numpy/core/src/multiarray/textreading/tokenize.c.src @@ -275,20 +275,26 @@ tokenizer_core_@type@(tokenizer_state *ts, parser_config *const config) /* - * This version now always copies the full "row" (all tokens). This makes + * This tokenizer always copies the full "row" (all tokens). This makes * two things easier: * 1. It means that every word is guaranteed to be followed by a NUL character * (although it can include one as well). - * 2. In the usecols case we can sniff the first row easier by parsing it - * fully. + * 2. If usecols are used we can sniff the first row easier by parsing it + * fully. Further, usecols can be negative so we may not know which row we + * need up-front. * * The tokenizer could grow the ability to skip fields and check the - * maximum number of fields when known. + * maximum number of fields when known, it is unclear that this is worthwhile. * - * Unlike other tokenizers, this one tries to work in chunks and copies - * data to words only when it has to. 
The hope is that this makes multiple - * light-weight loops rather than a single heavy one, to allow e.g. quickly - * scanning for the end of a field. + * Unlike some tokenizers, this one tries to work in chunks and copies + * data in chunks as well. The hope is that this makes multiple light-weight + * loops rather than a single heavy one, to allow e.g. quickly scanning for the + * end of a field. Copying chunks also means we usually only check once per + * field whether the buffer is large enough. + * Different choices are possible, this one seems to work well, though. + * + * The core (main part) of the tokenizer is specialized for the three Python + * unicode flavors UCS1, UCS2, and UCS4 as a worthwhile optimization. */ NPY_NO_EXPORT int tokenize(stream *s, tokenizer_state *ts, parser_config *const config) @@ -301,7 +307,6 @@ tokenize(stream *s, tokenizer_state *ts, parser_config *const config) /* Reset to start of buffer */ ts->field_buffer_pos = 0; ts->num_fields = 0; - /* Add the first field */ while (1) { /* diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 8e3adfeff4ed..0ebcf12dc2e3 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -642,15 +642,28 @@ def test_warn_on_skipped_data(skiprows): list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"]) @pytest.mark.parametrize("swap", [True, False]) def test_byteswapping_and_unaligned(dtype, swap): - data = ["x,1\n"] # no need for complicated data + # Try to create "interesting" values within the valid unicode range: + byte_data = np.array([0x012345, 0x023456] * 8, dtype=np.uint32) dtype = np.dtype(dtype) + + # For (c)longdouble use double -> str -> longdouble to avoid round-tripping + # issues. (A bit convoluted, but necessary due to rounding.) + if dtype.type == np.longdouble: + value = np.longdouble(str(byte_data.view(np.double).item(0))) + elif dtype.type == np.clongdouble: + value = np.clongdouble(str(byte_data.view(np.double).item(0))) + else: + value = byte_data.view(dtype).item(0) + + data = [f"x,{value}\n"] if swap: dtype = dtype.newbyteorder() full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False) # The above ensures that the interesting "b" field is unaligned: assert full_dt.fields["b"][1] == 1 - res = np.loadtxt(data, dtype=full_dt, delimiter=",") - assert res["b"] == dtype.type(1) + res = np.loadtxt(data, dtype=full_dt, delimiter=",", encoding=None, + max_rows=1) # max-rows prevents over-allocation + assert res["b"] == value @pytest.mark.parametrize("dtype", From ecff02c30d774e3588535d85f9152ea6066d0b24 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 11 Jan 2022 17:26:50 -0800 Subject: [PATCH 54/70] Update and add converters examples. --- numpy/lib/npyio.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index d22d835060c7..87fa2e4e2577 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1204,6 +1204,29 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> y array([2., 4.]) + The `converters` argument is used to specify functions to preprocess the + text prior to parsing. `converters` can be a dictionary that maps + preprocessing functions to each column: + + >>> s = StringIO("1.618, 2.296\n3.141, 4.669\n") + >>> conv = { + ... 0: lambda x: np.floor(float(x)), # conversion fn for column 0 + ... 1: lambda x: np.ceil(float(x)), # conversion fn for column 1 + ... 
} + >>> np.loadtxt(s, delimiter=",", converters=conv) + array([[1., 3.], + [3., 5.]]) + + `converters` can be a callable instead of a dictionary, in which case it + is applied to all columns: + + >>> s = StringIO("0xDE 0xAD\n0xC0 0xDE") + >>> import functools + >>> conv = functools.partial(int, base=16) + >>> np.loadtxt(s, converters=conv) + array([[222., 173.], + [192., 222.]]) + This example shows how `converters` can be used to convert a field with a trailing minus sign into a negative number. @@ -1211,10 +1234,19 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, >>> def conv(fld): ... return -float(fld[:-1]) if fld.endswith(b'-') else float(fld) ... - >>> np.loadtxt(s, converters={0: conv, 1: conv}) + >>> np.loadtxt(s, converters=conv) array([[ 10.01, -31.25], [ 19.22, 64.31], [-17.57, 63.94]]) + + Note that with the default ``encoding="bytes"``, the inputs to the + converter function are latin-1 encoded byte strings. To deactivate the + implicit encoding prior to conversion, behavior use ``encoding=None`` + + >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94') + >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x) + >>> np.loadtxt(s, converters=conv, encoding=None) + """ if like is not None: From e15d85324a1f5641aaccafbe3cc87556e88ff0a3 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 11 Jan 2022 23:02:59 -0800 Subject: [PATCH 55/70] Add quotechar to examples. --- numpy/lib/npyio.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 87fa2e4e2577..62d00cfb9fb2 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1241,11 +1241,31 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, Note that with the default ``encoding="bytes"``, the inputs to the converter function are latin-1 encoded byte strings. To deactivate the - implicit encoding prior to conversion, behavior use ``encoding=None`` + implicit encoding prior to conversion, use ``encoding=None`` >>> s = StringIO('10.01 31.25-\n19.22 64.31\n17.57- 63.94') >>> conv = lambda x: -float(x[:-1]) if x.endswith('-') else float(x) >>> np.loadtxt(s, converters=conv, encoding=None) + array([[ 10.01, -31.25], + [ 19.22, 64.31], + [-17.57, 63.94]]) + + Support for quoted fields is enabled with the `quotechar` parameter. + Comment and delimiter characters are ignored when they appear within a + quoted item delineated by `quotechar`: + + >>> s = StringIO('"alpha, #42", 10.0\n"beta, #64", 2.0\n') + >>> dtype = np.dtype([("label", "U12"), ("value", float)]) + >>> np.loadtxt(s, dtype=dtype, delimiter=",", quotechar='"') + array([('alpha, #42', 10.), ('beta, #64', 2.)], + dtype=[('label', '>> s = StringIO('"Hello, my name is ""Monty""!"') + >>> np.loadtxt(s, dtype="U", delimiter=",", quotechar='"') + array('Hello, my name is "Monty"!', dtype=' Date: Wed, 19 Jan 2022 11:50:13 -0600 Subject: [PATCH 56/70] ENH: Give a clear error when control characters match/are newlines These never have a meaning (at best an implicit order meaning one is ignored), except theoretically in the `delimiter=None` case (or in strange cases with `\r` being a delimiter, but `\n` being the newline when a file is opened manually but not in universal newline mode). It seems more useful to just generally raise an error, since all of "features" are weird corner cases and likely surprising to users. 
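As a sketch of the new behavior (illustrative only; the exact wording is
defined by the diff below):

    >>> import numpy as np
    >>> np.loadtxt(["1,2\n"], delimiter=",", quotechar=",")
    Traceback (most recent call last):
        ...
    TypeError: control characters 'quotechar' and 'delimiter' are identical, ...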
---
 .../src/multiarray/textreading/readtext.c     | 84 +++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c
index 5d3613736136..93b00d18f0de 100644
--- a/numpy/core/src/multiarray/textreading/readtext.c
+++ b/numpy/core/src/multiarray/textreading/readtext.c
@@ -100,6 +100,84 @@ parse_control_character(PyObject *obj, Py_UCS4 *character)
 }
 
 
+/*
+ * A (somewhat verbose) check that none of the control characters match or
+ * are newlines.  Most of these combinations are completely fine, just weird
+ * or surprising.
+ * (I.e. there is an implicit priority for control characters, so if a comment
+ * matches a delimiter, it would just be a comment.)
+ * In theory some `delimiter=None` paths could have a "meaning", but let us
+ * assume that users are better off setting one of the control chars to `None`
+ * for clarity.
+ *
+ * This also checks that the control characters cannot be newlines.
+ */
+static int
+error_if_matching_control_characters(
+        Py_UCS4 delimiter, Py_UCS4 quote, Py_UCS4 comment)
+{
+    char *control_char1;
+    char *control_char2 = NULL;
+    if (comment != (Py_UCS4)-1) {
+        control_char1 = "comment";
+        if (comment == '\r' || comment == '\n') {
+            goto error;
+        }
+        else if (comment == quote) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+        else if (comment == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (quote != (Py_UCS4)-1) {
+        control_char1 = "quotechar";
+        if (quote == '\r' || quote == '\n') {
+            goto error;
+        }
+        else if (quote == delimiter) {
+            control_char2 = "delimiter";
+            goto error;
+        }
+    }
+    if (delimiter != (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (delimiter == '\r' || delimiter == '\n') {
+            goto error;
+        }
+    }
+    /* The above doesn't work with delimiter=None, which means "whitespace" */
+    if (delimiter == (Py_UCS4)-1) {
+        control_char1 = "delimiter";
+        if (Py_UNICODE_ISSPACE(comment)) {
+            control_char2 = "comment";
+            goto error;
+        }
+        else if (Py_UNICODE_ISSPACE(quote)) {
+            control_char2 = "quotechar";
+            goto error;
+        }
+    }
+    return 0;
+
+  error:
+    if (control_char2 != NULL) {
+        PyErr_Format(PyExc_TypeError,
+                "control characters '%s' and '%s' are identical, please set "
+                "one of them to `None` to indicate that it should not be used.",
+                control_char1, control_char2);
+    }
+    else {
+        PyErr_Format(PyExc_TypeError,
+                "control character '%s' cannot be a newline (`\\r` or `\\n`).",
+                control_char1);
+    }
+    return -1;
+}
+
+
 NPY_NO_EXPORT PyObject *
 _load_from_filelike(PyObject *NPY_UNUSED(mod),
         PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
@@ -148,6 +226,12 @@ _load_from_filelike(PyObject *NPY_UNUSED(mod),
         return NULL;
     }
 
+    /* Reject matching control characters; they rarely make sense anyway */
+    if (error_if_matching_control_characters(
+            pc.delimiter, pc.quote, pc.comment) < 0) {
+        return NULL;
+    }
+
     if (pc.delimiter == (Py_UCS4)-1) {
         pc.delimiter_is_whitespace = true;
         /* Ignore leading whitespace to match `string.split(None)` */

From 5d98d672a85421ecdc2d20c7f0a74003eaff004d Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 19 Jan 2022 14:10:10 -0600
Subject: [PATCH 57/70] TST: Skip unparsable field error tests on PyPy

PyPy has a small bug with error formatting, so these tests cause it to
crash. Simply skip the tests on old PyPy versions for now.
(Matti fixed the issue long ago, just waiting for a new PyPy release :)) --- numpy/core/src/multiarray/textreading/rows.c | 5 +++++ numpy/lib/tests/test_io.py | 6 ++++++ numpy/lib/tests/test_loadtxt.py | 16 +++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/numpy/core/src/multiarray/textreading/rows.c b/numpy/core/src/multiarray/textreading/rows.c index 2ca97606066f..e30ff835eaeb 100644 --- a/numpy/core/src/multiarray/textreading/rows.c +++ b/numpy/core/src/multiarray/textreading/rows.c @@ -389,6 +389,11 @@ read_rows(stream *s, } } + /* + * The following function calls represent the main "conversion" + * step, i.e. parsing the unicode string for each field and storing + * the result in the array. + */ int parser_res; Py_UCS4 *str = ts.field_buffer + fields[col].offset; Py_UCS4 *end = ts.field_buffer + fields[col + 1].offset - 1; diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 37404cb4175b..a2758123b602 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -779,6 +779,8 @@ def test_comments_multiple(self): a = np.array([[1, 2, 3], [4, 5, 6]], int) assert_array_equal(x, a) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_comments_multi_chars(self): c = TextIO() c.write('/* comment\n1,2,3,5\n') @@ -995,6 +997,8 @@ def test_from_float_hex(self): c, dtype=dt, converters=float.fromhex, encoding="latin1") assert_equal(res, tgt, err_msg="%s" % dt) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_default_float_converter_no_default_hex_conversion(self): """ Ensure that fromhex is only used for values with the correct prefix and @@ -1005,6 +1009,8 @@ def test_default_float_converter_no_default_hex_conversion(self): match=".*convert string 'a' to float64 at row 0, column 1"): np.loadtxt(c) + @pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_default_float_converter_exception(self): """ Ensure that the exception message raised during failed floating point diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 0ebcf12dc2e3..df88cef9550a 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -11,7 +11,7 @@ import numpy as np from numpy.ma.testutils import assert_equal -from numpy.testing import assert_array_equal, HAS_REFCOUNT +from numpy.testing import assert_array_equal, HAS_REFCOUNT, IS_PYPY def test_scientific_notation(): @@ -200,6 +200,8 @@ def test_maxrows_no_blank_lines(dtype): assert_equal(res, np.array([["1.5", "2.5"], ["3.0", "4.0"]], dtype=dtype)) +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") @pytest.mark.parametrize("dtype", (np.dtype("f8"), np.dtype("i2"))) def test_exception_message_bad_values(dtype): txt = StringIO("1,2\n3,XXX\n5,6") @@ -381,6 +383,8 @@ def test_bool(): assert_array_equal(res.view(np.uint8), [[1, 0], [1, 1]]) +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) def test_integer_signs(dtype): dtype = np.dtype(dtype) @@ -396,6 +400,8 @@ def test_integer_signs(dtype): np.loadtxt([f"{sign}2\n"], dtype=dtype) +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") 
@pytest.mark.parametrize("dtype", np.typecodes["AllInteger"]) def test_implicit_cast_float_to_int_fails(dtype): txt = StringIO("1.0, 2.1, 3.7\n4, 5, 6") @@ -466,6 +472,8 @@ def conv(x): assert sys.getrefcount(sentinel) == 2 +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_character_not_bytes_compatible(): """Test exception when a character cannot be encoded as 'S'.""" data = StringIO("–") # == \u2013 @@ -688,6 +696,8 @@ def test_unicode_whitespace_stripping_complex(dtype): assert_array_equal(res, np.array([[1, 2+3j, 4+5j, 6-7j, 8j, 9j]] * 2)) +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") @pytest.mark.parametrize("dtype", "FD") @pytest.mark.parametrize("field", ["1 +2j", "1+ 2j", "1+2 j", "1+-+3", "(1j", "(1", "(1+2j", "1+2j)"]) @@ -696,6 +706,8 @@ def test_bad_complex(dtype, field): np.loadtxt([field + "\n"], dtype=dtype, delimiter=",") +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"] + "efgdFDG" + "?") def test_nul_character_error(dtype): @@ -707,6 +719,8 @@ def test_nul_character_error(dtype): np.loadtxt(["1\000"], dtype=dtype, delimiter=",", quotechar='"') +@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"] + "efgdFDG" + "?") def test_no_thousands_support(dtype): From 9ef1a61ced90dbfd8fdf345fc748e1041ad4e017 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 19 Jan 2022 14:46:22 -0600 Subject: [PATCH 58/70] TST: Use hand-picked values for byte-swapping tests Longdouble is a source of problems here especially (mainly due to it sometimes using double in the background, or maybe just buggy implementations). Together with strings that correctly parsed do not roundtrip (if printed using less precision), things just do not work out... This fixes it, and is simpler/clearer anyway. --- numpy/lib/tests/test_loadtxt.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index df88cef9550a..18a814b69d43 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -646,23 +646,25 @@ def test_warn_on_skipped_data(skiprows): np.loadtxt(txt, skiprows=skiprows) -@pytest.mark.parametrize("dtype", - list(np.typecodes["AllInteger"] + np.typecodes["AllFloat"]) + ["U2"]) +@pytest.mark.parametrize(["dtype", "value"], [ + ("i2", 0x0001), ("u2", 0x0001), + ("i4", 0x00010203), ("u4", 0x00010203), + ("i8", 0x0001020304050607), ("u8", 0x0001020304050607), + # The following values are constructed to lead to unique bytes: + ("float16", 3.07e-05), + ("float32", 9.2557e-41), ("complex64", 9.2557e-41+2.8622554e-29j), + ("float64", -1.758571353180402e-24), + ("complex128", 5.406409232372729e-29-1.758571353180402e-24j), + # Use integer values that fit into double. Everything else leads to + # problems due to longdoubles going via double and decimal strings + # causing rounding errors. 
+    ("longdouble", 0x01020304050607),
+    ("clongdouble", 0x01020304050607 + (0x00121314151617 * 1j)),
+    ("U2", "\U00010203\U000a0b0c")])
 @pytest.mark.parametrize("swap", [True, False])
-def test_byteswapping_and_unaligned(dtype, swap):
+def test_byteswapping_and_unaligned(dtype, value, swap):
     # Try to create "interesting" values within the valid unicode range:
-    byte_data = np.array([0x012345, 0x023456] * 8, dtype=np.uint32)
     dtype = np.dtype(dtype)
-
-    # For (c)longdouble use double -> str -> longdouble to avoid round-tripping
-    # issues. (A bit convoluted, but necessary due to rounding.)
-    if dtype.type == np.longdouble:
-        value = np.longdouble(str(byte_data.view(np.double).item(0)))
-    elif dtype.type == np.clongdouble:
-        value = np.clongdouble(str(byte_data.view(np.double).item(0)))
-    else:
-        value = byte_data.view(dtype).item(0)
-
     data = [f"x,{value}\n"]
     if swap:
         dtype = dtype.newbyteorder()
@@ -671,7 +673,7 @@
     assert full_dt.fields["b"][1] == 1
     res = np.loadtxt(data, dtype=full_dt, delimiter=",", encoding=None,
                      max_rows=1)  # max-rows prevents over-allocation
-    assert res["b"] == value
+    assert res["b"] == dtype.type(value)

From dfc898913404533d54bf0000d1d17ff92b6ae0e4 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 19 Jan 2022 15:52:03 -0600
Subject: [PATCH 59/70] TST: Catch two more errors that run into the PyPy issue

---
 numpy/lib/tests/test_loadtxt.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 18a814b69d43..366210a0f656 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -491,6 +491,8 @@ def test_invalid_converter(conv):
         np.loadtxt(StringIO("1 2\n3 4"), converters=conv)


+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+                    reason="PyPy bug in error formatting")
 def test_converters_dict_raises_non_integer_key():
     with pytest.raises(TypeError, match="keys of the converters dict"):
         np.loadtxt(StringIO("1 2\n3 4"), converters={"a": int})
@@ -541,6 +543,8 @@ def test_quote_support_default():
     assert_array_equal(res, expected)


+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8),
+                    reason="PyPy bug in error formatting")
 def test_quotechar_multichar_error():
     txt = StringIO("1,2\n3,4")
     msg = r".*must be a single unicode character or None"

From 763a3d4878671d383ba8aa573af90fee125efff4 Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Wed, 19 Jan 2022 16:49:56 -0600
Subject: [PATCH 60/70] TST: Use repr in byteswapping tests

The `str` values of those weird values used for longdouble are
truncated by PyPy's complex `str` output. That is probably fine, since
PyPy's `repr` does the right thing and will not truncate.
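As a sketch of the round-trip property this relies on (plain Python
semantics, reusing one of the complex values from the parametrization
above; illustrative only, not part of the change itself):

    # repr() of a Python float/complex prints enough digits to
    # round-trip exactly, so the test can hand loadtxt the repr string
    # and compare the parsed result against the original value:
    value = 5.406409232372729e-29 - 1.758571353180402e-24j
    assert complex(repr(value)) == value
    # str() may print fewer digits (PyPy truncates complex here), in
    # which case complex(str(value)) need not compare equal.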
--- numpy/lib/tests/test_loadtxt.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 366210a0f656..c5a14ed46bb9 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -658,18 +658,20 @@ def test_warn_on_skipped_data(skiprows): ("float16", 3.07e-05), ("float32", 9.2557e-41), ("complex64", 9.2557e-41+2.8622554e-29j), ("float64", -1.758571353180402e-24), - ("complex128", 5.406409232372729e-29-1.758571353180402e-24j), + # Here and below, the repr side-steps a small loss of precision in + # complex `str` in PyPy (which is probably fine, as repr works): + ("complex128", repr(5.406409232372729e-29-1.758571353180402e-24j)), # Use integer values that fit into double. Everything else leads to # problems due to longdoubles going via double and decimal strings # causing rounding errors. ("longdouble", 0x01020304050607), - ("clongdouble", 0x01020304050607 + (0x00121314151617 * 1j)), + ("clongdouble", repr(0x01020304050607 + (0x00121314151617 * 1j))), ("U2", "\U00010203\U000a0b0c")]) @pytest.mark.parametrize("swap", [True, False]) def test_byteswapping_and_unaligned(dtype, value, swap): # Try to create "interesting" values within the valid unicode range: dtype = np.dtype(dtype) - data = [f"x,{value}\n"] + data = [f"x,{value}\n"] # repr as PyPy `str` truncates some if swap: dtype = dtype.newbyteorder() full_dt = np.dtype([("a", "S1"), ("b", dtype)], align=False) From b335431699f86ab523dc6dba2c91efc799f4372b Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Fri, 28 Jan 2022 09:17:25 -0800 Subject: [PATCH 61/70] TST: Some tests for control character collisions. Adds some tests for the behavior of control characters, e.g. comments, delimiter and quotechar, when they have the same value. At this stage, these tests are more to frame the discussion about what the behavior should be, not to test what it currently is. I personally think raising an exception is correct for most of these situations, though it's worth noting that np.loadtxt currently doesn't for most of these corner cases (and seems to randomly assign precedence to delimiter over comments or vice versa depending on the values). 
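To make the ambiguity concrete, consider the delimiter/comment case
(the same call the new test makes, assuming numpy as np and io.StringIO
as imported in the test file; the error text comes from the readtext.c
change in this patch):

    # With delimiter="," and comments=",", the line "1, 2, 3" is
    # ambiguous: three fields, or the single field "1" followed by a
    # comment? Rather than silently picking a precedence, the reader
    # now raises:
    np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")
    # -> TypeError: The values for control characters ... are
    #    incompatible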
--- .../src/multiarray/textreading/readtext.c | 4 +- numpy/lib/npyio.py | 13 ++--- numpy/lib/tests/test_loadtxt.py | 50 +++++++++++++++++++ 3 files changed, 59 insertions(+), 8 deletions(-) diff --git a/numpy/core/src/multiarray/textreading/readtext.c b/numpy/core/src/multiarray/textreading/readtext.c index 93b00d18f0de..7af5ee891288 100644 --- a/numpy/core/src/multiarray/textreading/readtext.c +++ b/numpy/core/src/multiarray/textreading/readtext.c @@ -165,8 +165,8 @@ error_if_matching_control_characters( error: if (control_char2 != NULL) { PyErr_Format(PyExc_TypeError, - "control characters '%s' and '%s' are identical, please set one" - "of them to `None` to indicate that it should not be used.", + "The values for control characters '%s' and '%s' are " + "incompatible", control_char1, control_char2); } else { diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 62d00cfb9fb2..be313d104d06 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -920,12 +920,6 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', if comment is None: comments = None - elif isinstance(comment, str): - if len(comment) > 1: # length of 0 is rejected later - comments = (comment,) - comment = None - else: - comments = None else: # assume comments are a sequence of strings comments = tuple(comment) @@ -938,6 +932,13 @@ def _read(fname, *, delimiter=',', comment='#', quote='"', if isinstance(comments[0], str) and len(comments[0]) == 1: comment = comments[0] comments = None + else: + # Input validation if there are multiple comment characters + if delimiter in comments: + raise TypeError( + f"Comment characters '{comments}' cannot include the " + f"delimiter '{delimiter}'" + ) # comment is now either a 1 or 0 character string or a tuple: if comments is not None: diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index c5a14ed46bb9..2038bfc859ed 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -876,3 +876,53 @@ def test_manual_universal_newlines(self, newline): data, dtype=np.dtype("U10"), filelike=True, quote='"', comment="#", skiplines=1) assert_array_equal(res[:, 0], ["1", f"2{newline}", "3", "4 "]) + + +def test_delimiter_comment_collision_raises(): + with pytest.raises(TypeError, match="control characters.*are identical"): + np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",") + + +def test_delimiter_quotechar_collision_raises(): + with pytest.raises(TypeError, match="control characters.*are identical"): + np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",") + + +def test_comment_quotechar_collision_raises(): + with pytest.raises(TypeError, match="control characters.*are identical"): + np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#") + + +def test_delimiter_and_multiple_comments_collision_raises(): + with pytest.raises( + TypeError, match="Comment characters.*cannot include the delimiter" + ): + np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=["#", ","]) + + +@pytest.mark.parametrize( + "ws", + ( + " ", # space + "\t", # tab + "\u2003", # em + "\u00A0", # non-break + "\u3000", # ideographic space + ) +) +def test_collision_with_default_delimiter_raises(ws): + with pytest.raises(TypeError, match="control characters.*are identical"): + np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), comments=ws) + with pytest.raises(TypeError, match="control characters.*are identical"): + np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), quotechar=ws) + + +@pytest.mark.parametrize("nl", ("\n", "\r")) +def 
test_control_character_newline_raises(nl): + txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}") + with pytest.raises(TypeError, match="control character.*cannot be a newline"): + np.loadtxt(txt, delimiter=nl) + with pytest.raises(TypeError, match="control character.*cannot be a newline"): + np.loadtxt(txt, comments=nl) + with pytest.raises(TypeError, match="control character.*cannot be a newline"): + np.loadtxt(txt, quotechar=nl) From 057632782f0c26572ef7bafa195b513d758c34f3 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Fri, 28 Jan 2022 09:09:43 -0800 Subject: [PATCH 62/70] Add test for datetime parametric unit discovery. --- numpy/lib/tests/test_loadtxt.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 2038bfc859ed..5870915ae6d2 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -6,7 +6,7 @@ import sys import pytest -from tempfile import NamedTemporaryFile +from tempfile import NamedTemporaryFile, mkstemp from io import StringIO import numpy as np @@ -926,3 +926,25 @@ def test_control_character_newline_raises(nl): np.loadtxt(txt, comments=nl) with pytest.raises(TypeError, match="control character.*cannot be a newline"): np.loadtxt(txt, quotechar=nl) + + +def test_datetime_parametric_unit_discovery(): + """Check that the correct unit (e.g. month, day, second) is discovered from + the data when a user specifies a unitless datetime.""" + # Unit should be "D" (days) due to last entry + data = ["2012-03"] * 50000 + ["2013-01-15"] + expected = np.array(data, dtype="M8[D]") + + # file-like path + txt = StringIO("\n".join(data)) + a = np.loadtxt(txt, dtype="M8") + assert a.dtype == expected.dtype + assert_equal(a, expected) + + # file-obj path + fd, fname = mkstemp() + with open(fname, "w") as fh: + fh.write("\n".join(data)) + a = np.loadtxt(fname, dtype="M8") + assert a.dtype == expected.dtype + assert_equal(a, expected) From 370792b3929aa9c66403c9509f08cb7921347352 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Fri, 28 Jan 2022 11:41:46 -0800 Subject: [PATCH 63/70] Add test for unicode, parametrize for chunksize. --- numpy/lib/tests/test_loadtxt.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 5870915ae6d2..5a34267ed28d 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -928,16 +928,26 @@ def test_control_character_newline_raises(nl): np.loadtxt(txt, quotechar=nl) -def test_datetime_parametric_unit_discovery(): +@pytest.mark.parametrize( + ("generic_data", "long_datum", "unitless_dtype", "expected_dtype"), + [ + ("2012-03", "2013-01-15", "M8", "M8[D]"), # Datetimes + ("spam-a-lot", "tis_but_a_scratch", "U", "U17"), # str + ], +) +@pytest.mark.parametrize("nrows", (10, 50000, 60000)) # lt, eq, gt chunksize +def test_datetime_parametric_unit_discovery( + generic_data, long_datum, unitless_dtype, expected_dtype, nrows +): """Check that the correct unit (e.g. 
month, day, second) is discovered from
     the data when a user specifies a unitless datetime."""
     # Unit should be "D" (days) due to last entry
-    data = ["2012-03"] * 50000 + ["2013-01-15"]
-    expected = np.array(data, dtype="M8[D]")
+    data = [generic_data] * nrows + [long_datum]
+    expected = np.array(data, dtype=expected_dtype)

     # file-like path
     txt = StringIO("\n".join(data))
-    a = np.loadtxt(txt, dtype="M8")
+    a = np.loadtxt(txt, dtype=unitless_dtype)
     assert a.dtype == expected.dtype
     assert_equal(a, expected)
@@ -945,6 +955,6 @@
     fd, fname = mkstemp()
     with open(fname, "w") as fh:
         fh.write("\n".join(data))
-    a = np.loadtxt(fname, dtype="M8")
+    a = np.loadtxt(fname, dtype=unitless_dtype)
     assert a.dtype == expected.dtype
     assert_equal(a, expected)

From 8a31abccb1928d3ca1e669e105020f3a21dfb1a8 Mon Sep 17 00:00:00 2001
From: Ross Barnowski
Date: Fri, 28 Jan 2022 12:16:00 -0800
Subject: [PATCH 64/70] Add test for empty strings as control characters.

Includes the comments param, which is handled on the Python side.
---
 numpy/lib/npyio.py              |  5 +++++
 numpy/lib/tests/test_loadtxt.py | 11 +++++++++++
 2 files changed, 16 insertions(+)

diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py
index be313d104d06..63fffffbc9eb 100644
--- a/numpy/lib/npyio.py
+++ b/numpy/lib/npyio.py
@@ -922,6 +922,11 @@ def _read(fname, *, delimiter=',', comment='#', quote='"',
         comments = None
     else:
         # assume comments are a sequence of strings
+        if "" in comment:
+            raise ValueError(
+                "comments cannot be an empty string. Use comments=None to "
+                "disable comments."
+            )
         comments = tuple(comment)
         comment = None
         if len(comments) == 0:
diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 5a34267ed28d..b1e8671577d9 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -958,3 +958,14 @@ def test_datetime_parametric_unit_discovery(
     a = np.loadtxt(fname, dtype=unitless_dtype)
     assert a.dtype == expected.dtype
     assert_equal(a, expected)
+
+
+def test_control_character_empty():
+    with pytest.raises(TypeError, match="Text reading control character must"):
+        np.loadtxt(StringIO("1 2 3"), delimiter="")
+    with pytest.raises(TypeError, match="Text reading control character must"):
+        np.loadtxt(StringIO("1 2 3"), quotechar="")
+    with pytest.raises(ValueError, match="comments cannot be an empty string"):
+        np.loadtxt(StringIO("1 2 3"), comments="")
+    with pytest.raises(ValueError, match="comments cannot be an empty string"):
+        np.loadtxt(StringIO("1 2 3"), comments=["#", ""])

From 0ee03c847469a89a8b0e8a2584853151f6795d72 Mon Sep 17 00:00:00 2001
From: Ross Barnowski
Date: Fri, 28 Jan 2022 12:46:11 -0800
Subject: [PATCH 65/70] Add test for str dtype length discovery with
 converters (nrows > chunksize).

---
 numpy/lib/tests/test_loadtxt.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index b1e8671577d9..3aa16eeb07c9 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -936,7 +936,7 @@ def test_control_character_newline_raises(nl):
     ],
 )
 @pytest.mark.parametrize("nrows", (10, 50000, 60000))  # lt, eq, gt chunksize
-def test_datetime_parametric_unit_discovery(
+def test_parametric_unit_discovery(
     generic_data, long_datum, unitless_dtype, expected_dtype, nrows
 ):
     """Check that the correct unit (e.g.
month, day, second) is discovered from @@ -960,6 +960,28 @@ def test_datetime_parametric_unit_discovery( assert_equal(a, expected) +def test_str_dtype_unit_discovery_with_converter(): + data = ["spam-a-lot"] * 60000 + ["XXXtis_but_a_scratch"] + expected = np.array( + ["spam-a-lot"] * 60000 + ["tis_but_a_scratch"], dtype="U17" + ) + conv = lambda s: s.strip("XXX") + + # file-like path + txt = StringIO("\n".join(data)) + a = np.loadtxt(txt, dtype="U", converters=conv, encoding=None) + assert a.dtype == expected.dtype + assert_equal(a, expected) + + # file-obj path + fd, fname = mkstemp() + with open(fname, "w") as fh: + fh.write("\n".join(data)) + a = np.loadtxt(fname, dtype="U", converters=conv, encoding=None) + assert a.dtype == expected.dtype + assert_equal(a, expected) + + def test_control_character_empty(): with pytest.raises(TypeError, match="Text reading control character must"): np.loadtxt(StringIO("1 2 3"), delimiter="") From bab8610773c74f1894ae7797cc7fcfc9c4f05741 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Fri, 28 Jan 2022 13:20:44 -0800 Subject: [PATCH 66/70] Handle delimiter as bytes. --- numpy/lib/npyio.py | 4 +++- numpy/lib/tests/test_loadtxt.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index 63fffffbc9eb..b39230cdca3e 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1290,12 +1290,14 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, dtype = np.float64 comment = comments - # Type conversions for Py3 convenience + # Control character type conversions for Py3 convenience if comment is not None: if isinstance(comment, (str, bytes)): comment = [comment] comment = [ x.decode('latin1') if isinstance(x, bytes) else x for x in comment] + if isinstance(delimiter, bytes): + delimiter = delimiter.decode('latin1') arr = _read(fname, dtype=dtype, comment=comment, delimiter=delimiter, converters=converters, skiplines=skiprows, usecols=usecols, diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py index 3aa16eeb07c9..7be1e89385ef 100644 --- a/numpy/lib/tests/test_loadtxt.py +++ b/numpy/lib/tests/test_loadtxt.py @@ -991,3 +991,9 @@ def test_control_character_empty(): np.loadtxt(StringIO("1 2 3"), comments="") with pytest.raises(ValueError, match="comments cannot be an empty string"): np.loadtxt(StringIO("1 2 3"), comments=["#", ""]) + + +def test_control_characters_as_bytes(): + """Byte control characters (comments, delimiter) are supported.""" + a = np.loadtxt(StringIO("#header\n1,2,3"), comments=b"#", delimiter=b",") + assert_equal(a, [1, 2, 3]) From 5332a41ddc2d0ebd37c732808253145d5b528b0a Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Fri, 28 Jan 2022 13:22:52 -0800 Subject: [PATCH 67/70] Linting. 
---
 numpy/lib/tests/test_loadtxt.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 7be1e89385ef..e2f93534c1a9 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -920,11 +920,12 @@ def test_collision_with_default_delimiter_raises(ws):
 @pytest.mark.parametrize("nl", ("\n", "\r"))
 def test_control_character_newline_raises(nl):
     txt = StringIO(f"1{nl}2{nl}3{nl}{nl}4{nl}5{nl}6{nl}{nl}")
-    with pytest.raises(TypeError, match="control character.*cannot be a newline"):
+    msg = "control character.*cannot be a newline"
+    with pytest.raises(TypeError, match=msg):
         np.loadtxt(txt, delimiter=nl)
-    with pytest.raises(TypeError, match="control character.*cannot be a newline"):
+    with pytest.raises(TypeError, match=msg):
         np.loadtxt(txt, comments=nl)
-    with pytest.raises(TypeError, match="control character.*cannot be a newline"):
+    with pytest.raises(TypeError, match=msg):
         np.loadtxt(txt, quotechar=nl)

From 59c20848a53402bfcf70625390391054aabac760 Mon Sep 17 00:00:00 2001
From: Ross Barnowski
Date: Fri, 28 Jan 2022 14:09:22 -0800
Subject: [PATCH 68/70] TST: Fix exception msg matching in tests.

---
 numpy/lib/tests/test_loadtxt.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index e2f93534c1a9..4d42917eb140 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -879,17 +879,17 @@ def test_manual_universal_newlines(self, newline):


 def test_delimiter_comment_collision_raises():
-    with pytest.raises(TypeError, match="control characters.*are identical"):
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
         np.loadtxt(StringIO("1, 2, 3"), delimiter=",", comments=",")


 def test_delimiter_quotechar_collision_raises():
-    with pytest.raises(TypeError, match="control characters.*are identical"):
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
         np.loadtxt(StringIO("1, 2, 3"), delimiter=",", quotechar=",")


 def test_comment_quotechar_collision_raises():
-    with pytest.raises(TypeError, match="control characters.*are identical"):
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
         np.loadtxt(StringIO("1 2 3"), comments="#", quotechar="#")
@@ -911,9 +911,9 @@ def test_delimiter_and_multiple_comments_collision_raises():
     )
 )
 def test_collision_with_default_delimiter_raises(ws):
-    with pytest.raises(TypeError, match="control characters.*are identical"):
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
         np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), comments=ws)
-    with pytest.raises(TypeError, match="control characters.*are identical"):
+    with pytest.raises(TypeError, match=".*control characters.*incompatible"):
         np.loadtxt(StringIO(f"1{ws}2{ws}3\n4{ws}5{ws}6\n"), quotechar=ws)

From a756bfb313624eb47e28cba2553cc1ce497bbb1c Mon Sep 17 00:00:00 2001
From: Sebastian Berg
Date: Sun, 30 Jan 2022 06:59:21 -0800
Subject: [PATCH 69/70] TST: Skip error test on PyPy (the test uses %.100R)

---
 numpy/lib/tests/test_loadtxt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/numpy/lib/tests/test_loadtxt.py b/numpy/lib/tests/test_loadtxt.py
index 4d42917eb140..cca328b1632c 100644
--- a/numpy/lib/tests/test_loadtxt.py
+++ b/numpy/lib/tests/test_loadtxt.py
@@ -983,6 +983,8 @@ def test_str_dtype_unit_discovery_with_converter():
     assert_equal(a, expected)

+@pytest.mark.skipif(IS_PYPY and sys.implementation.version <= (7, 3, 8), + reason="PyPy bug in error formatting") def test_control_character_empty(): with pytest.raises(TypeError, match="Text reading control character must"): np.loadtxt(StringIO("1 2 3"), delimiter="") From ef7492c236ee899efbce8598a7a3bf45f24190e0 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Mon, 7 Feb 2022 11:19:56 -0800 Subject: [PATCH 70/70] Add two new examples of converters to docstring examples - Floats with underscores - Floats + hex floats. --- numpy/lib/npyio.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index b39230cdca3e..c9cf9da6ecf1 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1245,6 +1245,25 @@ def loadtxt(fname, dtype=float, comments='#', delimiter=None, [ 19.22, 64.31], [-17.57, 63.94]]) + Using a callable as the converter can be particularly useful for handling + values with different formatting, e.g. floats with underscores: + + >>> s = StringIO("1 2.7 100_000") + >>> np.loadtxt(s, converters=float) + array([1.e+00, 2.7e+00, 1.e+05]) + + This idea can be extended to automatically handle values specified in + many different formats: + + >>> def conv(val): + ... try: + ... return float(val) + ... except ValueError: + ... return float.fromhex(val) + >>> s = StringIO("1, 2.5, 3_000, 0b4, 0x1.4000000000000p+2") + >>> np.loadtxt(s, delimiter=",", converters=conv, encoding=None) + array([1.0e+00, 2.5e+00, 3.0e+03, 1.8e+02, 5.0e+00]) + Note that with the default ``encoding="bytes"``, the inputs to the converter function are latin-1 encoded byte strings. To deactivate the implicit encoding prior to conversion, use ``encoding=None``
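
A small sketch of the encoding distinction described in the docstring
addition above (behavior as documented there; ``bytes_conv`` is a
hypothetical helper for illustration, not part of the patch):

    from io import StringIO
    import numpy as np

    def bytes_conv(field):
        # With the default encoding="bytes", each field reaches the
        # converter as a latin-1 encoded byte string, e.g. b"100_000".
        return float(field.replace(b"_", b""))

    np.loadtxt(StringIO("1 2.7 100_000"), converters=bytes_conv)

    # With encoding=None the converter receives str fields instead,
    # which is why the float.fromhex() example above passes
    # encoding=None (fromhex() accepts str, not bytes).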