From bfb0830283211acc98548f054c8ac32f9a7f7b44 Mon Sep 17 00:00:00 2001 From: Andrew Liu Date: Sun, 22 Sep 2019 11:02:16 -0700 Subject: [PATCH 1/2] ENH: add quoting support --- numpy/lib/_iotools.py | 31 +++++++++++++++++++++++++++++-- numpy/lib/npyio.py | 12 ++++++++++-- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index c392929fd879..793205209631 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -210,12 +210,19 @@ def autostrip(self, method): return lambda input: [_.strip() for _ in method(input)] # - def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None): + def __init__(self, delimiter=None, comments='#', autostrip=True, encoding=None, quoter=None): delimiter = _decode_line(delimiter) comments = _decode_line(comments) + quoter = _decode_line(quoter) self.comments = comments + # Quoter is a character or None + if (quoter is None) or (isinstance(quoter, basestring) and len(quoter) == 1): + self.quoter = quoter or None + else: + self.quoter = None + # Delimiter is a character if (delimiter is None) or isinstance(delimiter, basestring): delimiter = delimiter or None @@ -246,7 +253,27 @@ def _delimited_splitter(self, line): line = line.strip(" \r\n") if not line: return [] - return line.split(self.delimiter) + + if self.quoter is None: + return line.split(self.delimiter) + else: + out = [] + isQuoted = False + chars = list(line) + word = '' + + for char in chars: + if char == self.quoter: + isQuoted = not isQuoted + else if char == self.delimiter and not isQuoted: + out.append(word) + else: + word += char + + if word: + out.append(word) + + return out # def _fixedwidth_splitter(self, line): diff --git a/numpy/lib/npyio.py b/numpy/lib/npyio.py index e57a6dd47b98..a77c56fd89d1 100644 --- a/numpy/lib/npyio.py +++ b/numpy/lib/npyio.py @@ -1564,7 +1564,7 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, 
deletechars=''.join(sorted(NameValidator.defaultdeletechars)), replace_space='_', autostrip=False, case_sensitive=True, defaultfmt="f%i", unpack=None, usemask=False, loose=True, - invalid_raise=True, max_rows=None, encoding='bytes'): + invalid_raise=True, max_rows=None, encoding='bytes', quoter=None): """ Load data from a text file, with missing values handled as specified. @@ -1662,6 +1662,13 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, .. versionadded:: 1.14.0 + quoter : str, optional + The string used as the quoting character. By default, it is assumed + that the values are not quoted. If an invalid value is provided, quoter + defaults to None. + + .. versionadded:: 1.18.0 + Returns ------- out : ndarray @@ -1780,7 +1787,8 @@ def genfromtxt(fname, dtype=float, comments='#', delimiter=None, with fid_ctx: split_line = LineSplitter(delimiter=delimiter, comments=comments, - autostrip=autostrip, encoding=encoding) + autostrip=autostrip, encoding=encoding, + quoter=quoter) validate_names = NameValidator(excludelist=excludelist, deletechars=deletechars, case_sensitive=case_sensitive, From 20f868f9630454dd323cb5563ca45b1e85e8288c Mon Sep 17 00:00:00 2001 From: Andrew Liu Date: Sun, 22 Sep 2019 13:28:49 -0700 Subject: [PATCH 2/2] ENH: support quoting and added tests --- numpy/lib/_iotools.py | 16 ++++++++++------ numpy/lib/tests/test_io.py | 12 ++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/numpy/lib/_iotools.py b/numpy/lib/_iotools.py index 793205209631..9ce17f7eea21 100644 --- a/numpy/lib/_iotools.py +++ b/numpy/lib/_iotools.py @@ -258,23 +258,27 @@ def _delimited_splitter(self, line): return line.split(self.delimiter) else: out = [] + index = 0 isQuoted = False - chars = list(line) word = '' - for char in chars: + while index < len(line): + char = line[index] if char == self.quoter: - isQuoted = not isQuoted - else if char == self.delimiter and not isQuoted: + if len(word) == 0 and not isQuoted: + isQuoted = True + else: 
+ isQuoted = False + elif char == self.delimiter and not isQuoted: out.append(word) + word = '' else: word += char - + index += 1 if word: out.append(word) return out - # def _fixedwidth_splitter(self, line): if self.comments is not None: diff --git a/numpy/lib/tests/test_io.py b/numpy/lib/tests/test_io.py index 6ee17c830f5c..74c0080de378 100644 --- a/numpy/lib/tests/test_io.py +++ b/numpy/lib/tests/test_io.py @@ -2420,6 +2420,18 @@ def test_genfromtxt(self): data = np.genfromtxt(path) assert_array_equal(a, data) + def test_genfromtxt_quoter(self): + with temppath(suffix='.txt') as path: + path = Path(path) + # "This is my text, that has a comma inside","Other value","3" + # "Another text, with coma","More text, with comma",5 + with path.open('w') as f: + a = u"\"This is my text, that has a comma inside\",\"Other value\",\"3\"\n\"Another text, with coma\",\"More text, with comma\",5" + f.write(a) + + data = np.genfromtxt(path, delimiter=',', quoter='"', encoding=None, dtype=None) + assert_equal(data.shape, (2,)) + def test_ndfromtxt(self): # Test outputting a standard ndarray with temppath(suffix='.txt') as path: