Add option to ignore blank lines

poliquin · poliquin · commit b8d204d659dc · 2022-03-19T14:59:48.000-07:00
This option skips blank lines in the input data, which would otherwise
raise an error. Only truly empty lines (nothing but a line terminator)
are skipped; lines consisting entirely of whitespace are still parsed
like any other line.
diff --git a/README.md b/README.md
@@ -85,7 +85,11 @@ usage, but requires files be opened in binary mode:
 import fixwidth
 
 with open('example/data1.txt', 'rb') as fh:
-    rdr = fixwidth.DictReader(fh, fieldinfo='example/data.layout')
+    rdr = fixwidth.DictReader(
+        fh,
+        fieldinfo='example/data.layout',
+        skip_blank_lines=True
+    )
     next(rdr)
 ```
 
diff --git a/fixwidth/__main__.py b/fixwidth/__main__.py
@@ -6,7 +6,8 @@
 from .fixwidth import read_file_format, parse_file
 
 
-def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
+def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False,
+         skip_blank_lines=False):
     """Process fixed width files and write to standard output.
 
     Args:
@@ -15,6 +16,7 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
         output (file): file for writing processed data (default is sys.stdout).
         delimiter (str): field delimiter for output.
         ignore_type_errors (bool): replace invalid field data with None.
+        skip_blank_lines (bool): whether to ignore blank lines in input data.
     """
 
     if output is None:
@@ -38,7 +40,8 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
         rows = parse_file(
             fpath,
             spec=spec,
-            type_errors='ignore' if ignore_type_errors else 'raise'
+            type_errors='ignore' if ignore_type_errors else 'raise',
+            skip_blank_lines=skip_blank_lines
         )
 
         try:
@@ -64,6 +67,8 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
     argp.add_argument('files', nargs='+', help='Paths to data files')
     argp.add_argument('-i', '--ignore-type-errors', action='store_true',
                       help='Set fields that raise errors to None and continue')
+    argp.add_argument('-s', '--skip-blank-lines', action='store_true',
+                      help='Ignore blank lines in input data')
     argp.add_argument('-d', '--delimiter', default='\t', help='Field separator')
     argp.add_argument('-o', '--output', type=argparse.FileType('w'),
                       default=sys.stdout, help='Output file (default stdout)')
@@ -77,4 +82,4 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
     )
 
     spec = main(opts.schema, opts.files, opts.output, opts.delimiter,
-                opts.ignore_type_errors)
+                opts.ignore_type_errors, opts.skip_blank_lines)
diff --git a/fixwidth/fixwidth.py b/fixwidth/fixwidth.py
@@ -38,7 +38,7 @@ def read_file_format(fpath):
 
 
 def parse_lines(lines, spec, strip=True, type_errors='raise', encoding='utf-8',
-                src_file=None):
+                src_file=None, skip_blank_lines=False):
     """Parse iterable of lines of fixed width data."""
 
     fieldstruct = struct.Struct(
@@ -50,6 +50,9 @@ def parse_lines(lines, spec, strip=True, type_errors='raise', encoding='utf-8',
 
     for idx, line in enumerate(lines, start=1):
 
+        if skip_blank_lines and len(line.rstrip(b'\r\n')) == 0:
+            continue
+
         data = fieldstruct.unpack_from(line)
         data = tuple(
             s.decode(encoding).strip() if strip else s.decode(encoding) for s in data
@@ -83,15 +86,18 @@ def parse_lines(lines, spec, strip=True, type_errors='raise', encoding='utf-8',
         yield OrderedDict(zip(colnames, values))
 
 
-def parse_file(fpath, spec, strip=True, type_errors='raise', encoding='ascii'):
+def parse_file(fpath, spec, strip=True, type_errors='raise', encoding='ascii',
+               skip_blank_lines=False):
     """Read data from fixed width file."""
 
     with open(fpath, 'rb') as fh:
-        yield from parse_lines(fh, spec, strip, type_errors, encoding, src_file=fpath)
+        yield from parse_lines(
+            fh, spec, strip, type_errors, encoding, fpath, skip_blank_lines
+        )
 
 
 class DictReader:
-    def __init__(self, f, fieldinfo):
+    def __init__(self, f, fieldinfo, skip_blank_lines=False):
 
         try:
             if os.path.isfile(fieldinfo):
@@ -107,7 +113,9 @@ def __init__(self, f, fieldinfo):
         self._f = f
         self.line_num = 0
         self.fieldnames = tuple(n for w, t, n in self._spec)
-        self._records = parse_lines(self._f, self._spec)
+        self._records = parse_lines(
+            self._f, self._spec, skip_blank_lines=skip_blank_lines
+        )
 
     def __iter__(self):
         return self
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 setup(
     name='pyfixwidth',
     packages=['fixwidth'],
-    version='0.1.1',
+    version='0.2.1',
     description="Read fixed width data files",
     author='Chris Poliquin',
     author_email='chrispoliquin@gmail.com',
diff --git a/tests/test_fixwidth.py b/tests/test_fixwidth.py
@@ -30,6 +30,36 @@ def test_file_parsing():
         ('row_id', 3), ('name', 'Amy')
     ])
 
+    # test skipping of blank lines in input
+    data = io.BytesIO(b'\n01Bob  \n\n02Susan\n\n\n03Amy  \n\n')
+
+    records = parse_lines(data, layout, skip_blank_lines=True)
+
+    assert next(records) == OrderedDict([
+        ('row_id', 1), ('name', 'Bob')
+    ])
+    assert next(records) == OrderedDict([
+        ('row_id', 2), ('name', 'Susan')
+    ])
+    assert next(records) == OrderedDict([
+        ('row_id', 3), ('name', 'Amy')
+    ])
+
+    # lines of all whitespace should not be skipped
+    data = io.BytesIO(b'\n01Bob  \n\n       \n\n03Amy  \n\n')
+
+    records = parse_lines(data, layout, skip_blank_lines=True)
+
+    assert next(records) == OrderedDict([
+        ('row_id', 1), ('name', 'Bob')
+    ])
+    assert next(records) == OrderedDict([
+        ('row_id', None), ('name', None)
+    ])
+    assert next(records) == OrderedDict([
+        ('row_id', 3), ('name', 'Amy')
+    ])
+
 
 def test_read_file_format():
     """Check parsing of files describing fixed width layouts."""