Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit b8d204d

Browse files
committed
Add option to ignore blank lines
This option skips blank lines in the input data, which would normally raise an error. Only truly blank lines (empty apart from the line terminator) are skipped; lines consisting entirely of whitespace are parsed like any other line.
1 parent a24efda commit b8d204d

5 files changed

Lines changed: 57 additions & 10 deletions

File tree

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,11 @@ usage, but requires files be opened in binary mode:
8585
import fixwidth
8686

8787
with open('example/data1.txt', 'rb') as fh:
88-
rdr = fixwidth.DictReader(fh, fieldinfo='example/data.layout')
88+
rdr = fixwidth.DictReader(
89+
fh,
90+
fieldinfo='example/data.layout',
91+
skip_blank_lines=True
92+
)
8993
next(rdr)
9094
```
9195

fixwidth/__main__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
from .fixwidth import read_file_format, parse_file
77

88

9-
def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
9+
def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False,
10+
skip_blank_lines=False):
1011
"""Process fixed width files and write to standard output.
1112
1213
Args:
@@ -15,6 +16,7 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
1516
output (file): file for writing processed data (default is sys.stdout).
1617
delimiter (str): field delimiter for output.
1718
ignore_type_errors (bool): replace invalid field data with None.
19+
skip_blank_lines (bool): whether to ignore blank lines in input data.
1820
"""
1921

2022
if output is None:
@@ -38,7 +40,8 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
3840
rows = parse_file(
3941
fpath,
4042
spec=spec,
41-
type_errors='ignore' if ignore_type_errors else 'raise'
43+
type_errors='ignore' if ignore_type_errors else 'raise',
44+
skip_blank_lines=skip_blank_lines
4245
)
4346

4447
try:
@@ -64,6 +67,8 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
6467
argp.add_argument('files', nargs='+', help='Paths to data files')
6568
argp.add_argument('-i', '--ignore-type-errors', action='store_true',
6669
help='Set fields that raise errors to None and continue')
70+
argp.add_argument('-s', '--skip-blank-lines', action='store_true',
71+
help='Ignore blank lines in input data')
6772
argp.add_argument('-d', '--delimiter', default='\t', help='Field separator')
6873
argp.add_argument('-o', '--output', type=argparse.FileType('w'),
6974
default=sys.stdout, help='Output file (default stdout)')
@@ -77,4 +82,4 @@ def main(schema, files, output=None, delimiter='\t', ignore_type_errors=False):
7782
)
7883

7984
spec = main(opts.schema, opts.files, opts.output, opts.delimiter,
80-
opts.ignore_type_errors)
85+
opts.ignore_type_errors, opts.skip_blank_lines)

fixwidth/fixwidth.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def read_file_format(fpath):
3838

3939

4040
def parse_lines(lines, spec, strip=True, type_errors='raise', encoding='utf-8',
41-
src_file=None):
41+
src_file=None, skip_blank_lines=False):
4242
"""Parse iterable of lines of fixed width data."""
4343

4444
fieldstruct = struct.Struct(
@@ -50,6 +50,9 @@ def parse_lines(lines, spec, strip=True, type_errors='raise', encoding='utf-8',
5050

5151
for idx, line in enumerate(lines, start=1):
5252

53+
if skip_blank_lines and len(line.rstrip(b'\r\n')) == 0:
54+
continue
55+
5356
data = fieldstruct.unpack_from(line)
5457
data = tuple(
5558
s.decode(encoding).strip() if strip else s.decode(encoding) for s in data
@@ -83,15 +86,18 @@ def parse_lines(lines, spec, strip=True, type_errors='raise', encoding='utf-8',
8386
yield OrderedDict(zip(colnames, values))
8487

8588

86-
def parse_file(fpath, spec, strip=True, type_errors='raise', encoding='ascii'):
89+
def parse_file(fpath, spec, strip=True, type_errors='raise', encoding='ascii',
90+
skip_blank_lines=False):
8791
"""Read data from fixed width file."""
8892

8993
with open(fpath, 'rb') as fh:
90-
yield from parse_lines(fh, spec, strip, type_errors, encoding, src_file=fpath)
94+
yield from parse_lines(
95+
fh, spec, strip, type_errors, encoding, fpath, skip_blank_lines
96+
)
9197

9298

9399
class DictReader:
94-
def __init__(self, f, fieldinfo):
100+
def __init__(self, f, fieldinfo, skip_blank_lines=False):
95101

96102
try:
97103
if os.path.isfile(fieldinfo):
@@ -107,7 +113,9 @@ def __init__(self, f, fieldinfo):
107113
self._f = f
108114
self.line_num = 0
109115
self.fieldnames = tuple(n for w, t, n in self._spec)
110-
self._records = parse_lines(self._f, self._spec)
116+
self._records = parse_lines(
117+
self._f, self._spec, skip_blank_lines=skip_blank_lines
118+
)
111119

112120
def __iter__(self):
113121
return self

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
setup(
66
name='pyfixwidth',
77
packages=['fixwidth'],
8-
version='0.1.1',
8+
version='0.2.1',
99
description="Read fixed width data files",
1010
author='Chris Poliquin',
1111
author_email='[email protected]',

tests/test_fixwidth.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,36 @@ def test_file_parsing():
3030
('row_id', 3), ('name', 'Amy')
3131
])
3232

33+
# test skipping of blank lines in input
34+
data = io.BytesIO(b'\n01Bob \n\n02Susan\n\n\n03Amy \n\n')
35+
36+
records = parse_lines(data, layout, skip_blank_lines=True)
37+
38+
assert next(records) == OrderedDict([
39+
('row_id', 1), ('name', 'Bob')
40+
])
41+
assert next(records) == OrderedDict([
42+
('row_id', 2), ('name', 'Susan')
43+
])
44+
assert next(records) == OrderedDict([
45+
('row_id', 3), ('name', 'Amy')
46+
])
47+
48+
# lines of all whitespace should not be skipped
49+
data = io.BytesIO(b'\n01Bob \n\n \n\n03Amy \n\n')
50+
51+
records = parse_lines(data, layout, skip_blank_lines=True)
52+
53+
assert next(records) == OrderedDict([
54+
('row_id', 1), ('name', 'Bob')
55+
])
56+
assert next(records) == OrderedDict([
57+
('row_id', None), ('name', None)
58+
])
59+
assert next(records) == OrderedDict([
60+
('row_id', 3), ('name', 'Amy')
61+
])
62+
3363

3464
def test_read_file_format():
3565
"""Check parsing of files describing fixed width layouts."""

0 commit comments

Comments
 (0)