Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 1448d47

Browse files
author
Skip Montanaro
committed
rework Sniffer api significantly
1 parent 48816c6 commit 1448d47

1 file changed

Lines changed: 26 additions & 41 deletions

File tree

Lib/csv.py

Lines changed: 26 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,11 @@
99
QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
1010
__doc__
1111

12+
try:
13+
from cStringIO import StringIO
14+
except ImportError:
15+
from StringIO import StringIO
16+
1217
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
1318
"Error", "Dialect", "excel", "excel_tab", "reader", "writer",
1419
"register_dialect", "get_dialect", "list_dialects", "Sniffer",
@@ -147,52 +152,39 @@ def writerows(self, rowdicts):
147152
class Sniffer:
148153
'''
149154
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
150-
Returns a csv.Dialect object.
155+
Returns a Dialect object.
151156
'''
152-
def __init__(self, sample = 16 * 1024):
157+
def __init__(self):
153158
# in case there is more than one possible delimiter
154159
self.preferred = [',', '\t', ';', ' ', ':']
155160

156-
# amount of data (in bytes) to sample
157-
self.sample = sample
158161

159-
160-
def sniff(self, fileobj):
162+
def sniff(self, sample):
161163
"""
162-
Takes a file-like object and returns a dialect (or None)
164+
Returns a dialect (or None) corresponding to the sample
163165
"""
164-
self.fileobj = fileobj
165-
166-
data = fileobj.read(self.sample)
167166

168167
quotechar, delimiter, skipinitialspace = \
169-
self._guessQuoteAndDelimiter(data)
168+
self._guess_quote_and_delimiter(sample)
170169
if delimiter is None:
171-
delimiter, skipinitialspace = self._guessDelimiter(data)
170+
delimiter, skipinitialspace = self._guess_delimiter(sample)
172171

173-
class SniffedDialect(Dialect):
172+
class dialect(Dialect):
174173
_name = "sniffed"
175174
lineterminator = '\r\n'
176175
quoting = QUOTE_MINIMAL
177176
# escapechar = ''
178177
doublequote = False
179-
SniffedDialect.delimiter = delimiter
180-
SniffedDialect.quotechar = quotechar
181-
SniffedDialect.skipinitialspace = skipinitialspace
182178

183-
self.dialect = SniffedDialect
184-
return self.dialect
179+
dialect.delimiter = delimiter
180+
# _csv.reader won't accept a quotechar of ''
181+
dialect.quotechar = quotechar or '"'
182+
dialect.skipinitialspace = skipinitialspace
185183

184+
return dialect
186185

187-
def hasHeaders(self):
188-
return self._hasHeaders(self.fileobj, self.dialect)
189186

190-
191-
def register_dialect(self, name='sniffed'):
192-
register_dialect(name, self.dialect)
193-
194-
195-
def _guessQuoteAndDelimiter(self, data):
187+
def _guess_quote_and_delimiter(self, data):
196188
"""
197189
Looks for text enclosed between two identical quotes
198190
(the probable quotechar) which are preceded and followed
@@ -256,7 +248,7 @@ def _guessQuoteAndDelimiter(self, data):
256248
return (quotechar, delim, skipinitialspace)
257249

258250

259-
def _guessDelimiter(self, data):
251+
def _guess_delimiter(self, data):
260252
"""
261253
The delimiter /should/ occur the same number of times on
262254
each row. However, due to malformed data, it may not. We don't want
@@ -290,12 +282,12 @@ def _guessDelimiter(self, data):
290282
iteration += 1
291283
for line in data[start:end]:
292284
for char in ascii:
293-
metafrequency = charFrequency.get(char, {})
285+
metaFrequency = charFrequency.get(char, {})
294286
# must count even if frequency is 0
295287
freq = line.strip().count(char)
296288
# value is the mode
297-
metafrequency[freq] = metafrequency.get(freq, 0) + 1
298-
charFrequency[char] = metafrequency
289+
metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
290+
charFrequency[char] = metaFrequency
299291

300292
for char in charFrequency.keys():
301293
items = charFrequency[char].items()
@@ -356,7 +348,7 @@ def _guessDelimiter(self, data):
356348
return (delim, skipinitialspace)
357349

358350

359-
def _hasHeaders(self, fileobj, dialect):
351+
def has_header(self, sample):
360352
# Creates a dictionary of types of data in each column. If any
361353
# column is of a single type (say, integers), *except* for the first
362354
# row, then the first row is presumed to be labels. If the type
@@ -373,23 +365,16 @@ def seval(item):
373365
"""
374366
return eval(item.replace('(', '').replace(')', ''))
375367

376-
# rewind the fileobj - this might not work for some file-like
377-
# objects...
378-
fileobj.seek(0)
379-
380-
r = csv.reader(fileobj,
381-
delimiter=dialect.delimiter,
382-
quotechar=dialect.quotechar,
383-
skipinitialspace=dialect.skipinitialspace)
368+
rdr = reader(StringIO(sample), self.sniff(sample))
384369

385-
header = r.next() # assume first row is header
370+
header = rdr.next() # assume first row is header
386371

387372
columns = len(header)
388373
columnTypes = {}
389374
for i in range(columns): columnTypes[i] = None
390375

391376
checked = 0
392-
for row in r:
377+
for row in rdr:
393378
# arbitrary number of rows to check, to keep it sane
394379
if checked > 20:
395380
break

0 commit comments

Comments
 (0)