Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7789237

Browse files
author
Skip Montanaro
committed
* Correct Sniffer doc to correspond to the implementation.
* Add optional delimiters arg to Sniffer.sniff() which restricts the set of candidate field delimiters.
1 parent c626658 commit 7789237

3 files changed

Lines changed: 31 additions & 14 deletions

File tree

Doc/lib/libcsv.tex

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -152,17 +152,17 @@ \subsection{Module Contents}
152152
\class{reader} or \class{writer} instance.
153153
\end{classdesc*}
154154

155-
\begin{classdesc}{Sniffer}{\optional{sample=16384}}
156-
The \class{Sniffer} class is used to deduce the format of a CSV file. The
157-
optional \var{sample} argument to the constructor specifies the number of
158-
bytes to use when determining Dialect parameters.
155+
\begin{classdesc}{Sniffer}{}
156+
The \class{Sniffer} class is used to deduce the format of a CSV file.
159157
\end{classdesc}
160158

161159
The \class{Sniffer} class provides a single method:
162160

163-
\begin{methoddesc}{sniff}{fileobj}
164-
Analyze the next chunk of \var{fileobj} and return a \class{Dialect} subclass
165-
reflecting the parameters found.
161+
\begin{methoddesc}{sniff}{sample\optional{,delimiters=None}}
162+
Analyze the given \var{sample} and return a \class{Dialect} subclass
163+
reflecting the parameters found. If the optional \var{delimiters} parameter
164+
is given, it is interpreted as a string containing possible valid delimiter
165+
characters.
166166
\end{methoddesc}
167167

168168
\begin{methoddesc}{has_header}{sample}

Lib/csv.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -159,15 +159,16 @@ def __init__(self):
159159
self.preferred = [',', '\t', ';', ' ', ':']
160160

161161

162-
def sniff(self, sample):
162+
def sniff(self, sample, delimiters=None):
163163
"""
164164
Returns a dialect (or None) corresponding to the sample
165165
"""
166166

167167
quotechar, delimiter, skipinitialspace = \
168-
self._guess_quote_and_delimiter(sample)
168+
self._guess_quote_and_delimiter(sample, delimiters)
169169
if delimiter is None:
170-
delimiter, skipinitialspace = self._guess_delimiter(sample)
170+
delimiter, skipinitialspace = self._guess_delimiter(sample,
171+
delimiters)
171172

172173
class dialect(Dialect):
173174
_name = "sniffed"
@@ -184,7 +185,7 @@ class dialect(Dialect):
184185
return dialect
185186

186187

187-
def _guess_quote_and_delimiter(self, data):
188+
def _guess_quote_and_delimiter(self, data, delimiters):
188189
"""
189190
Looks for text enclosed between two identical quotes
190191
(the probable quotechar) which are preceded and followed
@@ -222,7 +223,7 @@ def _guess_quote_and_delimiter(self, data):
222223
key = m[n]
223224
except KeyError:
224225
continue
225-
if key:
226+
if key and (delimiters is None or key in delimiters):
226227
delims[key] = delims.get(key, 0) + 1
227228
try:
228229
n = regexp.groupindex['space'] - 1
@@ -248,7 +249,7 @@ def _guess_quote_and_delimiter(self, data):
248249
return (quotechar, delim, skipinitialspace)
249250

250251

251-
def _guess_delimiter(self, data):
252+
def _guess_delimiter(self, data, delimiters):
252253
"""
253254
The delimiter /should/ occur the same number of times on
254255
each row. However, due to malformed data, it may not. We don't want
@@ -316,7 +317,8 @@ def _guess_delimiter(self, data):
316317
while len(delims) == 0 and consistency >= threshold:
317318
for k, v in modeList:
318319
if v[0] > 0 and v[1] > 0:
319-
if (v[1]/total) >= consistency:
320+
if ((v[1]/total) >= consistency and
321+
(delimiters is None or k in delimiters)):
320322
delims[k] = v
321323
consistency -= 0.01
322324

Lib/test/test_csv.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,12 @@ class TestSniffer(unittest.TestCase):
551551
header = '''\
552552
"venue","city","state","date","performers"
553553
'''
554+
sample3 = '''\
555+
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
556+
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
557+
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
558+
'''
559+
554560
def test_has_header(self):
555561
sniffer = csv.Sniffer()
556562
self.assertEqual(sniffer.has_header(self.sample1), False)
@@ -568,6 +574,15 @@ def test_sniff(self):
568574
self.assertEqual(dialect.quotechar, "'")
569575
self.assertEqual(dialect.skipinitialspace, False)
570576

577+
def test_delimiters(self):
578+
sniffer = csv.Sniffer()
579+
dialect = sniffer.sniff(self.sample3)
580+
self.assertEqual(dialect.delimiter, "0")
581+
dialect = sniffer.sniff(self.sample3, delimiters="?,")
582+
self.assertEqual(dialect.delimiter, "?")
583+
dialect = sniffer.sniff(self.sample3, delimiters="/,")
584+
self.assertEqual(dialect.delimiter, "/")
585+
571586
if not hasattr(sys, "gettotalrefcount"):
572587
if test_support.verbose: print "*** skipping leakage tests ***"
573588
else:

0 commit comments

Comments
 (0)