1414from .html_parser import HTMLParseError , HTMLParser
1515
1616# Configuration for urlize() function.
17- TRAILING_PUNCTUATION_RE = re .compile (
18- '^' # Beginning of word
19- '(.*?)' # The URL in word
20- '([.,:;!]+)' # Allowed non-wrapping, trailing punctuation
21- '$' # End of word
22- )
17+ TRAILING_PUNCTUATION_CHARS = '.,:;!'
# Pairs of (opening, closing) punctuation that may wrap a URL. Both the literal
# angle brackets and their HTML-escaped forms are listed, since the text being
# scanned may already be escaped.
WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;'), ('"', '"'), ('\'', '\'')]
2419
2520# List of possible strings used for bullets in bulleted lists.
2924word_split_re = re .compile (r'''([\s<>"']+)''' )
3025simple_url_re = re .compile (r'^https?://\[?\w' , re .IGNORECASE )
3126simple_url_2_re = re .compile (r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$' , re .IGNORECASE )
32- simple_email_re = re .compile (r'^\S+@\S+\.\S+$' )
3327
3428
3529@keep_lazy (str , SafeText )
@@ -276,10 +270,10 @@ def trim_punctuation(lead, middle, trail):
276270 trimmed_something = False
277271
278272 # Trim trailing punctuation.
279- match = TRAILING_PUNCTUATION_RE . match ( middle )
280- if match :
281- middle = match . group ( 1 )
282- trail = match . group ( 2 ) + trail
273+ stripped = middle . rstrip ( TRAILING_PUNCTUATION_CHARS )
274+ if middle != stripped :
275+ trail = middle [ len ( stripped ):] + trail
276+ middle = stripped
283277 trimmed_something = True
284278
285279 # Trim wrapping punctuation.
@@ -296,6 +290,21 @@ def trim_punctuation(lead, middle, trail):
296290 trimmed_something = True
297291 return lead , middle , trail
298292
def is_email_simple(value):
    """Return True if value looks like an email address.

    This is a cheap structural check, not a validator. ``value`` must
    contain exactly one ``@`` with text on both sides of it, and the part
    after the ``@`` must contain a dot that is not its first character.
    """
    # Exactly one @ is allowed; zero or several means this is not an email.
    if value.count('@') != 1:
        return False
    local, _, domain = value.partition('@')
    # The @ must be in the middle of the value, not at either end.
    if not local or not domain:
        return False
    # Dot must be in the domain part (e.g. example.com) and must not lead it.
    return '.' in domain and not domain.startswith('.')
307+
299308 words = word_split_re .split (force_text (text ))
300309 for i , word in enumerate (words ):
301310 if '.' in word or '@' in word or ':' in word :
@@ -315,7 +324,7 @@ def trim_punctuation(lead, middle, trail):
315324 elif simple_url_2_re .match (middle ):
316325 middle , middle_unescaped , trail = unescape (middle , trail )
317326 url = smart_urlquote ('http://%s' % middle_unescaped )
318- elif ':' not in middle and simple_email_re . match (middle ):
327+ elif ':' not in middle and is_email_simple (middle ):
319328 local , domain = middle .rsplit ('@' , 1 )
320329 try :
321330 domain = domain .encode ('idna' ).decode ('ascii' )
0 commit comments