@@ -1006,42 +1006,15 @@ def _sanitize_html(self, s):
10061006 raise MarkdownError ("invalid value for 'safe_mode': %r (must be "
10071007 "'escape' or 'replace')" % self .safe_mode )
10081008
1009- """
1010- The expression [^ \t '"]* is used instead of .* because of special cases
1011- for links. Specifically: inline links, quotes in links, and odd anchors.
1012- """
1013- _tail_of_inline_link_re_str = r'''
1014- \( # literal paren
1015- [ \t]*
1016- (?P<url> # \1
1017- <[^{char_blacklist}]*>
1018- |
1019- [^{char_blacklist}]*
1020- )
1021- {title_separator}
1022- ( # \2
1023- {title_prefix}
1024- (['"]) # quote char = \3
1009+ _inline_link_title = re .compile (r'''
1010+ ( # \1
1011+ [ \t]+
1012+ (['"]) # quote char = \2
10251013 (?P<title>.*?)
1026- \3 # matching quote
1014+ \2
10271015 )? # title is optional
1028- \)
1029- ''' ;
1030- _tail_of_inline_link_re = re .compile (
1031- # Match tail of: [text](/url/) or [text](/url/ "title")
1032- _tail_of_inline_link_re_str .format (
1033- char_blacklist = r''' \t'"''' ,
1034- title_separator = r'''[ \t]*''' ,
1035- title_prefix = "" ,
1036- ), re .X )
1037- _tail_of_inline_link_wth_whtspc_re = re .compile (
1038- # Special case of the above where url contains whitespace
1039- # but no closing parensthesis
1040- _tail_of_inline_link_re_str .format (
1041- char_blacklist = r'''\)'"''' ,
1042- title_separator = "" ,
1043- title_prefix = r'''[ \t]+''' ,
1044- ), re .X )
1016+ \)$
1017+ ''' , re .X | re .S )
10451018 _tail_of_reference_link_re = re .compile (r'''
10461019 # Match tail of: [text][id]
10471020 [ ]? # one optional space
@@ -1051,6 +1024,52 @@ def _sanitize_html(self, s):
10511024 \]
10521025 ''' , re .X | re .S )
10531026
1027+ _whitespace = re .compile (r'\s*' )
1028+
1029+ _strip_anglebrackets = re .compile (r'<(.*)>.*' )
1030+
1031+ def _find_non_whitespace (self , text , start ):
1032+ """Returns the index of the first non-whitespace character in text
1033+ after (and including) start
1034+ """
1035+ match = self ._whitespace .match (text , start )
1036+ return match .end ()
1037+
1038+ def _find_balanced (self , text , start , open_c , close_c ):
1039+ """Returns the index where the open_c and close_c characters balance
1040+ out - the same number of open_c and close_c are encountered - or the
1041+ end of string if it's reached before the balance point is found.
1042+ """
1043+ i = start
1044+ l = len (text )
1045+ count = 1
1046+ while count > 0 and i < l :
1047+ if text [i ] == open_c :
1048+ count += 1
1049+ elif text [i ] == close_c :
1050+ count -= 1
1051+ i += 1
1052+ return i
1053+
def _extract_url_and_title(self, text, start):
    """Pull the url and optional title out of the tail of an inline link.

    ``text[start]`` is expected to be the opening parenthesis of the
    link tail.

    Returns a ``(url, title, end_index)`` tuple, where ``end_index`` is
    the position at which the balanced-parenthesis scan stopped, or
    ``(None, None, None)`` when no url/title tail could be recognised.
    """
    # Skip the opening parenthesis and any whitespace that follows it.
    url_start = self._find_non_whitespace(text, start + 1)
    if url_start == len(text):
        return None, None, None

    # For a url wrapped in <...>, first jump past the closing bracket so
    # the parenthesis scan starts after the bracketed part.
    bracketed = text[url_start] == "<"
    scan_from = url_start
    if bracketed:
        scan_from = self._find_balanced(text, url_start + 1, "<", ">")
    tail_end = self._find_balanced(text, scan_from, "(", ")")

    # Look for the (optional) title and the closing parenthesis inside
    # the span delimited by the scan above.
    title_match = self._inline_link_title.search(text, url_start, tail_end)
    if title_match is None:
        return None, None, None

    # Everything before the title/closing parenthesis is the url.
    url = text[url_start:title_match.start()]
    if bracketed:
        url = self._strip_anglebrackets.sub(r'\1', url)
    return url, title_match.group("title"), tail_end
1072+
10541073 def _do_links (self , text ):
10551074 """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
10561075
@@ -1133,23 +1152,13 @@ def _do_links(self, text):
11331152
11341153 # Inline anchor or img?
11351154 if text [p ] == '(' : # attempt at perf improvement
1136- m1 = self ._tail_of_inline_link_re .match (text , p )
1137- m2 = self ._tail_of_inline_link_wth_whtspc_re .match (text , p )
1138- if m1 and m2 :
1139- match = m1 if m1 .end () >= m2 .end () else m2
1140- elif m1 :
1141- match = m1
1142- else :
1143- match = m2
1144- if match :
1155+ url , title , url_end_idx = self ._extract_url_and_title (text , p )
1156+ if url is not None :
11451157 # Handle an inline anchor or img.
11461158 is_img = start_idx > 0 and text [start_idx - 1 ] == "!"
11471159 if is_img :
11481160 start_idx -= 1
11491161
1150- url , title = match .group ("url" ), match .group ("title" )
1151- if url and url [0 ] == '<' :
1152- url = url [1 :- 1 ] # '<url>' -> 'url'
11531162 # We've got to encode these to avoid conflicting
11541163 # with italics/bold.
11551164 url = url .replace ('*' , self ._escape_table ['*' ]) \
@@ -1170,7 +1179,7 @@ def _do_links(self, text):
11701179 if "smarty-pants" in self .extras :
11711180 result = result .replace ('"' , self ._escape_table ['"' ])
11721181 curr_pos = start_idx + len (result )
1173- text = text [:start_idx ] + result + text [match . end () :]
1182+ text = text [:start_idx ] + result + text [url_end_idx :]
11741183 elif start_idx >= anchor_allowed_pos :
11751184 result_head = '<a href="%s"%s>' % (url , title_str )
11761185 result = '%s%s</a>' % (result_head , link_text )
@@ -1180,7 +1189,7 @@ def _do_links(self, text):
11801189 # anchor_allowed_pos on.
11811190 curr_pos = start_idx + len (result_head )
11821191 anchor_allowed_pos = start_idx + len (result )
1183- text = text [:start_idx ] + result + text [match . end () :]
1192+ text = text [:start_idx ] + result + text [url_end_idx :]
11841193 else :
11851194 # Anchor not allowed here.
11861195 curr_pos = start_idx + 1
0 commit comments