3
3
4
4
# Standard/builtin Python modules
5
5
import string
6
+ from string import joinfields , splitfields , find , rfind
6
7
7
8
# A classification of schemes ('' means apply by default)
8
9
uses_relative = ['ftp' , 'http' , 'gopher' , 'nntp' , 'wais' , 'file' ,
18
19
# Characters valid in scheme names
19
20
# NOTE(review): string.letters is documented as locale-dependent in Python 2
# (and does not exist in Python 3) -- confirm that only ASCII letters are
# intended to be accepted in scheme names.
scheme_chars = string .letters + string .digits + '+-.'
20
21
22
# Memo table for urlparse(): maps (url, scheme, allow_framents) to the
# 6-tuple that was computed for those arguments.
_parse_cache = {}


def clear_cache():
    """Forget every memoized urlparse() result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object is left untouched.
    """
    global _parse_cache
    _parse_cache = {}
27
+
28
+
21
29
# Parse a URL into 6 components:
22
30
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
23
31
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
24
32
# Note that we don't break the components up into smaller bits
25
33
# (e.g. netloc is a single string) and we don't expand % escapes.
26
34
def urlparse (url , scheme = '' , allow_framents = 1 ):
27
- netloc = ''
28
- path = ''
29
- params = ''
30
- query = ''
31
- fragment = ''
35
+ key = url , scheme , allow_framents
36
+ if _parse_cache .has_key (key ):
37
+ return _parse_cache [key ]
38
+ netloc = path = params = query = fragment = ''
32
39
i = string .find (url , ':' )
33
40
if i > 0 :
34
41
for c in url [:i ]:
@@ -54,7 +61,9 @@ def urlparse(url, scheme = '', allow_framents = 1):
54
61
i = string .find (url , ';' )
55
62
if i >= 0 :
56
63
url , params = url [:i ], url [i + 1 :]
57
- return scheme , netloc , url , params , query , fragment
64
+ tuple = scheme , netloc , url , params , query , fragment
65
+ _parse_cache [key ] = tuple
66
+ return tuple
58
67
59
68
# Put a parsed URL back together again. This may result in a slightly
60
69
# different, but equivalent URL, if the URL that was parsed originally
@@ -80,7 +89,7 @@ def urljoin(base, url, allow_framents = 1):
80
89
if not base :
81
90
return url
82
91
bscheme , bnetloc , bpath , bparams , bquery , bfragment = \
83
- urlparse (base , '' , allow_framents )
92
+ urlparse (base , '' , allow_framents )
84
93
scheme , netloc , path , params , query , fragment = \
85
94
urlparse (url , bscheme , allow_framents )
86
95
# XXX Unofficial hack: default netloc to bnetloc even if
@@ -90,9 +99,9 @@ def urljoin(base, url, allow_framents = 1):
90
99
scheme in uses_netloc and bscheme in uses_netloc :
91
100
netloc = bnetloc
92
101
# Strip the port number
93
- i = string . find (netloc , '@' )
102
+ i = find (netloc , '@' )
94
103
if i < 0 : i = 0
95
- i = string . find (netloc , ':' , i )
104
+ i = find (netloc , ':' , i )
96
105
if i >= 0 :
97
106
netloc = netloc [:i ]
98
107
if scheme != bscheme or scheme not in uses_relative :
@@ -107,15 +116,12 @@ def urljoin(base, url, allow_framents = 1):
107
116
return urlunparse ((scheme , netloc , path ,
108
117
params , query , fragment ))
109
118
if not path :
110
- path = bpath
111
- if not query :
112
- query = bquery
113
- return urlunparse ((scheme , netloc , path ,
114
- params , query , fragment ))
115
- i = string .rfind (bpath , '/' )
119
+ return urlunparse ((scheme , netloc , bpath ,
120
+ params , query or bquery , fragment ))
121
+ i = rfind (bpath , '/' )
116
122
if i >= 0 :
117
123
path = bpath [:i ] + '/' + path
118
- segments = string . splitfields (path , '/' )
124
+ segments = splitfields (path , '/' )
119
125
if segments [- 1 ] == '.' :
120
126
segments [- 1 ] = ''
121
127
while '.' in segments :
@@ -132,10 +138,21 @@ def urljoin(base, url, allow_framents = 1):
132
138
break
133
139
if len (segments ) >= 2 and segments [- 1 ] == '..' :
134
140
segments [- 2 :] = ['' ]
135
- path = string .joinfields (segments , '/' )
136
- return urlunparse ((scheme , netloc , path ,
141
+ return urlunparse ((scheme , netloc , joinfields (segments , '/' ),
137
142
params , query , fragment ))
138
143
144
def urldefrag(url):
    """Remove any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contains no fragment delimiter ('#'), the URL is returned
    exactly as given and the second element is the empty string.
    """
    if '#' not in url:
        # Nothing to strip: skip the parse/unparse round trip, which may
        # hand back a slightly different (though equivalent) URL.
        return url, ''
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
154
+
155
+
139
156
test_input = """
140
157
http://a/b/c/d
141
158
0 commit comments