Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 26f0b9e

Browse files
gh-69589, gh-84774: Fix path normalization in urllib.parse.urljoin()
* Preserve double slashes in the path.
* Fix the case when the base path is relative and the relative reference path starts with '..'.
1 parent 450db61 commit 26f0b9e

File tree

4 files changed

+113
-23
lines changed

4 files changed

+113
-23
lines changed

Lib/test/test_urlparse.py

Lines changed: 99 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,11 @@ def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
362362
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
363363
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
364364
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
365+
else:
366+
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl))
367+
self.assertNotEqual(urllib.parse.urljoin(base, relurl), expected)
368+
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
369+
self.assertNotEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
365370

366371
def test_unparse_parse(self):
367372
str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]
@@ -568,6 +573,9 @@ def test_urljoins(self):
568573
# slashes
569574
self.checkJoin('http://a/b/c/d/e/', '../../f/g/', 'http://a/b/c/f/g/')
570575
self.checkJoin('http://a/b/c/d/e', '../../f/g/', 'http://a/b/f/g/')
576+
self.checkJoin('http://a/b/c/d/e//', '../../f/g/', 'http://a/b/c/d/f/g/')
577+
self.checkJoin('http://a/b/c/d/e///', '../../f/g/', 'http://a/b/c/d/e/f/g/')
578+
self.checkJoin('http://a/b/c/d/e////', '../../f/g/', 'http://a/b/c/d/e//f/g/')
571579
self.checkJoin('http://a/b/c/d/e/', '/../../f/g/', 'http://a/f/g/')
572580
self.checkJoin('http://a/b/c/d/e', '/../../f/g/', 'http://a/f/g/')
573581
self.checkJoin('http://a/b/c/d/e/', '../../f/g', 'http://a/b/c/f/g')
@@ -645,6 +653,16 @@ def test_urljoins_relative_base(self):
645653
self.checkJoin('//', '/w', '///w')
646654
self.checkJoin('//', '///w', '///w')
647655
self.checkJoin('//', 'w', '///w')
656+
self.checkJoin('//', '../w', '///w')
657+
self.checkJoin('//', './w', '///w')
658+
self.checkJoin('//', '..//w', '///w')
659+
self.checkJoin('//', './/w', '///w')
660+
self.checkJoin('//', '..', '//')
661+
self.checkJoin('//', '.', '//')
662+
self.checkJoin('//', '../', '//')
663+
self.checkJoin('//', './', '//')
664+
self.checkJoin('//', '..//', '///')
665+
self.checkJoin('//', './/', '///')
648666

649667
self.checkJoin('//a', '', '//a')
650668
self.checkJoin('//a', '//', '//a')
@@ -653,6 +671,16 @@ def test_urljoins_relative_base(self):
653671
self.checkJoin('//a', '/w', '//a/w')
654672
self.checkJoin('//a', '///w', '//a/w')
655673
self.checkJoin('//a', 'w', '//a/w')
674+
self.checkJoin('//a', '../w', '//a/w')
675+
self.checkJoin('//a', './w', '//a/w')
676+
self.checkJoin('//a', '..//w', '//a/w')
677+
self.checkJoin('//a', './/w', '//a/w')
678+
self.checkJoin('//a', '..', '//a')
679+
self.checkJoin('//a', '.', '//a')
680+
self.checkJoin('//a', '../', '//a')
681+
self.checkJoin('//a', './', '//a')
682+
self.checkJoin('//a', '..//', '//a/')
683+
self.checkJoin('//a', './/', '//a/')
656684

657685
for scheme in '', 'http:':
658686
self.checkJoin('http:', scheme + '', 'http:')
@@ -661,7 +689,21 @@ def test_urljoins_relative_base(self):
661689
self.checkJoin('http:', scheme + '//v/w', 'http://v/w')
662690
self.checkJoin('http:', scheme + '/w', 'http:/w')
663691
self.checkJoin('http:', scheme + '///w', 'http:/w')
664-
self.checkJoin('http:', scheme + 'w', 'http:/w')
692+
self.checkJoin('http:', scheme + 'w', 'http:w')
693+
self.checkJoin('http:', scheme + '../w', 'http:w')
694+
self.checkJoin('http:', scheme + './w', 'http:w')
695+
self.checkJoin('http:', scheme + '..//w', 'http:/w')
696+
self.checkJoin('http:', scheme + './/w', 'http:/w')
697+
self.checkJoin('http:', scheme + '..///w', 'http:////w')
698+
self.checkJoin('http:', scheme + './//w', 'http:////w')
699+
self.checkJoin('http:', scheme + '..', 'http:')
700+
self.checkJoin('http:', scheme + '.', 'http:')
701+
self.checkJoin('http:', scheme + '../', 'http:')
702+
self.checkJoin('http:', scheme + './', 'http:')
703+
self.checkJoin('http:', scheme + '..//', 'http:/')
704+
self.checkJoin('http:', scheme + './/', 'http:/')
705+
self.checkJoin('http:', scheme + '..///', 'http:////')
706+
self.checkJoin('http:', scheme + './//', 'http:////')
665707

666708
self.checkJoin('http://', scheme + '', 'http://')
667709
self.checkJoin('http://', scheme + '//', 'http://')
@@ -670,6 +712,20 @@ def test_urljoins_relative_base(self):
670712
self.checkJoin('http://', scheme + '/w', 'http:///w')
671713
self.checkJoin('http://', scheme + '///w', 'http:///w')
672714
self.checkJoin('http://', scheme + 'w', 'http:///w')
715+
self.checkJoin('http://', scheme + '../w', 'http:///w')
716+
self.checkJoin('http://', scheme + './w', 'http:///w')
717+
self.checkJoin('http://', scheme + '..//w', 'http:///w')
718+
self.checkJoin('http://', scheme + './/w', 'http:///w')
719+
self.checkJoin('http://', scheme + '..///w', 'http:////w')
720+
self.checkJoin('http://', scheme + './//w', 'http:////w')
721+
self.checkJoin('http://', scheme + '..', 'http://')
722+
self.checkJoin('http://', scheme + '.', 'http://')
723+
self.checkJoin('http://', scheme + '../', 'http://')
724+
self.checkJoin('http://', scheme + './', 'http://')
725+
self.checkJoin('http://', scheme + '..//', 'http:///')
726+
self.checkJoin('http://', scheme + './/', 'http:///')
727+
self.checkJoin('http://', scheme + '..///', 'http:////')
728+
self.checkJoin('http://', scheme + './//', 'http:////')
673729

674730
self.checkJoin('http://a', scheme + '', 'http://a')
675731
self.checkJoin('http://a', scheme + '//', 'http://a')
@@ -678,6 +734,38 @@ def test_urljoins_relative_base(self):
678734
self.checkJoin('http://a', scheme + '/w', 'http://a/w')
679735
self.checkJoin('http://a', scheme + '///w', 'http://a/w')
680736
self.checkJoin('http://a', scheme + 'w', 'http://a/w')
737+
self.checkJoin('http://a', scheme + '../w', 'http://a/w')
738+
self.checkJoin('http://a', scheme + './w', 'http://a/w')
739+
self.checkJoin('http://a', scheme + '..//w', 'http://a/w')
740+
self.checkJoin('http://a', scheme + './/w', 'http://a/w')
741+
self.checkJoin('http://a', scheme + '..///w', 'http://a//w')
742+
self.checkJoin('http://a', scheme + './//w', 'http://a//w')
743+
self.checkJoin('http://a', scheme + '..', 'http://a')
744+
self.checkJoin('http://a', scheme + '.', 'http://a')
745+
self.checkJoin('http://a', scheme + '../', 'http://a')
746+
self.checkJoin('http://a', scheme + './', 'http://a')
747+
self.checkJoin('http://a', scheme + '..//', 'http://a/')
748+
self.checkJoin('http://a', scheme + './/', 'http://a/')
749+
self.checkJoin('http://a', scheme + '..///', 'http://a//')
750+
self.checkJoin('http://a', scheme + './//', 'http://a//')
751+
752+
self.checkJoin('b/c', '', 'b/c')
753+
self.checkJoin('b/c', '//', 'b/c')
754+
self.checkJoin('b/c', '//v', '//v')
755+
self.checkJoin('b/c', '//v/w', '//v/w')
756+
self.checkJoin('b/c', '/w', '/w')
757+
self.checkJoin('b/c', '///w', '/w')
758+
self.checkJoin('b/c', 'w', 'b/w')
759+
self.checkJoin('b/c', '../w', 'w')
760+
self.checkJoin('b/c', '../../w', 'w')
761+
self.checkJoin('b/c', '../../../w', 'w')
762+
self.checkJoin('b/c', 'w/.', 'b/w/')
763+
self.checkJoin('b/c', '../w/.', 'w/')
764+
self.checkJoin('b/c', '../../w/.', 'w/')
765+
self.checkJoin('b/c', '../../../w/.', 'w/')
766+
self.checkJoin('b/c', '..', '')
767+
self.checkJoin('b/c', '../..', '')
768+
self.checkJoin('b/c', '../../..', '')
681769

682770
self.checkJoin('/b/c', '', '/b/c')
683771
self.checkJoin('/b/c', '//', '/b/c')
@@ -686,6 +774,16 @@ def test_urljoins_relative_base(self):
686774
self.checkJoin('/b/c', '/w', '/w')
687775
self.checkJoin('/b/c', '///w', '/w')
688776
self.checkJoin('/b/c', 'w', '/b/w')
777+
self.checkJoin('/b/c', '../w', '/w')
778+
self.checkJoin('/b/c', '../../w', '/w')
779+
self.checkJoin('/b/c', '../../../w', '/w')
780+
self.checkJoin('/b/c', 'w/.', '/b/w/')
781+
self.checkJoin('/b/c', '../w/.', '/w/')
782+
self.checkJoin('/b/c', '../../w/.', '/w/')
783+
self.checkJoin('/b/c', '../../../w/.', '/w/')
784+
self.checkJoin('/b/c', '..', '/')
785+
self.checkJoin('/b/c', '../..', '/')
786+
self.checkJoin('/b/c', '../../..', '/')
689787

690788
self.checkJoin('///b/c', '', '///b/c')
691789
self.checkJoin('///b/c', '//', '///b/c')

Lib/urllib/parse.py

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -594,31 +594,22 @@ def urljoin(base, url, allow_fragments=True):
594594
return _coerce_result(_urlunsplit(scheme, netloc, path,
595595
query, fragment))
596596

597-
base_parts = bpath.split('/')
598-
if base_parts[-1] != '':
599-
# the last item is not a directory, so will not be taken into account
600-
# in resolving the relative path
601-
del base_parts[-1]
602-
603597
# for rfc3986, ignore all base path should the first character be root.
604-
if path[:1] == '/':
605-
segments = path.split('/')
606-
else:
607-
segments = base_parts + path.split('/')
608-
# filter out elements that would cause redundant slashes on re-joining
609-
# the resolved_path
610-
segments[1:-1] = filter(None, segments[1:-1])
598+
if path[:1] != '/' and '/' in bpath:
599+
path = bpath.rsplit('/', 1)[0] + '/' + path
611600

612-
resolved_path = []
601+
path = _remove_dot_segments(path)
602+
return _coerce_result(_urlunsplit(scheme, netloc, path, query, fragment))
613603

604+
def _remove_dot_segments(path):
605+
segments = path.split('/')
606+
min_len = 0 if segments[0] else 1
607+
608+
resolved_path = []
614609
for seg in segments:
615610
if seg == '..':
616-
try:
611+
if len(resolved_path) > min_len:
617612
resolved_path.pop()
618-
except IndexError:
619-
# ignore any .. segments that would otherwise cause an IndexError
620-
# when popped from resolved_path if resolving for rfc3986
621-
pass
622613
elif seg == '.':
623614
continue
624615
else:
@@ -629,9 +620,7 @@ def urljoin(base, url, allow_fragments=True):
629620
# then we need to append the trailing '/'
630621
resolved_path.append('')
631622

632-
return _coerce_result(_urlunsplit(scheme, netloc, '/'.join(
633-
resolved_path) or '/', query, fragment))
634-
623+
return '/'.join(resolved_path)
635624

636625
def urldefrag(url):
637626
"""Removes any existing fragment from URL.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix :func:`urllib.parse.urljoin` for the case when the base path is relative
2+
and the relative reference path starts with '..'.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Preserve double slashes in the path in :func:`urllib.parse.urljoin`.

0 commit comments

Comments
 (0)