From 343a556c1c2fb499cd8c0addb12eed4681d82bbf Mon Sep 17 00:00:00 2001 From: WGH Date: Thu, 13 Jan 2022 02:04:03 +0300 Subject: [PATCH] Work around invalid URL encoding in path Go net/http cannot send HTTP requests containing invalid URL encoding in the path (e.g. a bare percent sign) at all[1]. Browsers send a bare percent sign in such a scenario, and do not implicitly autoencode it. Until the upstream issue is resolved somehow, we have only two alternatives: either fail to fetch such URLs, or at least attempt the autoencoded variant. Lots of webservers handle the bare and the autoencoded variants the same way, so it's worth trying. There aren't too many websites with invalid URL encoding in the path component, though. [1] https://github.com/golang/go/issues/29808 --- colly.go | 6 ++++-- colly_test.go | 37 +++++++++++++++++++++++++++++++++++++ queue/queue.go | 4 +++- request.go | 6 ++---- 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/colly.go b/colly.go index 14835028..4e7b7e92 100644 --- a/colly.go +++ b/colly.go @@ -260,6 +260,8 @@ var envMap = map[string]func(*Collector, string){ }, } +var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign()) + // NewCollector creates a new Collector instance with default configuration func NewCollector(options ...CollectorOption) *Collector { c := &Collector{} @@ -550,7 +552,7 @@ func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) { } func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error { - parsedWhatwgURL, err := whatwgUrl.Parse(u) + parsedWhatwgURL, err := urlParser.Parse(u) if err != nil { return err } @@ -1082,7 +1084,7 @@ func (c *Collector) handleOnHTML(resp *Response) error { return err } if href, found := doc.Find("base[href]").Attr("href"); found { - u, err := whatwgUrl.ParseRef(resp.Request.URL.String(), href) + u, err := urlParser.ParseRef(resp.Request.URL.String(), href) if err == nil { baseURL, err := url.Parse(u.Href(false)) if err 
== nil { diff --git a/colly_test.go b/colly_test.go index bffdd3d0..ac40c16f 100644 --- a/colly_test.go +++ b/colly_test.go @@ -197,6 +197,10 @@ y">link `)) }) + mux.HandleFunc("/100%25", func(w http.ResponseWriter, r *http.Request) { + w.Write([]byte("100 percent")) + }) + mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/octet-stream") ww := bufio.NewWriter(w) @@ -914,6 +918,39 @@ func TestTabsAndNewlines(t *testing.T) { } } +func TestLonePercent(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + var visitedPath string + + c := NewCollector() + c.OnResponse(func(res *Response) { + visitedPath = res.Request.URL.RequestURI() + }) + if err := c.Visit(ts.URL + "/100%"); err != nil { + t.Errorf("visit failed: %v", err) + } + // Automatic encoding is not really correct: browsers + // would send bare percent here. However, Go net/http + // cannot send such requests due to + // https://github.com/golang/go/issues/29808. So we have two + // alternatives really: return an error when attempting + // to fetch such URLs, or at least try the encoded variant. + // This test checks that the latter is attempted. 
+ if got, want := visitedPath, "/100%25"; got != want { + t.Errorf("got=%q want=%q", got, want) + } + // invalid URL escape in query component is not a problem, + // but check it anyway + if err := c.Visit(ts.URL + "/?a=100%zz"); err != nil { + t.Errorf("visit failed: %v", err) + } + if got, want := visitedPath, "/?a=100%zz"; got != want { + t.Errorf("got=%q want=%q", got, want) + } +} + func TestCollectorCookies(t *testing.T) { ts := newTestServer() defer ts.Close() diff --git a/queue/queue.go b/queue/queue.go index af00c8d0..b8e4417a 100644 --- a/queue/queue.go +++ b/queue/queue.go @@ -11,6 +11,8 @@ import ( const stop = true +var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign()) + // Storage is the interface of the queue's storage backend // Storage must be concurrently safe for multiple goroutines. type Storage interface { @@ -77,7 +79,7 @@ func (q *Queue) IsEmpty() bool { // AddURL adds a new URL to the queue func (q *Queue) AddURL(URL string) error { - u, err := whatwgUrl.Parse(URL) + u, err := urlParser.Parse(URL) if err != nil { return err } diff --git a/request.go b/request.go index 0c4b9cd9..1c0c80a3 100644 --- a/request.go +++ b/request.go @@ -23,8 +23,6 @@ import ( "net/url" "strings" "sync/atomic" - - whatwgUrl "github.com/nlnwa/whatwg-url/url" ) // Request is the representation of a HTTP request made by a Collector @@ -66,7 +64,7 @@ type serializableRequest struct { // New creates a new request with the context of the original request func (r *Request) New(method, URL string, body io.Reader) (*Request, error) { - u, err := whatwgUrl.Parse(URL) + u, err := urlParser.Parse(URL) if err != nil { return nil, err } @@ -104,7 +102,7 @@ func (r *Request) AbsoluteURL(u string) string { base = r.URL } - absURL, err := whatwgUrl.ParseRef(base.String(), u) + absURL, err := urlParser.ParseRef(base.String(), u) if err != nil { return "" }