From 436664e996d4f16dcd9a8832007473fd55b7be79 Mon Sep 17 00:00:00 2001 From: joelazar Date: Tue, 10 Aug 2021 13:31:52 +0200 Subject: [PATCH 1/2] move filterings to one function (checkFilters) and use it in case of redirect too --- colly.go | 36 +++++++++++++++++++++--------------- colly_test.go | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/colly.go b/colly.go index 4f61f9d3..c70c4391 100644 --- a/colly.go +++ b/colly.go @@ -713,18 +713,8 @@ func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, re if c.MaxDepth > 0 && c.MaxDepth < depth { return ErrMaxDepth } - if len(c.DisallowedURLFilters) > 0 { - if isMatchingFilter(c.DisallowedURLFilters, []byte(u)) { - return ErrForbiddenURL - } - } - if len(c.URLFilters) > 0 { - if !isMatchingFilter(c.URLFilters, []byte(u)) { - return ErrNoURLFiltersMatch - } - } - if !c.isDomainAllowed(parsedURL.Hostname()) { - return ErrForbiddenDomain + if err := c.checkFilters(u, parsedURL.Hostname()); err != nil { + return err } if method != "HEAD" && !c.IgnoreRobotsTxt { if err := c.checkRobots(parsedURL); err != nil { @@ -757,6 +747,23 @@ func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, re return nil } +func (c *Collector) checkFilters(URL, domain string) error { + if len(c.DisallowedURLFilters) > 0 { + if isMatchingFilter(c.DisallowedURLFilters, []byte(URL)) { + return ErrForbiddenURL + } + } + if len(c.URLFilters) > 0 { + if !isMatchingFilter(c.URLFilters, []byte(URL)) { + return ErrNoURLFiltersMatch + } + } + if !c.isDomainAllowed(domain) { + return ErrForbiddenDomain + } + return nil +} + func (c *Collector) isDomainAllowed(domain string) bool { for _, d2 := range c.DisallowedDomains { if d2 == domain { @@ -1285,10 +1292,9 @@ func (c *Collector) Clone() *Collector { func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error { return func(req *http.Request, via []*http.Request) error { - if !c.isDomainAllowed(req.URL.Hostname()) { - return fmt.Errorf("Not following redirect to %s because its not in AllowedDomains", req.URL.Host) + if err := c.checkFilters(req.URL.String(), req.URL.Hostname()); err != nil { + return err } - if c.redirectHandler != nil { return c.redirectHandler(req, via) } diff --git a/colly_test.go b/colly_test.go index 9c37a985..041d82a8 100644 --- a/colly_test.go +++ b/colly_test.go @@ -804,6 +804,23 @@ func TestRedirect(t *testing.T) { c.Visit(ts.URL + "/redirect") } +func TestRedirectWithDisallowedURLs(t *testing.T) { + ts := newTestServer() + defer ts.Close() + + c := NewCollector() + c.DisallowedURLFilters = []*regexp.Regexp{regexp.MustCompile(ts.URL + "/redirected/test")} + c.OnHTML("a[href]", func(e *HTMLElement) { + u := e.Request.AbsoluteURL(e.Attr("href")) + err := c.Visit(u) + if err != ErrForbiddenURL { + t.Error("URL should have been forbidden: " + u) + } + }) + + c.Visit(ts.URL + "/redirect") +} + func TestBaseTag(t *testing.T) { ts := newTestServer() defer ts.Close() From c5df65cfce9e0c5f64b90207cc7a990d9f487105 Mon Sep 17 00:00:00 2001 From: joelazar Date: Sun, 30 Jan 2022 15:31:15 +0100 Subject: [PATCH 2/2] preserve error message at filtered redirect attempt --- colly.go | 2 +- colly_test.go | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/colly.go b/colly.go index c70c4391..b77fe6cd 100644 --- a/colly.go +++ b/colly.go @@ -1293,7 +1293,7 @@ func (c *Collector) Clone() *Collector { func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error { return func(req *http.Request, via []*http.Request) error { if err := c.checkFilters(req.URL.String(), req.URL.Hostname()); err != nil { - return err + return fmt.Errorf("Not following redirect to %q: %w", req.URL, err) } if c.redirectHandler != nil { return c.redirectHandler(req, via) diff --git a/colly_test.go b/colly_test.go index 041d82a8..63697956 100644 --- a/colly_test.go +++ b/colly_test.go @@ -18,6 +18,7 @@ import ( "bufio" "bytes" "context" + "errors" "fmt" "net/http" "net/http/httptest" @@ -813,7 +814,7 @@ func TestRedirectWithDisallowedURLs(t *testing.T) { c.OnHTML("a[href]", func(e *HTMLElement) { u := e.Request.AbsoluteURL(e.Attr("href")) err := c.Visit(u) - if err != ErrForbiddenURL { + if !errors.Is(err, ErrForbiddenURL) { t.Error("URL should have been forbidden: " + u) } })