From 720da0945234d48ab3638a8b31b86dd4ec1d73ec Mon Sep 17 00:00:00 2001 From: twiny Date: Fri, 18 Aug 2023 00:19:28 +0100 Subject: [PATCH 01/10] - refactor: removed old files. --- .gitignore | 6 +- config.go | 10 --- fetcher.go | 150 ------------------------------------- filter.go | 60 --------------- limiter.go | 37 --------- logger.go | 30 -------- option.go | 85 --------------------- plugin/fetcher/chromedp.go | 7 -- plugin/queue/bolt.go | 108 -------------------------- queue.go | 63 ---------------- request.go | 61 --------------- response.go | 12 --- rotator.go | 39 ---------- store.go | 47 ------------ 14 files changed, 2 insertions(+), 713 deletions(-) delete mode 100644 config.go delete mode 100644 fetcher.go delete mode 100644 filter.go delete mode 100644 limiter.go delete mode 100644 logger.go delete mode 100644 option.go delete mode 100644 plugin/fetcher/chromedp.go delete mode 100644 plugin/queue/bolt.go delete mode 100644 queue.go delete mode 100644 request.go delete mode 100644 response.go delete mode 100644 rotator.go delete mode 100644 store.go diff --git a/.gitignore b/.gitignore index 9046723..7b5ae83 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,2 @@ -cmd/tests -makefile -logs -tmp \ No newline at end of file +tests/ +.idea.md \ No newline at end of file diff --git a/config.go b/config.go deleted file mode 100644 index ac056ad..0000000 --- a/config.go +++ /dev/null @@ -1,10 +0,0 @@ -package wbot - -// Config -type config struct { - maxDepth int32 - parallel int - maxBodySize int64 - userAgents *rotator - proxies *rotator -} diff --git a/fetcher.go b/fetcher.go deleted file mode 100644 index 0875087..0000000 --- a/fetcher.go +++ /dev/null @@ -1,150 +0,0 @@ -package wbot - -import ( - "bytes" - "io" - "net" - "net/http" - "net/url" - "time" - - "github.com/PuerkitoBio/goquery" -) - -// Fetcher -type Fetcher interface { - Fetch(req Request) (Response, error) - Close() error -} - -// Default Fetcher - -// -var ( - defaultUserAgent = `wbot/0.1` -) - -// Fetcher -type fetcher struct { - cli *http.Client -} - -// defaultFetcher -func defaultFetcher() *fetcher { - return &fetcher{ - cli: newHTTPClient(), - } -} - -// Fetch -func (f *fetcher) Fetch(req Request) (Response, error) { - var ( - userAgent = defaultUserAgent - maxBodySize = int64(1024 * 1024 * 10) - ) - - if req.Param.UserAgent != "" { - userAgent = req.Param.UserAgent - } - - if req.Param.MaxBodySize > 0 { - maxBodySize = req.Param.MaxBodySize - } - - // add headers - var header = make(http.Header) - header.Set("User-Agent", userAgent) - header.Set("Referer", req.Param.Referer) - - f.cli.Transport = newHTTPTransport(req.Param.Proxy) - - resp, err := f.cli.Do(&http.Request{ - Method: http.MethodGet, - URL: req.URL, - Header: header, - Proto: "HTTP/1.1", - ProtoMajor: 1, - ProtoMinor: 1, - }) - if err != nil { - return Response{}, err - } - - // Limit response body reading - bodyReader := io.LimitReader(resp.Body, maxBodySize) - - body, err := io.ReadAll(bodyReader) - if err != nil { - return Response{}, err - } - - nextURLs := findLinks(body) - - resp.Body.Close() - - return Response{ - URL: req.URL, - Status: resp.StatusCode, - Body: body, - NextURLs: nextURLs, - Depth: req.Depth, - }, nil -} - -// Close -func (f *fetcher) Close() error { - f.cli.CloseIdleConnections() - return nil -} - -// newHTTPClient -func newHTTPClient() *http.Client { - return &http.Client{ - Jar: http.DefaultClient.Jar, - Timeout: 5 * time.Second, - } -} - -// newHTTPTransport -func newHTTPTransport(purl string) *http.Transport { - var 
proxy = http.ProxyFromEnvironment - - if purl != "" { - proxy = func(req *http.Request) (*url.URL, error) { - return url.Parse(purl) - } - } - return &http.Transport{ - Proxy: proxy, - DialContext: (&net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, - DualStack: true, - }).DialContext, - ForceAttemptHTTP2: true, - MaxIdleConns: 100, // Default: 100 - MaxIdleConnsPerHost: 2, // Default: 2 - IdleConnTimeout: 10 * time.Second, - TLSHandshakeTimeout: 5 * time.Second, - ExpectContinueTimeout: 1 * time.Second, - // DisableKeepAlives: true, // twiny - } -} - -// linkFinder finds links in a response -func findLinks(body []byte) []string { - var hrefs []string - - doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) - if err != nil { - return hrefs - } - - doc.Find("a[href]").Each(func(index int, item *goquery.Selection) { - if href, found := item.Attr("href"); found { - hrefs = append(hrefs, href) - } - }) - - return hrefs -} diff --git a/filter.go b/filter.go deleted file mode 100644 index 6339601..0000000 --- a/filter.go +++ /dev/null @@ -1,60 +0,0 @@ -package wbot - -import ( - "net/url" - "regexp" -) - -var ( - badExtensions = regexp.MustCompile(`^.*\.(png|jpg|jpeg|gif|ico|eps|pdf|iso|mp3|mp4|zip|aif|mpa|wav|wma|7z|deb|pkg|rar|rpm|bin|dmg|dat|tar|exe|ps|psd|svg|tif|tiff|pps|ppt|pptx|xls|xlsx|wmv|doc|docx|txt|mov|mpl)$`) -) - -// -// Filter -type filter struct { - allowed []*regexp.Regexp - disallowed []*regexp.Regexp -} - -// newFilter -func newFilter(allowed, disallowed []string) *filter { - var f = &filter{ - allowed: make([]*regexp.Regexp, 0), - disallowed: make([]*regexp.Regexp, 0), - } - - for _, p := range allowed { - f.allowed = append(f.allowed, regexp.MustCompile(p)) - } - - for _, p := range disallowed { - f.disallowed = append(f.disallowed, regexp.MustCompile(p)) - } - - return f -} - -// Allow -func (f *filter) Allow(l *url.URL) bool { - raw := l.String() - - if badExtensions.MatchString(l.Path) { - return false - } - - // disallowed - for _, d := range f.disallowed { - if d.MatchString(raw) { - return false - } - } - - // allowed - for _, a := range f.allowed { - if !a.MatchString(raw) { - return false - } - } - - return true -} diff --git a/limiter.go b/limiter.go deleted file mode 100644 index 651c5c0..0000000 --- a/limiter.go +++ /dev/null @@ -1,37 +0,0 @@ -package wbot - -import ( - "net/url" - "time" - - "github.com/twiny/ratelimit" -) - -// Limiter -type limiter struct { - rate int - duration time.Duration - list map[string]*ratelimit.Limiter -} - -// newLimiter -func newLimiter(r int, d time.Duration) *limiter { - return &limiter{ - rate: r, - duration: d, - list: make(map[string]*ratelimit.Limiter), - } -} - -// Take -func (l *limiter) take(u *url.URL) { - hostname := u.Hostname() - - limit, found := l.list[hostname] - if !found { - limit = ratelimit.NewLimiter(l.rate, l.duration) - l.list[hostname] = limit - } - - limit.Take() -} diff --git a/logger.go b/logger.go deleted file mode 100644 index 7ce739b..0000000 --- a/logger.go +++ /dev/null @@ -1,30 +0,0 @@ -package wbot - -// Logger -type Logger interface { - Send(rep Report) - Close() error -} - -// Report -type Report struct { - RequestURL string - Status int - Depth int32 - Err error -} - -// newReport -func newReport(resp Response, err error) Report { - requestURL := "" - if resp.URL != nil { - requestURL = resp.URL.String() - } - // - return Report{ - RequestURL: requestURL, - Status: resp.Status, - Depth: resp.Depth, - Err: err, - } -} diff --git a/option.go b/option.go deleted 
file mode 100644 index b722a53..0000000 --- a/option.go +++ /dev/null @@ -1,85 +0,0 @@ -package wbot - -import ( - "time" -) - -// Option -type Option func(*WBot) - -// SetFetcher -func SetFetcher(f Fetcher) Option { - return func(w *WBot) { - w.fetcher = f - } -} - -// SetStore -func SetStore(s Store) Option { - return func(w *WBot) { - w.store = s - } -} - -// SetQueue -func SetQueue(q Queue) Option { - return func(w *WBot) { - w.queue = q - } -} - -// SetLogger -func SetLogger(l Logger) Option { - return func(w *WBot) { - w.log = l - } -} - -// SetLimiter -func SetRateLimit(rate int, interval time.Duration) Option { - return func(w *WBot) { - w.limit = newLimiter(rate, interval) - } -} - -// SetFilter -func SetFilter(allowed, disallowed []string) Option { - return func(w *WBot) { - w.filter = newFilter(allowed, disallowed) - } -} - -// SetMaxDepth -func SetMaxDepth(depth int32) Option { - return func(w *WBot) { - w.conf.maxDepth = depth - } -} - -// SetParallel -func SetParallel(parallel int) Option { - return func(w *WBot) { - w.conf.parallel = parallel - } -} - -// SetMaxBodySize -func SetMaxBodySize(size int64) Option { - return func(w *WBot) { - w.conf.maxBodySize = size - } -} - -// SetUserAgents -func SetUserAgents(agents []string) Option { - return func(w *WBot) { - w.conf.userAgents = newRotator(agents) - } -} - -// SetProxies -func SetProxies(proxies []string) Option { - return func(w *WBot) { - w.conf.proxies = newRotator(proxies) - } -} diff --git a/plugin/fetcher/chromedp.go b/plugin/fetcher/chromedp.go deleted file mode 100644 index e2d50b5..0000000 --- a/plugin/fetcher/chromedp.go +++ /dev/null @@ -1,7 +0,0 @@ -package fetcher - -// CDPClient -// TODO: to implement fetcher using ChromeDP -type CDPClient struct { - // TODO: -} diff --git a/plugin/queue/bolt.go b/plugin/queue/bolt.go deleted file mode 100644 index e6085a5..0000000 --- a/plugin/queue/bolt.go +++ /dev/null @@ -1,108 +0,0 @@ -package queue - -import ( - "bytes" - "encoding/binary" - "encoding/gob" - "errors" - - "github.com/twiny/wbot" - "go.etcd.io/bbolt" -) - -var ( - ErrEmptyQueue = errors.New("queue is empty") -) - -// prefix -var prefix = "queue" - -// BQueue -type BQueue struct { - prefix string - db *bbolt.DB // Bolt stores its keys in byte-sorted order within a bucket. 
-} - -// NewBQueue -func NewBQueue(db *bbolt.DB) (wbot.Queue, error) { - if err := db.Update(func(tx *bbolt.Tx) error { - _, err := tx.CreateBucketIfNotExists([]byte(prefix)) - return err - }); err != nil { - return nil, err - } - - return &BQueue{ - prefix: prefix, - db: db, - }, nil -} - -// Enqueue -func (bq *BQueue) Enqueue(req wbot.Request) error { - var buf bytes.Buffer - if err := gob.NewEncoder(&buf).Encode(req); err != nil { - return err - } - - return bq.db.Update(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(prefix)) - - var key = make([]byte, 8) - seq, err := bu.NextSequence() - if err != nil { - return err - } - - binary.BigEndian.PutUint64(key, seq) - - return bu.Put(key, buf.Bytes()) - }) -} - -// Dequeue -func (bq *BQueue) Dequeue() (wbot.Request, error) { - // get from db - var req wbot.Request - if err := bq.db.Update(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(prefix)) - - c := bu.Cursor() - - k, v := c.First() - if k == nil { - return ErrEmptyQueue - } - - if err := gob.NewDecoder(bytes.NewReader(v)).Decode(&req); err != nil { - return err - } - - return c.Delete() - }); err != nil { - return wbot.Request{}, err - } - - return req, nil -} - -// Next -func (bq *BQueue) Next() bool { - return bq.db.View(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(bq.prefix)) - - c := bu.Cursor() - - k, _ := c.First() - if k == nil { - return ErrEmptyQueue - } - - return nil - }) == nil -} - -// Close -func (bq *BQueue) Close() error { - return bq.db.Close() -} diff --git a/queue.go b/queue.go deleted file mode 100644 index 2179d73..0000000 --- a/queue.go +++ /dev/null @@ -1,63 +0,0 @@ -package wbot - -import ( - "fmt" - "sync" -) - -// Queue -type Queue interface { - Enqueue(req Request) error - Dequeue() (Request, error) - Next() bool - Close() error -} - -// Default Queue - -// Queue -type queue[T any] struct { - mu *sync.Mutex - q []T -} - -// NewQueue -func defaultQueue[T any]() *queue[T] { - return &queue[T]{ - mu: &sync.Mutex{}, - q: make([]T, 0), - } -} - -// Enqueue -func (q *queue[T]) Enqueue(item T) error { - q.mu.Lock() - defer q.mu.Unlock() - q.q = append(q.q, item) - - return nil -} - -// Dequeue -func (q *queue[T]) Dequeue() (T, error) { - q.mu.Lock() - defer q.mu.Unlock() - if len(q.q) == 0 { - var t T - return t, fmt.Errorf("queue is empty") - } - r := q.q[0] - q.q = q.q[1:] - return r, nil -} - -// Next -func (q *queue[T]) Next() bool { - return len(q.q) != 0 -} - -// Close -func (q *queue[T]) Close() error { - q.q = nil - return nil -} diff --git a/request.go b/request.go deleted file mode 100644 index beb09a9..0000000 --- a/request.go +++ /dev/null @@ -1,61 +0,0 @@ -package wbot - -import ( - "fmt" - "net/url" - "strings" - - "golang.org/x/net/publicsuffix" -) - -// Request -type Request struct { - BaseDomain string - URL *url.URL - Depth int32 - Param Param -} - -// param -type Param struct { - Referer string - MaxBodySize int64 - UserAgent string - Proxy string -} - -// newRequest -func newRequest(raw string, depth int32, p Param) (Request, error) { - u, err := url.Parse(raw) - if err != nil { - return Request{}, err - } - - baseDomain, err := publicsuffix.EffectiveTLDPlusOne(u.Hostname()) - if err != nil { - return Request{}, err - } - - return Request{ - BaseDomain: baseDomain, - URL: u, - Depth: depth, - Param: p, - }, nil -} - -// AbsURL -func (r *Request) AbsURL(u string) (*url.URL, error) { - if strings.HasPrefix(u, "#") { - return nil, fmt.Errorf("url is a fragment") - } - - absURL, err := r.URL.Parse(u) - if err != nil { - return nil, err 
- } - - absURL.Fragment = "" - - return absURL, nil -} diff --git a/response.go b/response.go deleted file mode 100644 index 5c83a40..0000000 --- a/response.go +++ /dev/null @@ -1,12 +0,0 @@ -package wbot - -import "net/url" - -// Response -type Response struct { - URL *url.URL - Status int - Body []byte - NextURLs []string - Depth int32 -} diff --git a/rotator.go b/rotator.go deleted file mode 100644 index ea8c5ef..0000000 --- a/rotator.go +++ /dev/null @@ -1,39 +0,0 @@ -package wbot - -import ( - "container/ring" -) - -// rotator -type rotator struct { - r *ring.Ring -} - -// newRotator -func newRotator(s []string) *rotator { - r := ring.New(len(s)) - for _, item := range s { - r.Value = item - r = r.Next() - } - return &rotator{ - r: r, - } -} - -// Next -func (r *rotator) next() string { - if r.r == nil { - return "" - } - - val, ok := r.r.Value.(string) - if !ok { - return "" - } - - // move - r.r = r.r.Next() - - return val -} diff --git a/store.go b/store.go deleted file mode 100644 index 5b444b7..0000000 --- a/store.go +++ /dev/null @@ -1,47 +0,0 @@ -package wbot - -import "sync" - -// Store -type Store interface { - Visited(link string) bool - Close() error -} - -// Default Store - -// - -// Store -type store[T comparable] struct { - mu *sync.Mutex - visited map[T]bool -} - -// NewStore -func defaultStore[T comparable]() *store[T] { - return &store[T]{ - mu: &sync.Mutex{}, - visited: make(map[T]bool), - } -} - -// Visited -func (s *store[T]) Visited(k T) bool { - s.mu.Lock() - defer s.mu.Unlock() - _, ok := s.visited[k] - - // add if not visited - if !ok { - s.visited[k] = true - } - - return ok -} - -// Close -func (s *store[T]) Close() error { - s.visited = nil - return nil -} From 400e45964eb1411311a476b4b77cd1b2aaacf1c5 Mon Sep 17 00:00:00 2001 From: twiny Date: Fri, 18 Aug 2023 00:20:48 +0100 Subject: [PATCH 02/10] - refactor: new structure. 
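
Sketch of how the new API is meant to be used (illustrative only; it assumes
the crawler package import path github.com/twiny/wbot/crawler, a reachable
start URL, and a crawl that yields at least 100 pages):

    package main

    import (
    	"fmt"

    	"github.com/twiny/wbot/crawler"
    )

    func main() {
    	bot := crawler.New(
    		crawler.WithParallel(4),
    		crawler.WithMaxDepth(3),
    	)

    	bot.Crawl("https://example.com")

    	// print streamed responses; stop the workers after 100 pages
    	// so that Wait() below can return.
    	go func() {
    		count := 0
    		for resp := range bot.Stream() {
    			fmt.Printf("%d %s (%d links)\n", resp.Status, resp.URL, len(resp.NextURLs))
    			count++
    			if count == 100 {
    				bot.Done()
    				return
    			}
    		}
    	}()

    	bot.Wait()
    }
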
---
 crawler/config.go | 69 +++++++
 crawler/crawler.go | 296 ++++++++++++++++++++++++++++
 crawler/filter.go | 64 ++++++
 crawler/limiter.go | 97 +++++++++
 crawler/option.go | 61 ++++++
 crawler/robot.go | 27 +++
 go.mod | 24 ++-
 go.sum | 120 +++++++++++-
 plugin/fetcher/headless.go | 7 +
 plugin/fetcher/http_client.go | 127 ++++++++++++
 plugin/logger/logger.go | 57 ++++++
 plugin/monitor/monitor.go | 59 ++++++
 plugin/queue/bbolt.go | 98 +++++++++
 plugin/queue/in_memory.go | 74 +++++++
 plugin/queue/in_memory_test.go | 44 +++++
 plugin/store/bbolt.go | 122 ++++++------
 plugin/store/in_memory.go | 30 +++
 plugin/store/in_memory_test.go | 1 +
 wbot.go | 349 ++++++++++++++-------------
 19 files changed, 1444 insertions(+), 282 deletions(-)
 create mode 100644 crawler/config.go
 create mode 100644 crawler/crawler.go
 create mode 100644 crawler/filter.go
 create mode 100644 crawler/limiter.go
 create mode 100644 crawler/option.go
 create mode 100644 crawler/robot.go
 create mode 100644 plugin/fetcher/headless.go
 create mode 100644 plugin/fetcher/http_client.go
 create mode 100644 plugin/logger/logger.go
 create mode 100644 plugin/monitor/monitor.go
 create mode 100644 plugin/queue/bbolt.go
 create mode 100644 plugin/queue/in_memory.go
 create mode 100644 plugin/queue/in_memory_test.go
 create mode 100644 plugin/store/in_memory.go
 create mode 100644 plugin/store/in_memory_test.go

diff --git a/crawler/config.go b/crawler/config.go
new file mode 100644
index 0000000..e2d65e3
--- /dev/null
+++ b/crawler/config.go
@@ -0,0 +1,69 @@
+package crawler
+
+import (
+	"runtime"
+	"time"
+
+	"github.com/twiny/poxa"
+)
+
+const (
+	defaultReferrer    = "https://www.google.com/search"
+	defaultUserAgent   = "WBot/0.1.6 (+https://github.com/twiny/wbot)"
+	defaultTimeout     = 30 * time.Second
+	defaultMaxBodySize = int64(1024 * 1024 * 5) // 5MB
+)
+
+type (
+	config struct {
+		parallel    int
+		maxDepth    int32
+		maxBodySize int64
+		userAgents  poxa.Spinner[string]
+		referrers   poxa.Spinner[string]
+		proxies     poxa.Spinner[string]
+	}
+)
+
+func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config {
+	if maxDepth <= 0 {
+		maxDepth = 10
+	}
+	var conf = &config{
+		parallel:    runtime.NumCPU(),
+		maxDepth:    maxDepth,
+		maxBodySize: defaultMaxBodySize,
+		userAgents:  poxa.NewSpinner(defaultUserAgent),
+		referrers:   poxa.NewSpinner(defaultReferrer),
+		proxies:     nil,
+	}
+
+	if len(userAgents) > 0 {
+		// only override the default when the spinner is valid.
+		if uaList := poxa.NewSpinner(userAgents...); uaList != nil {
+			conf.userAgents = uaList
+		}
+	}
+
+	if len(referrers) > 0 {
+		// only override the default when the spinner is valid.
+		if refList := poxa.NewSpinner(referrers...); refList != nil {
+			conf.referrers = refList
+		}
+	}
+
+	if len(proxies) > 0 {
+		proxyList := poxa.NewSpinner(proxies...)
+		if proxyList == nil {
+			// a nil spinner leaves proxies disabled (the default).
+ } + + conf.proxies = proxyList + } + + return conf +} diff --git a/crawler/crawler.go b/crawler/crawler.go new file mode 100644 index 0000000..22d2a4c --- /dev/null +++ b/crawler/crawler.go @@ -0,0 +1,296 @@ +package crawler + +import ( + "context" + "os" + "strings" + "sync" + + "github.com/twiny/wbot" + "github.com/twiny/wbot/plugin/fetcher" + "github.com/twiny/wbot/plugin/queue" + "github.com/twiny/wbot/plugin/store" + + clog "github.com/charmbracelet/log" +) + +type ( + Crawler struct { + wg sync.WaitGroup + + cfg *config + + fetcher wbot.Fetcher + queue wbot.Queue + store wbot.Store + logger wbot.Logger + metrics wbot.MetricsMonitor + + filter *filter + limiter *rateLimiter + robot *robortManager + + stream chan *wbot.Response + + termLog *clog.Logger + + ctx context.Context + cancel context.CancelFunc + } +) + +func New(opts ...Option) *Crawler { + ctx, cancel := context.WithCancel(context.Background()) + + options := clog.Options{ + TimeFormat: "2006-01-02 15:04:05", + Level: clog.DebugLevel, + Prefix: "[WBot]", + ReportTimestamp: true, + } + + c := &Crawler{ + cfg: newConfig(-1, nil, nil, nil), + filter: newFilter(), + limiter: newRateLimiter(), + fetcher: fetcher.NewHTTPClient(), + queue: queue.NewInMemoryQueue(), + store: store.NewInMemoryStore(), + // logger: newFileLogger(), + stream: make(chan *wbot.Response, 2048), + termLog: clog.NewWithOptions(os.Stdout, options), + ctx: ctx, + cancel: cancel, + } + + for _, opt := range opts { + opt(c) + } + + c.wg.Add(c.cfg.parallel) + for i := 0; i < c.cfg.parallel; i++ { + go c.routine() + } + + return c +} + +func (c *Crawler) SetOption(opts ...Option) { + for _, opt := range opts { + opt(c) + } +} +func (c *Crawler) Crawl(links ...string) { + for _, link := range links { + c.start(link) + } +} +func (c *Crawler) Stream() <-chan *wbot.Response { + return c.stream +} +func (c *Crawler) Wait() { + c.wg.Wait() +} +func (c *Crawler) Done() { + c.cancel() +} + +func (c *Crawler) start(raw string) { + target, err := wbot.ValidURL(raw) + if err != nil { + c.termLog.Errorf("start: %s", err.Error()) + return + } + + // first request + param := &wbot.Param{ + MaxBodySize: c.cfg.maxBodySize, + UserAgent: c.cfg.userAgents.Next(), + } + + if c.cfg.proxies != nil { + param.Proxy = c.cfg.proxies.Next() + } + + key, err := wbot.HashLink(target.String()) + if err != nil { + // todo: log + // review: why error at this point? 
link is already validated + c.termLog.Errorf("hashlink -> invalid url: %s\n", target) + return + } + + hostname, err := wbot.Hostname(target.String()) + if err != nil { + // todo: log + c.termLog.Errorf("hostname -> invalid url: %s\n", target) + return + } + + req := &wbot.Request{ + ID: key, + BaseHost: hostname, + URL: target, + Depth: 0, + Param: param, + } + + c.termLog.Infof("start %+v\n", req) + + // todo: fix robots.txt + // if !c.robot.Allowed(ua, target) { + // // todo: log + // return + // } + + c.limiter.wait(target) // should be unique hash + + resp, err := c.fetcher.Fetch(context.TODO(), req) + if err != nil { + // todo: log + c.termLog.Errorf("fetcher -> %s\n", err.Error()) + return + } + + _, _ = c.store.HasVisited(context.TODO(), key) + + c.stream <- resp // stream 1st response + + for _, link := range resp.NextURLs { + u, err := req.ResolveURL(link) + if err != nil { + c.termLog.Errorf("resolve url -> %s\n", err.Error()) + continue + } + + // todo: this should only allow base host + if !strings.Contains(u.Hostname(), req.BaseHost) { + c.termLog.Errorf("invalid hostname: %s\n", u.Hostname()) + continue + } + + // add only referer & maxBodySize + // rest of params will be added + // right before fetch request + // to avoid rotating user agent and proxy. + nextParm := &wbot.Param{ + Referer: req.URL.String(), + MaxBodySize: c.cfg.maxBodySize, + } + + nextReq := &wbot.Request{ + ID: key, + BaseHost: hostname, + URL: u, + Depth: req.Depth + 1, + Param: nextParm, + } + + if err := c.queue.Push(context.TODO(), nextReq); err != nil { + c.termLog.Errorf("push -> %s\n", err.Error()) + continue + } + } +} +func (c *Crawler) routine() { + defer c.wg.Done() + + for { + select { + case <-c.ctx.Done(): + return + default: + req, err := c.queue.Pop(context.TODO()) + if err != nil { + if err == queue.ErrQueueClosed { + c.termLog.Errorf("queue closed\n") + return + } + c.termLog.Errorf("pop -> %s\n", err.Error()) + continue + } + + if visited, err := c.store.HasVisited(context.TODO(), req.ID); visited { + if err != nil { + // todo: log + c.termLog.Errorf("has visited -> %s\n", err.Error()) + continue + } + // todo: log + c.termLog.Errorf("already visited: %s\n", req.URL) + continue + } + + if req.Depth > c.cfg.maxDepth { + // todo: log + c.termLog.Errorf("max depth reached: %s\n", req.URL) + continue + } + + // if !c.robot.Allowed(req.Param.UserAgent, req.URL.String()) { + // // todo: log + // continue + // } + + if !c.filter.allow(req.URL) { + // todo: log + c.termLog.Errorf("filter -> %s\n", req.URL) + continue + } + + c.limiter.wait(req.URL) + + resp, err := c.fetcher.Fetch(c.ctx, req) + if err != nil { + // todo: log + c.termLog.Errorf("fetcher -> %s\n", err.Error()) + continue + } + + for _, link := range resp.NextURLs { + u, err := req.ResolveURL(link) + if err != nil { + c.termLog.Errorf("resolve url -> %s\n", err.Error()) + continue + } + + key, err := wbot.HashLink(u.String()) + if err != nil { + // todo: log + c.termLog.Errorf("hashlink -> %s\n", err.Error()) + continue + } + hostname, err := wbot.Hostname(u.String()) + if err != nil { + // todo: log + c.termLog.Errorf("hostname -> %s\n", err.Error()) + continue + } + + nextReq := &wbot.Request{ + ID: key, + BaseHost: hostname, + URL: u, + Depth: req.Depth + 1, + Param: req.Param, + } + + if err := c.queue.Push(context.TODO(), nextReq); err != nil { + // todo: log + c.termLog.Errorf("push -> %s\n", err.Error()) + continue + } + } + + // if c.log != nil { + // rep := newReport(resp, nil) + // c.log.Send(rep) + // } + + // stream + 
c.stream <- resp
+
+			c.termLog.Infof("crawled: %s\n", req.URL)
+		}
+	}
+}
diff --git a/crawler/filter.go b/crawler/filter.go
new file mode 100644
index 0000000..eb9356f
--- /dev/null
+++ b/crawler/filter.go
@@ -0,0 +1,64 @@
+package crawler
+
+import (
+	"net/url"
+	"regexp"
+
+	"github.com/twiny/wbot"
+)
+
+var (
+	badExtensions = regexp.MustCompile(`\.(png|jpg|jpeg|gif|ico|eps|pdf|iso|mp3|mp4|zip|aif|mpa|wav|wma|7z|deb|pkg|rar|rpm|bin|dmg|dat|tar|exe|ps|psd|svg|tif|tiff|pps|ppt|pptx|xls|xlsx|wmv|doc|docx|txt|mov|mpl)$`)
+)
+
+type (
+	filter struct {
+		rules map[string]*wbot.FilterRule
+	}
+)
+
+func newFilter(rules ...*wbot.FilterRule) *filter {
+	f := &filter{
+		rules: make(map[string]*wbot.FilterRule),
+	}
+
+	for _, rule := range rules {
+		f.rules[rule.Hostname] = rule
+	}
+
+	return f
+}
+
+func (f *filter) allow(link *url.URL) bool {
+	hostname, err := wbot.Hostname(link.String())
+	if err != nil {
+		// review: double check this case
+	}
+
+	if badExtensions.MatchString(link.String()) {
+		return false
+	}
+
+	rule, found := f.rules[hostname]
+	if !found {
+		// check if there is a wildcard rule
+		rule, found = f.rules["*"]
+		if !found {
+			return true
+		}
+	}
+
+	for _, pattern := range rule.Disallow {
+		if pattern.MatchString(link.String()) {
+			return false
+		}
+	}
+
+	for _, pattern := range rule.Allow {
+		if pattern.MatchString(link.String()) {
+			return true
+		}
+	}
+
+	return false // review: default deny
+}
diff --git a/crawler/limiter.go b/crawler/limiter.go
new file mode 100644
index 0000000..e2937b0
--- /dev/null
+++ b/crawler/limiter.go
@@ -0,0 +1,97 @@
+package crawler
+
+import (
+	"net/url"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/twiny/ratelimit"
+	"github.com/twiny/wbot"
+)
+
+var (
+	defaultRateLimit = "10/1s"
+)
+
+type (
+	rateLimiter struct {
+		table map[string]*ratelimit.Limiter
+	}
+)
+
+func newRateLimiter(limits ...*wbot.RateLimit) *rateLimiter {
+	rl := &rateLimiter{
+		table: make(map[string]*ratelimit.Limiter),
+	}
+
+	// Handle the default wildcard limit.
+	hasWildcard := false
+	if len(limits) > 0 {
+		for _, limit := range limits {
+			if limit.Hostname == "*" {
+				hasWildcard = true
+				break
+			}
+		}
+	}
+
+	if !hasWildcard {
+		limits = append(limits, &wbot.RateLimit{
+			Hostname: "*",
+			Rate:     defaultRateLimit,
+		})
+	}
+
+	for _, rate := range limits {
+		r, l := parseRateLimit(rate.Rate)
+		rl.table[rate.Hostname] = ratelimit.NewLimiter(r, l)
+	}
+
+	return rl
+}
+func (l *rateLimiter) wait(link *url.URL) {
+	hostname, err := wbot.Hostname(link.String())
+	if err != nil {
+		// on error, play it safe and fall back to the wildcard limit.
+ hostname = "*" + } + + limit, found := l.table[hostname] + if !found { + limit = l.table["*"] + } + + limit.Take() +} + +func parseRateLimit(s string) (rate int, interval time.Duration) { + parts := strings.Split(s, "/") + if len(parts) != 2 { + return parseRateLimit(defaultRateLimit) + } + + rate, err := strconv.Atoi(parts[0]) + if err != nil { + return parseRateLimit(defaultRateLimit) + } + + intervalValueStr := parts[1][:len(parts[1])-1] + intervalValue, err := strconv.Atoi(intervalValueStr) + if err != nil { + return parseRateLimit(defaultRateLimit) + } + + switch parts[1][len(parts[1])-1] { + case 's', 'S': + interval = time.Duration(intervalValue) * time.Second + case 'm', 'M': + interval = time.Duration(intervalValue) * time.Minute + case 'h', 'H': + interval = time.Duration(intervalValue) * time.Hour + default: + return parseRateLimit(defaultRateLimit) + } + + return rate, interval +} diff --git a/crawler/option.go b/crawler/option.go new file mode 100644 index 0000000..9074942 --- /dev/null +++ b/crawler/option.go @@ -0,0 +1,61 @@ +package crawler + +import ( + "github.com/twiny/poxa" + "github.com/twiny/wbot" +) + +type ( + Option func(c *Crawler) +) + +func WithParallel(parallel int) Option { + return func(c *Crawler) { + c.cfg.parallel = parallel + } +} +func WithMaxDepth(maxDepth int32) Option { + return func(c *Crawler) { + c.cfg.maxDepth = maxDepth + } +} +func WithUserAgents(userAgents []string) Option { + return func(c *Crawler) { + c.cfg.userAgents = poxa.NewSpinner(userAgents...) + } +} +func WithProxies(proxies []string) Option { + return func(c *Crawler) { + c.cfg.proxies = poxa.NewSpinner(proxies...) + } +} +func WithRateLimit(rates ...*wbot.RateLimit) Option { + return func(c *Crawler) { + c.limiter = newRateLimiter(rates...) + } +} +func WithFilter(rules ...*wbot.FilterRule) Option { + return func(c *Crawler) { + c.filter = newFilter(rules...) 
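+		// rules are keyed by hostname; "*" acts as a wildcard fallback,
+		// and a URL matching none of a rule's Allow patterns is denied.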
+ } +} +func WithFetcher(fetcher wbot.Fetcher) Option { + return func(c *Crawler) { + c.fetcher = fetcher + } +} +func WithQueue(queue wbot.Queue) Option { + return func(c *Crawler) { + c.queue = queue + } +} +func WithStore(store wbot.Store) Option { + return func(c *Crawler) { + c.store = store + } +} +func WithLogger(logger wbot.Logger) Option { + return func(c *Crawler) { + c.logger = logger + } +} diff --git a/crawler/robot.go b/crawler/robot.go new file mode 100644 index 0000000..eeba530 --- /dev/null +++ b/crawler/robot.go @@ -0,0 +1,27 @@ +package crawler + +import ( + "github.com/temoto/robotstxt" +) + +const ( + robotstxtPath = "/robots.txt" +) + +type ( + robortManager struct { + robots map[string]*robotstxt.RobotsData + } +) + +func NewRobotManager() *robortManager { + return &robortManager{ + robots: make(map[string]*robotstxt.RobotsData), + } +} + +// func (rm *robortManager) AddRobotsTxt(hostname string, statusCode int, body []byte) error { +// } + +// func (rm *robortManager) Allowed(userAgent, path string) bool { +// } diff --git a/go.mod b/go.mod index f24e2b1..a4e49fe 100644 --- a/go.mod +++ b/go.mod @@ -1,16 +1,30 @@ module github.com/twiny/wbot -go 1.18 +go 1.21.0 require ( - github.com/PuerkitoBio/goquery v1.8.0 + github.com/PuerkitoBio/goquery v1.8.1 + github.com/charmbracelet/log v0.2.3 + github.com/temoto/robotstxt v1.1.2 + github.com/twiny/flog/v2 v2.0.0 + github.com/twiny/poxa v0.1.0 github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac - go.etcd.io/bbolt v1.3.5 - golang.org/x/net v0.0.0-20220513224357-95641704303c + github.com/weppos/publicsuffix-go v0.30.1 ) require ( github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/benbjohnson/clock v1.3.0 // indirect - golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e // indirect + github.com/charmbracelet/lipgloss v0.7.1 // indirect + github.com/go-logfmt/logfmt v0.6.0 // indirect + github.com/lucasb-eyer/go-colorful v1.2.0 // indirect + github.com/mattn/go-isatty v0.0.18 // indirect + github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/muesli/reflow v0.3.0 // indirect + github.com/muesli/termenv v0.15.2 // indirect + github.com/rivo/uniseg v0.2.0 // indirect + golang.org/x/net v0.12.0 // indirect + golang.org/x/sys v0.10.0 // indirect + golang.org/x/text v0.11.0 // indirect ) diff --git a/go.sum b/go.sum index 8ca3830..be46d47 100644 --- a/go.sum +++ b/go.sum @@ -1,21 +1,123 @@ -github.com/PuerkitoBio/goquery v1.8.0 h1:PJTF7AmFCFKk1N6V6jmKfrNH9tV5pNE6lZMkG0gta/U= -github.com/PuerkitoBio/goquery v1.8.0/go.mod h1:ypIiRMtY7COPGk+I/YbZLbxsxn9g5ejnI2HSMtkjZvI= +cloud.google.com/go/compute/metadata v0.2.0/go.mod h1:zFmK7XCadkQkj6TtorcaGlCW1hT1fIilQDwofLpJ20k= +github.com/ProtonMail/go-crypto v0.0.0-20230217124315-7d5c6f04bbb8/go.mod h1:I0gYDMZ6Z5GRU7l58bNFSkPTFN6Yl12dsUlAZ8xy98g= +github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= +github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A= github.com/benbjohnson/clock 
v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= +github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= +github.com/charmbracelet/lipgloss v0.7.1 h1:17WMwi7N1b1rVWOjMT+rCh7sQkvDU75B2hbZpc5Kc1E= +github.com/charmbracelet/lipgloss v0.7.1/go.mod h1:yG0k3giv8Qj8edTCbbg6AlQ5e8KNWpFujkNawKNhE2c= +github.com/charmbracelet/log v0.2.3 h1:YVmBhJtpGL7nW/nlf5u+SEloU8XYljxozGzZpgwIvhs= +github.com/charmbracelet/log v0.2.3/go.mod h1:ZApwwzDbbETVTIRTk7724yQRJAXIktt98yGVMMaa3y8= +github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= +github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-github/v50 v50.2.0/go.mod h1:VBY8FB6yPIjrtKhozXv4FQupxKLS6H4m6xFZlT43q8Q= +github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= +github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= +github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mattn/go-isatty v0.0.18 h1:DOKFKCQ7FNG2L1rbrmstDN4QVRdS89Nkh85u68Uwp98= +github.com/mattn/go-isatty v0.0.18/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= +github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= +github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= +github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= +github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= +github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= 
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/twiny/flog/v2 v2.0.0 h1:uHyWg1/q1P5zi/5jOiZyFZBfUZwd416v+9J+fnIBMZY= +github.com/twiny/flog/v2 v2.0.0/go.mod h1:ujV9tY/hkP9302AgmXcpA/ds1/prAymijvIoxF9zzbM= +github.com/twiny/poxa v0.1.0 h1:NMM1ZeRfGFVOz60NjHR4r78pQYkq09VyOjKjdkhkWsE= +github.com/twiny/poxa v0.1.0/go.mod h1:zTPmnK5Ta+Ro+HL1R/LREGg3LNqs/bpNcEWlUipKl7A= github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac h1:nT+8DFvrU5Nu3Be2bK7LooU8AslFJeypQoAF+wm1CM0= github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac/go.mod h1:C589KqlnfcMeRAJ+evrNJwSf9ddkXO926hRDtgjjoYM= -go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0= -go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= +github.com/weppos/publicsuffix-go v0.30.1 h1:8q+QwBS1MY56Zjfk/50ycu33NN8aa1iCCEQwo/71Oos= +github.com/weppos/publicsuffix-go v0.30.1/go.mod h1:s41lQh6dIsDWIC1OWh7ChWJXLH0zkJ9KHZVqA7vHyuQ= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= +golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.0.0-20220513224357-95641704303c h1:nF9mHSvoKBLkQNQhJZNsc66z2UzAMUbLGjC95CF3pU0= -golang.org/x/net v0.0.0-20220513224357-95641704303c/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= +golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA= +golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e h1:fLOSk5Q00efkSvAm+4xcoXD+RRmLmmulPn5I3Y9F2EM= -golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.10.0/go.mod h1:lpqdcUyK/oCiQxvxVrppt5ggO2KCZ5QblwqPnfZ6d5o= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4= +golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod 
h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/plugin/fetcher/headless.go b/plugin/fetcher/headless.go new file mode 100644 index 0000000..e2d50b5 --- /dev/null +++ b/plugin/fetcher/headless.go @@ -0,0 +1,7 @@ +package fetcher + +// CDPClient +// TODO: to implement fetcher using ChromeDP +type CDPClient struct { + // TODO: +} diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go new file mode 100644 index 0000000..7f5e688 --- /dev/null +++ b/plugin/fetcher/http_client.go @@ -0,0 +1,127 @@ +package fetcher + +import ( + "context" + "io" + "net" + "net/http" + "net/url" + "time" + + "github.com/twiny/wbot" +) + +type ( + defaultHTTPClient struct { + client *http.Client + } +) + +func NewHTTPClient() wbot.Fetcher { + return &defaultHTTPClient{ + client: &http.Client{ + Jar: http.DefaultClient.Jar, + Timeout: 30 * time.Second, + }, + } +} + +func (f *defaultHTTPClient) Fetch(ctx context.Context, req *wbot.Request) (*wbot.Response, error) { + type ( + fetchResult struct { + result *wbot.Response + err error + } + ) + + var ch = make(chan fetchResult, 1) + + go func() { + resp, err := f.fetch(req) + if err != nil { + ch <- fetchResult{nil, err} + return + } + ch <- fetchResult{resp, nil} + }() + + for { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case resp := <-ch: + if resp.err != nil { + return nil, resp.err + } + return resp.result, nil + } + } +} +func (f *defaultHTTPClient) Close() error { + f.client.CloseIdleConnections() + return nil +} + +func (f *defaultHTTPClient) fetch(req *wbot.Request) (*wbot.Response, error) { + var header = make(http.Header) + header.Set("User-Agent", req.Param.UserAgent) + header.Set("Referer", req.Param.Referer) + + if req.Param.Proxy != "" { + f.client.Transport = newHTTPTransport(req.Param.Proxy) + } + + resp, err := f.client.Do(&http.Request{ + Method: http.MethodGet, + URL: req.URL, + Header: header, + Proto: "HTTP/1.1", + ProtoMajor: 1, + ProtoMinor: 1, + }) + if err != nil { + return nil, err + } + + // Limit response body reading + bodyReader := io.LimitReader(resp.Body, req.Param.MaxBodySize) + + body, err := io.ReadAll(bodyReader) + if err != nil { + return nil, err + } + + resp.Body.Close() + + return &wbot.Response{ + URL: req.URL, + Status: resp.StatusCode, + Body: body, + NextURLs: wbot.FindLinks(body), + Depth: req.Depth, + }, nil +} +func newHTTPTransport(purl string) *http.Transport { + var proxy = http.ProxyFromEnvironment + + if purl != "" { + proxy = func(req *http.Request) (*url.URL, error) { + return url.Parse(purl) + } + } + return &http.Transport{ + Proxy: proxy, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + DualStack: true, + }).DialContext, + ForceAttemptHTTP2: true, + MaxIdleConns: 100, // Default: 100 + MaxIdleConnsPerHost: 2, // Default: 2 + IdleConnTimeout: 10 * time.Second, + TLSHandshakeTimeout: 5 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + // DisableKeepAlives: true, + } +} diff --git a/plugin/logger/logger.go b/plugin/logger/logger.go new file mode 100644 index 0000000..4388cba --- /dev/null +++ b/plugin/logger/logger.go @@ -0,0 +1,57 @@ +package logger + +import ( + "time" + + 
"github.com/twiny/flog/v2" + "github.com/twiny/wbot" +) + +type ( + defaultLogger struct { + l *flog.Logger + } +) + +func NewFileLogger(prefix string) (*defaultLogger, error) { + logger, err := flog.NewLogger(prefix, 10, 10) + if err != nil { + return nil, err + } + + return &defaultLogger{ + l: logger, + }, nil +} + +func (l *defaultLogger) Write(log *wbot.Log) error { + f := []flog.Field{ + flog.NewField("request_url", log.RequestURL), + flog.NewField("status", log.Status), + flog.NewField("depth", log.Depth), + flog.NewField("timestamp", log.Timestamp.Format(time.RFC3339)), + flog.NewField("response_time", log.ResponseTime.String()), + flog.NewField("content_size", log.ContentSize), + } + + if log.UserAgent != "" { + f = append(f, flog.NewField("user_agent", log.UserAgent)) + } + + if log.RedirectURL != "" { + f = append(f, flog.NewField("redirect_url", log.RedirectURL)) + } + + if log.Err != nil { + f = append(f, flog.NewField("error", log.Err.Error())) + l.l.Error(log.Err.Error(), f...) + return nil + } + + l.l.Info("page", f...) + return nil +} +func (l *defaultLogger) Close() error { + l.l.Close() + return nil +} diff --git a/plugin/monitor/monitor.go b/plugin/monitor/monitor.go new file mode 100644 index 0000000..9ed5497 --- /dev/null +++ b/plugin/monitor/monitor.go @@ -0,0 +1,59 @@ +package monitor + +import ( + "sync/atomic" +) + +type ( + metricsMonitor struct { + totalRequests int64 + successfulRequests int64 + failedRequests int64 + retries int64 + redirects int64 + totalPages int64 + crawledPages int64 + skippedPages int64 + parsedLinks int64 + clientErrors int64 + serverErrors int64 + } +) + +func NewmetricsMonitor() *metricsMonitor { + return &metricsMonitor{} +} + +func (m *metricsMonitor) IncrementTotalRequests() { + atomic.AddInt64(&m.totalRequests, 1) +} +func (m *metricsMonitor) IncrementSuccessfulRequests() { + atomic.AddInt64(&m.successfulRequests, 1) +} +func (m *metricsMonitor) IncrementFailedRequests() { + atomic.AddInt64(&m.failedRequests, 1) +} +func (m *metricsMonitor) IncrementRetries() { + atomic.AddInt64(&m.retries, 1) +} +func (m *metricsMonitor) IncrementRedirects() { + atomic.AddInt64(&m.redirects, 1) +} +func (m *metricsMonitor) IncrementTotalPages() { + atomic.AddInt64(&m.totalPages, 1) +} +func (m *metricsMonitor) IncrementCrawledPages() { + atomic.AddInt64(&m.crawledPages, 1) +} +func (m *metricsMonitor) IncrementSkippedPages() { + atomic.AddInt64(&m.skippedPages, 1) +} +func (m *metricsMonitor) IncrementParsedLinks() { + atomic.AddInt64(&m.parsedLinks, 1) +} +func (m *metricsMonitor) IncrementClientErrors() { + atomic.AddInt64(&m.clientErrors, 1) +} +func (m *metricsMonitor) IncrementServerErrors() { + atomic.AddInt64(&m.serverErrors, 1) +} diff --git a/plugin/queue/bbolt.go b/plugin/queue/bbolt.go new file mode 100644 index 0000000..616346d --- /dev/null +++ b/plugin/queue/bbolt.go @@ -0,0 +1,98 @@ +package queue + +// var ( +// ErrEmptyQueue = errors.New("queue is empty") +// ) + +// // prefix +// var prefix = "queue" + +// // BQueue +// type BQueue struct { +// prefix string +// db *bbolt.DB // Bolt stores its keys in byte-sorted order within a bucket. 
+// }

+// // NewBQueue
+// func NewBQueue(db *bbolt.DB) (wbot.Queue, error) {
+// 	if err := db.Update(func(tx *bbolt.Tx) error {
+// 		_, err := tx.CreateBucketIfNotExists([]byte(prefix))
+// 		return err
+// 	}); err != nil {
+// 		return nil, err
+// 	}

+// 	return &BQueue{
+// 		prefix: prefix,
+// 		db:     db,
+// 	}, nil
+// }

+// // Enqueue
+// func (bq *BQueue) Enqueue(req wbot.Request) error {
+// 	var buf bytes.Buffer
+// 	if err := gob.NewEncoder(&buf).Encode(req); err != nil {
+// 		return err
+// 	}

+// 	return bq.db.Update(func(tx *bbolt.Tx) error {
+// 		bu := tx.Bucket([]byte(prefix))

+// 		var key = make([]byte, 8)
+// 		seq, err := bu.NextSequence()
+// 		if err != nil {
+// 			return err
+// 		}

+// 		binary.BigEndian.PutUint64(key, seq)

+// 		return bu.Put(key, buf.Bytes())
+// 	})
+// }

+// // Dequeue
+// func (bq *BQueue) Dequeue() (wbot.Request, error) {
+// 	// get from db
+// 	var req wbot.Request
+// 	if err := bq.db.Update(func(tx *bbolt.Tx) error {
+// 		bu := tx.Bucket([]byte(prefix))

+// 		c := bu.Cursor()

+// 		k, v := c.First()
+// 		if k == nil {
+// 			return ErrEmptyQueue
+// 		}

+// 		if err := gob.NewDecoder(bytes.NewReader(v)).Decode(&req); err != nil {
+// 			return err
+// 		}

+// 		return c.Delete()
+// 	}); err != nil {
+// 		return wbot.Request{}, err
+// 	}

+// 	return req, nil
+// }

+// // Next
+// func (bq *BQueue) Next() bool {
+// 	return bq.db.View(func(tx *bbolt.Tx) error {
+// 		bu := tx.Bucket([]byte(bq.prefix))

+// 		c := bu.Cursor()

+// 		k, _ := c.First()
+// 		if k == nil {
+// 			return ErrEmptyQueue
+// 		}

+// 		return nil
+// 	}) == nil
+// }

+// // Close
+// func (bq *BQueue) Close() error {
+// 	return bq.db.Close()
+// }
diff --git a/plugin/queue/in_memory.go b/plugin/queue/in_memory.go
new file mode 100644
index 0000000..4cfd835
--- /dev/null
+++ b/plugin/queue/in_memory.go
@@ -0,0 +1,74 @@
+package queue
+
+import (
+	"context"
+	"fmt"
+	"sync"
+
+	"github.com/twiny/wbot"
+)
+
+var (
+	ErrQueueClosed = fmt.Errorf("queue is closed")
+)
+
+type (
+	defaultInMemoryQueue struct {
+		mu     *sync.RWMutex
+		list   []*wbot.Request
+		cond   *sync.Cond
+		closed bool
+	}
+)
+
+func NewInMemoryQueue() wbot.Queue {
+	queue := &defaultInMemoryQueue{
+		mu:   &sync.RWMutex{},
+		list: make([]*wbot.Request, 0, 4096),
+	}
+	queue.cond = sync.NewCond(queue.mu)
+	return queue
+}
+func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	if q.closed {
+		return ErrQueueClosed
+	}
+
+	q.list = append(q.list, req)
+	q.cond.Broadcast()
+
+	return nil
+}
+func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// wait in a loop: cond.Wait can wake with the queue still empty,
+	// and the queue may have been closed in the meantime.
+	for len(q.list) == 0 {
+		if q.closed {
+			return nil, ErrQueueClosed
+		}
+		q.cond.Wait()
+	}
+
+	req := q.list[0]
+	q.list = q.list[1:]
+	return req, nil
+}
+func (q *defaultInMemoryQueue) Len() int {
+	q.mu.RLock()
+	defer q.mu.RUnlock()
+
+	return len(q.list)
+}
+func (q *defaultInMemoryQueue) Close() error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	q.closed = true
+	q.cond.Broadcast()
+
+	return nil
+}
diff --git a/plugin/queue/in_memory_test.go b/plugin/queue/in_memory_test.go
new file mode 100644
index 0000000..81068c3
--- /dev/null
+++ b/plugin/queue/in_memory_test.go
@@ -0,0 +1,44 @@
+package queue
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"testing"
+
+	"github.com/twiny/wbot"
+)
+
+func TestInMemoryPush(t *testing.T) {
+
+}
+
+func TestInMemoryPop(t *testing.T) {
+
+}
+
+// go test -benchmem -v -count=1 -run=^$ -bench
^BenchmarkInMemoryPush$ github.com/twiny/wbot/plugin/queue -tags=integration,unit +func BenchmarkInMemoryPush(b *testing.B) { + queue := NewInMemoryQueue() + defer queue.Close() + + b.ResetTimer() // Reset the timer to ignore the setup time + + var wg sync.WaitGroup + for i := 0; i < b.N; i++ { + wg.Add(1) + go func(j int) { + defer wg.Done() + if err := queue.Push(context.TODO(), &wbot.Request{ + ID: fmt.Sprintf("%d", j), + }); err != nil { + b.Error(err) + } + }(i) + } + wg.Wait() +} + +func BenchmarkInMemoryPop(b *testing.B) { + +} diff --git a/plugin/store/bbolt.go b/plugin/store/bbolt.go index b5fef0d..b193d71 100644 --- a/plugin/store/bbolt.go +++ b/plugin/store/bbolt.go @@ -1,68 +1,58 @@ package store -import ( - "crypto/sha256" - "encoding/hex" - "fmt" - "strings" - - "github.com/twiny/wbot" - "go.etcd.io/bbolt" -) - -var ( - prefix = "store" -) - -// BStore -type BStore struct { - prefix string - db *bbolt.DB -} - -// NewBStore -func NewBStore(db *bbolt.DB) (wbot.Store, error) { - // create bucket for store - if err := db.Update(func(tx *bbolt.Tx) error { - _, err := tx.CreateBucketIfNotExists([]byte(prefix)) - return err - }); err != nil { - return nil, err - } - - return &BStore{ - prefix: prefix, - db: db, - }, nil -} - -// Visited -func (bs *BStore) Visited(link string) bool { - sum := sha256.Sum224([]byte(link)) - - // - key := strings.Join([]string{ - bs.prefix, - hex.EncodeToString(sum[:]), - }, "_") - - return bs.db.Update(func(tx *bbolt.Tx) error { - bu := tx.Bucket([]byte(prefix)) - - d := bu.Get([]byte(key)) - // if d == nil means not found - if d == nil { - if err := bu.Put([]byte(key), []byte(link)); err != nil { - return err - } - return nil - } - - return fmt.Errorf("visited") - }) != nil -} - -// Close -func (bs *BStore) Close() error { - return bs.db.Close() -} +// var ( +// prefix = "store" +// ) + +// // BStore +// type BStore struct { +// prefix string +// db *bbolt.DB +// } + +// // NewBStore +// func NewBStore(db *bbolt.DB) (wbot.Store, error) { +// // create bucket for store +// if err := db.Update(func(tx *bbolt.Tx) error { +// _, err := tx.CreateBucketIfNotExists([]byte(prefix)) +// return err +// }); err != nil { +// return nil, err +// } + +// return &BStore{ +// prefix: prefix, +// db: db, +// }, nil +// } + +// // Visited +// func (bs *BStore) Visited(link string) bool { +// sum := sha256.Sum224([]byte(link)) + +// // +// key := strings.Join([]string{ +// bs.prefix, +// hex.EncodeToString(sum[:]), +// }, "_") + +// return bs.db.Update(func(tx *bbolt.Tx) error { +// bu := tx.Bucket([]byte(prefix)) + +// d := bu.Get([]byte(key)) +// // if d == nil means not found +// if d == nil { +// if err := bu.Put([]byte(key), []byte(link)); err != nil { +// return err +// } +// return nil +// } + +// return fmt.Errorf("visited") +// }) != nil +// } + +// // Close +// func (bs *BStore) Close() error { +// return bs.db.Close() +// } diff --git a/plugin/store/in_memory.go b/plugin/store/in_memory.go new file mode 100644 index 0000000..23efbb8 --- /dev/null +++ b/plugin/store/in_memory.go @@ -0,0 +1,30 @@ +package store + +import ( + "context" + "sync" + + "github.com/twiny/wbot" +) + +type ( + defaultInMemoryStore struct { + mu sync.RWMutex + table map[string]bool + } +) + +func NewInMemoryStore() wbot.Store { + return &defaultInMemoryStore{ + table: make(map[string]bool), + } +} +func (s *defaultInMemoryStore) HasVisited(ctx context.Context, link string) (bool, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + return s.table[link], nil +} +func (s 
*defaultInMemoryStore) Close() error { + return nil +} diff --git a/plugin/store/in_memory_test.go b/plugin/store/in_memory_test.go new file mode 100644 index 0000000..72440ea --- /dev/null +++ b/plugin/store/in_memory_test.go @@ -0,0 +1 @@ +package store diff --git a/wbot.go b/wbot.go index fc926b7..5251312 100644 --- a/wbot.go +++ b/wbot.go @@ -1,252 +1,197 @@ package wbot import ( + "bytes" + "context" + "crypto/sha256" + "encoding/hex" "fmt" - "runtime" + "net/url" + "regexp" "strings" - "sync" - "sync/atomic" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/weppos/publicsuffix-go/publicsuffix" ) -// default cpu core -var cores = func() int { - c := runtime.NumCPU() - if c == 1 { - return c - } - return c - 1 -}() - -// WBot -type WBot struct { - wg *sync.WaitGroup - conf *config - limit *limiter - filter *filter - fetcher Fetcher - queue Queue - store Store - log Logger - stream chan Response -} +type ( + Fetcher interface { + Fetch(ctx context.Context, req *Request) (*Response, error) + Close() error + } -// NewWBot -func NewWBot(opts ...Option) *WBot { - conf := &config{ - maxDepth: 10, - parallel: cores, - maxBodySize: 1024 * 1024 * 10, - userAgents: newRotator([]string{}), - proxies: newRotator([]string{}), + Queue interface { + Push(ctx context.Context, req *Request) error + Pop(ctx context.Context) (*Request, error) + Close() error } - wbot := &WBot{ - wg: &sync.WaitGroup{}, - conf: conf, - fetcher: defaultFetcher(), - limit: newLimiter(1, 1), - filter: newFilter([]string{}, []string{}), - store: defaultStore[string](), - queue: defaultQueue[Request](), - log: nil, - stream: make(chan Response, cores), + Store interface { + HasVisited(ctx context.Context, key string) (bool, error) + Close() error } - // options - wbot.SetOptions(opts...) 
+ Logger interface { + Write(ctx context.Context, log *Log) error + Close() error + } - return wbot -} + MetricsMonitor interface { + IncrementTotalRequests() + IncrementSuccessfulRequests() + IncrementFailedRequests() + IncrementRetries() + IncrementRedirects() -// Crawl -func (wb *WBot) Crawl(link string) error { - // first request - p := Param{ - Referer: link, - MaxBodySize: wb.conf.maxBodySize, - UserAgent: wb.conf.userAgents.next(), - Proxy: wb.conf.proxies.next(), - } + IncrementTotalPages() + IncrementCrawledPages() + IncrementSkippedPages() + IncrementParsedLinks() - req, err := newRequest(link, 0, p) - if err != nil { - return err + IncrementClientErrors() + IncrementServerErrors() } - // rate limit - wb.limit.take(req.URL) - - resp, err := wb.fetcher.Fetch(req) - if err != nil { - return err + Request struct { + ID string + BaseHost string + URL *url.URL + Depth int32 + Param *Param } - if wb.log != nil { - rep := newReport(resp, nil) - wb.log.Send(rep) + Response struct { + URL *url.URL + Status int + Body []byte + NextURLs []string + Depth int32 + ElapsedTime time.Duration + Err error } - // stream 1st response - wb.stream <- resp + Param struct { + Proxy string + UserAgent string + Referer string + MaxBodySize int64 + } - // add to queue - for _, link := range resp.NextURLs { - u, err := req.AbsURL(link) - if err != nil { - continue - } + FilterRule struct { + Hostname string + Allow []*regexp.Regexp + Disallow []*regexp.Regexp + } - // is allowed domain - if !strings.Contains(u.Hostname(), req.BaseDomain) { - continue - } + RateLimit struct { + Hostname string + Rate string + } - // add only referer & maxBodySize - // rest of params will be added - // right before fetch request - // to avoid rotating user agent and proxy. - p := Param{ - Referer: req.URL.String(), - MaxBodySize: wb.conf.maxBodySize, - } - nreq, err := newRequest(u.String(), 1, p) - if err != nil { - continue - } + Log struct { + RequestURL string + Status int + Depth int32 + Err error + Timestamp time.Time + ResponseTime time.Duration + ContentSize int64 + UserAgent string + RedirectURL string + } +) - if err := wb.queue.Enqueue(nreq); err != nil { - continue - } +func (r *Request) ResolveURL(u string) (*url.URL, error) { + if strings.HasPrefix(u, "#") { + return nil, fmt.Errorf("url is a fragment") } - // start crawl - wb.wg.Add(wb.conf.parallel) - for i := 0; i < wb.conf.parallel; i++ { - go wb.crawl() + absURL, err := r.URL.Parse(u) + if err != nil { + return nil, err } - // wait for all workers to finish - wb.wg.Wait() - close(wb.stream) + absURL.Fragment = "" - return nil + return absURL, nil } -// crawl -func (wb *WBot) crawl() { - defer wb.wg.Done() - // - for wb.queue.Next() { - req, err := wb.queue.Dequeue() - if err != nil { - if wb.log != nil { - rep := newReport(Response{}, err) - wb.log.Send(rep) - } - continue - } +func FindLinks(body []byte) (hrefs []string) { + doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) + if err != nil { + return hrefs + } - // check if max depth reached - if req.Depth > wb.conf.maxDepth { - if wb.log != nil { - rep := newReport(Response{}, fmt.Errorf("max depth reached")) - wb.log.Send(rep) - } - return + doc.Find("a[href]").Each(func(index int, item *goquery.Selection) { + if href, found := item.Attr("href"); found { + hrefs = append(hrefs, href) } - - // if already visited - if wb.store.Visited(req.URL.String()) { - if wb.log != nil { - rep := newReport(Response{}, fmt.Errorf("url recently checked")) - wb.log.Send(rep) - } - continue + }) + 
doc.Find("link[href]").Each(func(index int, item *goquery.Selection) { + if href, found := item.Attr("href"); found { + hrefs = append(hrefs, href) } - - // check filter - if !wb.filter.Allow(req.URL) { - if wb.log != nil { - rep := newReport(Response{}, fmt.Errorf("filtered url")) - wb.log.Send(rep) - } - continue + }) + doc.Find("img[src]").Each(func(index int, item *goquery.Selection) { + if src, found := item.Attr("src"); found { + hrefs = append(hrefs, src) } + }) + doc.Find("script[src]").Each(func(index int, item *goquery.Selection) { + if src, found := item.Attr("src"); found { + hrefs = append(hrefs, src) + } + }) - // rate limit - wb.limit.take(req.URL) + // ... Add other tags and attributes as necessary - req.Param.UserAgent = wb.conf.userAgents.next() - req.Param.Proxy = wb.conf.proxies.next() + return hrefs +} - // visit next url - resp, err := wb.fetcher.Fetch(req) - if err != nil { - if wb.log != nil { - rep := newReport(resp, err) - wb.log.Send(rep) - } - continue - } +func HashLink(link string) (string, error) { + parsedLink, err := url.Parse(link) + if err != nil { + return "", err + } - if wb.log != nil { - rep := newReport(resp, nil) - wb.log.Send(rep) - } + parsedLink.Scheme = "" - // stream - wb.stream <- resp - - // current depth - depth := req.Depth - // increment depth - atomic.AddInt32(&depth, 1) - - // visit next urls - for _, link := range resp.NextURLs { - u, err := req.AbsURL(link) - if err != nil { - continue - } - - // is allowed domain - if !strings.Contains(u.Hostname(), req.BaseDomain) { - continue - } - - p := Param{ - Referer: req.URL.String(), - MaxBodySize: wb.conf.maxBodySize, - } - nreq, err := newRequest(u.String(), depth, p) - if err != nil { - continue - } - - if err := wb.queue.Enqueue(nreq); err != nil { - continue - } - } + parsedLink.Host = strings.TrimPrefix(parsedLink.Host, "www.") + + decodedPath, err := url.PathUnescape(parsedLink.Path) + if err != nil { + return "", err } + parsedLink.Path = decodedPath + + cleanedURL := strings.TrimRight(parsedLink.String(), "/") + + cleanedURL = strings.TrimPrefix(cleanedURL, "//") + + hasher := sha256.New() + hasher.Write([]byte(cleanedURL)) + + return hex.EncodeToString(hasher.Sum(nil)), nil } -// SetOptions -func (wb *WBot) SetOptions(opts ...Option) { - for _, opt := range opts { - opt(wb) +func Hostname(link string) (string, error) { + hostname, err := publicsuffix.Domain(link) + if err != nil { + return "", fmt.Errorf("failed to get domain: %w", err) } + return hostname, nil } -// Stream -func (wb *WBot) Stream() <-chan Response { - return wb.stream -} +func ValidURL(raw string) (*url.URL, error) { + u, err := url.Parse(raw) + if err != nil { + return nil, err + } -// Close -func (wb *WBot) Close() { - wb.queue.Close() - wb.store.Close() - if wb.log != nil { - wb.log.Close() + if u.Scheme != "http" && u.Scheme != "https" { + return nil, fmt.Errorf("invalid scheme: %s", u.Scheme) } + + return u, nil } From 0685430906a2149ec41a6322ef0857f63828f707 Mon Sep 17 00:00:00 2001 From: twiny Date: Sun, 20 Aug 2023 23:24:05 +0100 Subject: [PATCH 03/10] - fixed crwal close. 
--- crawler/config.go | 3 +- crawler/crawler.go | 299 +++++++++++++++++---------------- crawler/limiter.go | 2 +- crawler/option.go | 5 - plugin/fetcher/http_client.go | 6 +- plugin/queue/bbolt.go | 98 ----------- plugin/queue/in_memory.go | 74 -------- plugin/queue/in_memory_test.go | 44 ----- plugin/store/in_memory.go | 13 +- wbot.go | 16 +- 10 files changed, 179 insertions(+), 381 deletions(-) delete mode 100644 plugin/queue/bbolt.go delete mode 100644 plugin/queue/in_memory.go delete mode 100644 plugin/queue/in_memory_test.go diff --git a/crawler/config.go b/crawler/config.go index e2d65e3..7173c39 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -10,7 +10,7 @@ import ( const ( defaultReferrer = "https://www.google.com/search" defaultUserAgent = "WBot/0.1.6 (+https://github.com/twiny/wbot)" - defaultTimeout = 30 * time.Second + defaultTimeout = 10 * time.Second defaultMaxBodySize = int64(1024 * 1024 * 5) // 5MB ) @@ -29,6 +29,7 @@ func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config if maxDepth <= 0 { maxDepth = 10 } + var conf = &config{ parallel: runtime.NumCPU(), maxDepth: maxDepth, diff --git a/crawler/crawler.go b/crawler/crawler.go index 22d2a4c..831d163 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -2,26 +2,29 @@ package crawler import ( "context" + "net/url" "os" + "os/signal" "strings" "sync" + "sync/atomic" "github.com/twiny/wbot" "github.com/twiny/wbot/plugin/fetcher" - "github.com/twiny/wbot/plugin/queue" "github.com/twiny/wbot/plugin/store" clog "github.com/charmbracelet/log" ) type ( + Reader func(*wbot.Response) error + Crawler struct { wg sync.WaitGroup cfg *config fetcher wbot.Fetcher - queue wbot.Queue store wbot.Store logger wbot.Logger metrics wbot.MetricsMonitor @@ -30,10 +33,14 @@ type ( limiter *rateLimiter robot *robortManager - stream chan *wbot.Response + counter int32 + queue chan *wbot.Request + stream chan *wbot.Response termLog *clog.Logger + finished <-chan struct{} + ctx context.Context cancel context.CancelFunc } @@ -50,17 +57,24 @@ func New(opts ...Option) *Crawler { } c := &Crawler{ - cfg: newConfig(-1, nil, nil, nil), - filter: newFilter(), - limiter: newRateLimiter(), + cfg: newConfig(-1, nil, nil, nil), + fetcher: fetcher.NewHTTPClient(), - queue: queue.NewInMemoryQueue(), store: store.NewInMemoryStore(), // logger: newFileLogger(), - stream: make(chan *wbot.Response, 2048), + // metrics: newMetricsMonitor(), + + filter: newFilter(), + limiter: newRateLimiter(), + // robot: newRobotsManager(), + + queue: make(chan *wbot.Request, 1024), + stream: make(chan *wbot.Response, 1024), + termLog: clog.NewWithOptions(os.Stdout, options), - ctx: ctx, - cancel: cancel, + + ctx: ctx, + cancel: cancel, } for _, opt := range opts { @@ -68,10 +82,13 @@ func New(opts ...Option) *Crawler { } c.wg.Add(c.cfg.parallel) + c.termLog.Infof("starting %d workers...", c.cfg.parallel) for i := 0; i < c.cfg.parallel; i++ { - go c.routine() + go c.crawl() } + go c.shutdown() + return c } @@ -81,27 +98,49 @@ func (c *Crawler) SetOption(opts ...Option) { } } func (c *Crawler) Crawl(links ...string) { + var targets []*url.URL for _, link := range links { - c.start(link) + target, err := wbot.ValidURL(link) + if err != nil { + c.termLog.Errorf("start: %s", err.Error()) + continue + } + targets = append(targets, target) } -} -func (c *Crawler) Stream() <-chan *wbot.Response { - return c.stream -} -func (c *Crawler) Wait() { - c.wg.Wait() -} -func (c *Crawler) Done() { - c.cancel() -} -func (c *Crawler) start(raw string) { - target, 
err := wbot.ValidURL(raw) - if err != nil { - c.termLog.Errorf("start: %s", err.Error()) + if len(targets) == 0 { + c.termLog.Errorf("no valid links found") + c.cancel() + c.wg.Wait() + c.close() return } + c.termLog.Infof("crawling %d links...", len(targets)) + + for _, target := range targets { + c.start(target) + } + + c.wg.Wait() +} +func (c *Crawler) Read(fn Reader) error { + for { + select { + case <-c.ctx.Done(): + return c.ctx.Err() + case resp := <-c.stream: + if err := fn(resp); err != nil { + return err + } + } + } +} +func (c *Crawler) Done() <-chan struct{} { + return c.ctx.Done() +} + +func (c *Crawler) start(target *url.URL) { // first request param := &wbot.Param{ MaxBodySize: c.cfg.maxBodySize, @@ -112,129 +151,41 @@ func (c *Crawler) start(raw string) { param.Proxy = c.cfg.proxies.Next() } - key, err := wbot.HashLink(target.String()) + hostname, err := wbot.Hostname(target.Hostname()) if err != nil { // todo: log - // review: why error at this point? link is already validated - c.termLog.Errorf("hashlink -> invalid url: %s\n", target) - return - } - - hostname, err := wbot.Hostname(target.String()) - if err != nil { - // todo: log - c.termLog.Errorf("hostname -> invalid url: %s\n", target) + c.termLog.Errorf("hostname -> invalid url: %s", target) return } req := &wbot.Request{ - ID: key, BaseHost: hostname, URL: target, - Depth: 0, + Depth: 1, Param: param, } - c.termLog.Infof("start %+v\n", req) - - // todo: fix robots.txt - // if !c.robot.Allowed(ua, target) { - // // todo: log - // return - // } - - c.limiter.wait(target) // should be unique hash - - resp, err := c.fetcher.Fetch(context.TODO(), req) - if err != nil { - // todo: log - c.termLog.Errorf("fetcher -> %s\n", err.Error()) - return - } - - _, _ = c.store.HasVisited(context.TODO(), key) - - c.stream <- resp // stream 1st response - - for _, link := range resp.NextURLs { - u, err := req.ResolveURL(link) - if err != nil { - c.termLog.Errorf("resolve url -> %s\n", err.Error()) - continue - } - - // todo: this should only allow base host - if !strings.Contains(u.Hostname(), req.BaseHost) { - c.termLog.Errorf("invalid hostname: %s\n", u.Hostname()) - continue - } - - // add only referer & maxBodySize - // rest of params will be added - // right before fetch request - // to avoid rotating user agent and proxy. 
-			nextParm := &wbot.Param{
-				Referer:     req.URL.String(),
-				MaxBodySize: c.cfg.maxBodySize,
-			}
-
-			nextReq := &wbot.Request{
-				ID:       key,
-				BaseHost: hostname,
-				URL:      u,
-				Depth:    req.Depth + 1,
-				Param:    nextParm,
-			}
-
-			if err := c.queue.Push(context.TODO(), nextReq); err != nil {
-				c.termLog.Errorf("push -> %s\n", err.Error())
-				continue
-			}
-		}
+	atomic.AddInt32(&c.counter, 1)
+	c.queue <- req
 }
-func (c *Crawler) routine() {
+func (c *Crawler) crawl() {
 	defer c.wg.Done()
 	for {
 		select {
 		case <-c.ctx.Done():
 			return
-		default:
-			req, err := c.queue.Pop(context.TODO())
-			if err != nil {
-				if err == queue.ErrQueueClosed {
-					c.termLog.Errorf("queue closed\n")
-					return
-				}
-				c.termLog.Errorf("pop -> %s\n", err.Error())
-				continue
-			}
-
-			if visited, err := c.store.HasVisited(context.TODO(), req.ID); visited {
-				if err != nil {
-					// todo: log
-					c.termLog.Errorf("has visited -> %s\n", err.Error())
-					continue
-				}
-				// todo: log
-				c.termLog.Errorf("already visited: %s\n", req.URL)
-				continue
-			}
-
+		case req := <-c.queue:
 			if req.Depth > c.cfg.maxDepth {
-				// todo: log
-				c.termLog.Errorf("max depth reached: %s\n", req.URL)
-				continue
-			}
+				atomic.AddInt32(&c.counter, -1)
 
-			// if !c.robot.Allowed(req.Param.UserAgent, req.URL.String()) {
-			//	// todo: log
-			//	continue
-			// }
-
-			if !c.filter.allow(req.URL) {
-				// todo: log
-				c.termLog.Errorf("filter -> %s\n", req.URL)
+				// panic: close of closed channel
+				if c.counter == 0 {
+					c.termLog.Infof("done: crawled %d links", c.counter)
+					c.cancel()
+					c.close()
+				}
+				c.termLog.Errorf("max depth reached: %s, counter: %d, queue: %d", first64Chars(req.URL.String()), c.counter, len(c.queue))
 				continue
 			}
 
@@ -243,43 +194,63 @@ func (c *Crawler) routine() {
 			c.limiter.wait(req.URL)
 
 			resp, err := c.fetcher.Fetch(c.ctx, req)
 			if err != nil {
 				// todo: log
-				c.termLog.Errorf("fetcher -> %s\n", err.Error())
+				atomic.AddInt32(&c.counter, -1)
+				c.termLog.Errorf("fetcher -> %s", err.Error())
 				continue
 			}
 
+			atomic.AddInt32(&req.Depth, 1)
 			for _, link := range resp.NextURLs {
 				u, err := req.ResolveURL(link)
 				if err != nil {
-					c.termLog.Errorf("resolve url -> %s\n", err.Error())
+					// c.termLog.Errorf("crawl: resolve url %s -> %s", link, err.Error())
 					continue
 				}
 
-				key, err := wbot.HashLink(u.String())
+				hostname, err := wbot.Hostname(u.Hostname())
 				if err != nil {
 					// todo: log
-					c.termLog.Errorf("hashlink -> %s\n", err.Error())
+					// c.termLog.Errorf("hostname -> %s", err.Error())
 					continue
 				}
-				hostname, err := wbot.Hostname(u.String())
-				if err != nil {
+
+				if !strings.Contains(u.Hostname(), hostname) {
+					// todo: log
+					// c.termLog.Errorf("invalid hostname: %s", u)
+					continue
+				}
+
+				// if !c.robot.Allowed(req.Param.UserAgent, req.URL.String()) {
+				//	// todo: log
+				//	continue
+				// }
+
+				if !c.filter.allow(u) {
 					// todo: log
-					c.termLog.Errorf("hostname -> %s\n", err.Error())
+					// c.termLog.Errorf("filter -> %s", req.URL)
+					continue
+				}
+
+				if visited, err := c.store.HasVisited(context.TODO(), u.String()); visited {
+					if err != nil {
+						// todo: log
+						// c.termLog.Errorf("has visited -> %s", err.Error())
+						// continue
+					}
+					// todo: log
+					// c.termLog.Errorf("already visited: %s", req.URL)
 					continue
 				}
 
 				nextReq := &wbot.Request{
-					ID:       key,
 					BaseHost: hostname,
 					URL:      u,
-					Depth:    req.Depth + 1,
+					Depth:    req.Depth,
 					Param:    req.Param,
 				}
 
-				if err := c.queue.Push(context.TODO(), nextReq); err != nil {
-					// todo: log
-					c.termLog.Errorf("push -> %s\n", err.Error())
-					continue
-				}
+				atomic.AddInt32(&c.counter, 1)
+				c.queue <- nextReq
 			}
 
 			// if c.log != nil {
@@ -289,8 +260,48 @@ func (c *Crawler) routine() {
 
 			// stream
 			c.stream <-
resp + atomic.AddInt32(&c.counter, -1) - c.termLog.Errorf("crawled: %s\n", req.URL) + c.termLog.Infof("crawled: %s, depth: %d, counter: %d, queue: %d", first64Chars(req.URL.String()), req.Depth, c.counter, len(c.queue)) } } } +func (c *Crawler) shutdown() { + ctx, done := signal.NotifyContext(c.ctx, os.Interrupt) + defer done() + + <-ctx.Done() + c.termLog.Infof("closing...") + + go func() { + nctx, ndone := signal.NotifyContext(context.Background(), os.Interrupt) + defer ndone() + + <-nctx.Done() + c.termLog.Errorf("force shutdown.. good bye!") + os.Exit(0) + }() + + c.cancel() + c.wg.Wait() + c.close() +} +func (c *Crawler) close() { + close(c.queue) + close(c.stream) + c.store.Close() + c.fetcher.Close() +} + +func first64Chars(s string) string { + if len(s) <= 64 { + return s + } + + runes := []rune(s) + if len(runes) <= 64 { + return s + } + + return string(runes[:64]) +} diff --git a/crawler/limiter.go b/crawler/limiter.go index e2937b0..dcab51e 100644 --- a/crawler/limiter.go +++ b/crawler/limiter.go @@ -51,7 +51,7 @@ func newRateLimiter(limits ...*wbot.RateLimit) *rateLimiter { return rl } func (l *rateLimiter) wait(link *url.URL) { - hostname, err := wbot.Hostname(link.String()) + hostname, err := wbot.Hostname(link.Hostname()) if err != nil { // case err, play safe. hostname = "*" diff --git a/crawler/option.go b/crawler/option.go index 9074942..26255eb 100644 --- a/crawler/option.go +++ b/crawler/option.go @@ -44,11 +44,6 @@ func WithFetcher(fetcher wbot.Fetcher) Option { c.fetcher = fetcher } } -func WithQueue(queue wbot.Queue) Option { - return func(c *Crawler) { - c.queue = queue - } -} func WithStore(store wbot.Store) Option { return func(c *Crawler) { c.store = store diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go index 7f5e688..a401ba4 100644 --- a/plugin/fetcher/http_client.go +++ b/plugin/fetcher/http_client.go @@ -21,7 +21,7 @@ func NewHTTPClient() wbot.Fetcher { return &defaultHTTPClient{ client: &http.Client{ Jar: http.DefaultClient.Jar, - Timeout: 30 * time.Second, + Timeout: 10 * time.Second, }, } } @@ -112,8 +112,8 @@ func newHTTPTransport(purl string) *http.Transport { return &http.Transport{ Proxy: proxy, DialContext: (&net.Dialer{ - Timeout: 30 * time.Second, - KeepAlive: 30 * time.Second, + Timeout: 10 * time.Second, + KeepAlive: 10 * time.Second, DualStack: true, }).DialContext, ForceAttemptHTTP2: true, diff --git a/plugin/queue/bbolt.go b/plugin/queue/bbolt.go deleted file mode 100644 index 616346d..0000000 --- a/plugin/queue/bbolt.go +++ /dev/null @@ -1,98 +0,0 @@ -package queue - -// var ( -// ErrEmptyQueue = errors.New("queue is empty") -// ) - -// // prefix -// var prefix = "queue" - -// // BQueue -// type BQueue struct { -// prefix string -// db *bbolt.DB // Bolt stores its keys in byte-sorted order within a bucket. 
-// } - -// // NewBQueue -// func NewBQueue(db *bbolt.DB) (wbot.Queue, error) { -// if err := db.Update(func(tx *bbolt.Tx) error { -// _, err := tx.CreateBucketIfNotExists([]byte(prefix)) -// return err -// }); err != nil { -// return nil, err -// } - -// return &BQueue{ -// prefix: prefix, -// db: db, -// }, nil -// } - -// // Enqueue -// func (bq *BQueue) Enqueue(req wbot.Request) error { -// var buf bytes.Buffer -// if err := gob.NewEncoder(&buf).Encode(req); err != nil { -// return err -// } - -// return bq.db.Update(func(tx *bbolt.Tx) error { -// bu := tx.Bucket([]byte(prefix)) - -// var key = make([]byte, 8) -// seq, err := bu.NextSequence() -// if err != nil { -// return err -// } - -// binary.BigEndian.PutUint64(key, seq) - -// return bu.Put(key, buf.Bytes()) -// }) -// } - -// // Dequeue -// func (bq *BQueue) Dequeue() (wbot.Request, error) { -// // get from db -// var req wbot.Request -// if err := bq.db.Update(func(tx *bbolt.Tx) error { -// bu := tx.Bucket([]byte(prefix)) - -// c := bu.Cursor() - -// k, v := c.First() -// if k == nil { -// return ErrEmptyQueue -// } - -// if err := gob.NewDecoder(bytes.NewReader(v)).Decode(&req); err != nil { -// return err -// } - -// return c.Delete() -// }); err != nil { -// return wbot.Request{}, err -// } - -// return req, nil -// } - -// // Next -// func (bq *BQueue) Next() bool { -// return bq.db.View(func(tx *bbolt.Tx) error { -// bu := tx.Bucket([]byte(bq.prefix)) - -// c := bu.Cursor() - -// k, _ := c.First() -// if k == nil { -// return ErrEmptyQueue -// } - -// return nil -// }) == nil -// } - -// // Close -// func (bq *BQueue) Close() error { -// return bq.db.Close() -// } diff --git a/plugin/queue/in_memory.go b/plugin/queue/in_memory.go deleted file mode 100644 index 4cfd835..0000000 --- a/plugin/queue/in_memory.go +++ /dev/null @@ -1,74 +0,0 @@ -package queue - -import ( - "context" - "fmt" - "sync" - - "github.com/twiny/wbot" -) - -var ( - ErrQueueClosed = fmt.Errorf("queue is closed") -) - -type ( - defaultInMemoryQueue struct { - mu *sync.RWMutex - list []*wbot.Request - cond *sync.Cond - closed bool - } -) - -func NewInMemoryQueue() wbot.Queue { - queue := &defaultInMemoryQueue{ - mu: &sync.RWMutex{}, - list: make([]*wbot.Request, 0, 4096), - } - queue.cond = sync.NewCond(queue.mu) - return queue -} -func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) error { - q.mu.Lock() - defer q.mu.Unlock() - - if q.closed { - return ErrQueueClosed - } - - q.list = append(q.list, req) - q.cond.Broadcast() - - return nil -} -func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) { - q.mu.Lock() - defer q.mu.Unlock() - - switch { - case q.closed && len(q.list) == 0: - return nil, ErrQueueClosed - case len(q.list) == 0 && !q.closed: - q.cond.Wait() - } - - req := q.list[0] - q.list = q.list[1:] - return req, nil -} -func (q *defaultInMemoryQueue) Len() int { - q.mu.RLock() - defer q.mu.RUnlock() - - return len(q.list) -} -func (q *defaultInMemoryQueue) Close() error { - q.mu.Lock() - defer q.mu.Unlock() - - q.closed = true - q.cond.Broadcast() - - return nil -} diff --git a/plugin/queue/in_memory_test.go b/plugin/queue/in_memory_test.go deleted file mode 100644 index 81068c3..0000000 --- a/plugin/queue/in_memory_test.go +++ /dev/null @@ -1,44 +0,0 @@ -package queue - -import ( - "context" - "fmt" - "sync" - "testing" - - "github.com/twiny/wbot" -) - -func TestInMemoryPush(t *testing.T) { - -} - -func TestInMemoryPop(t *testing.T) { - -} - -// go test -benchmem -v -count=1 -run=^$ -bench 
^BenchmarkInMemoryPush$ github.com/twiny/wbot/plugin/queue -tags=integration,unit -func BenchmarkInMemoryPush(b *testing.B) { - queue := NewInMemoryQueue() - defer queue.Close() - - b.ResetTimer() // Reset the timer to ignore the setup time - - var wg sync.WaitGroup - for i := 0; i < b.N; i++ { - wg.Add(1) - go func(j int) { - defer wg.Done() - if err := queue.Push(context.TODO(), &wbot.Request{ - ID: fmt.Sprintf("%d", j), - }); err != nil { - b.Error(err) - } - }(i) - } - wg.Wait() -} - -func BenchmarkInMemoryPop(b *testing.B) { - -} diff --git a/plugin/store/in_memory.go b/plugin/store/in_memory.go index 23efbb8..a5092d8 100644 --- a/plugin/store/in_memory.go +++ b/plugin/store/in_memory.go @@ -23,7 +23,18 @@ func (s *defaultInMemoryStore) HasVisited(ctx context.Context, link string) (boo s.mu.RLock() defer s.mu.RUnlock() - return s.table[link], nil + hash, err := wbot.HashLink(link) + if err != nil { + return false, err + } + + _, found := s.table[hash] + if !found { + s.table[hash] = true + return false, nil + } + + return found, nil } func (s *defaultInMemoryStore) Close() error { return nil diff --git a/wbot.go b/wbot.go index 5251312..efa2164 100644 --- a/wbot.go +++ b/wbot.go @@ -21,14 +21,8 @@ type ( Close() error } - Queue interface { - Push(ctx context.Context, req *Request) error - Pop(ctx context.Context) (*Request, error) - Close() error - } - Store interface { - HasVisited(ctx context.Context, key string) (bool, error) + HasVisited(ctx context.Context, link string) (bool, error) Close() error } @@ -143,9 +137,11 @@ func FindLinks(body []byte) (hrefs []string) { hrefs = append(hrefs, src) } }) - - // ... Add other tags and attributes as necessary - + doc.Find("iframe[src]").Each(func(index int, item *goquery.Selection) { + if src, found := item.Attr("src"); found { + hrefs = append(hrefs, src) + } + }) return hrefs } From d824a9b4cd5ec1e80689604ccd75908e96a5aaae Mon Sep 17 00:00:00 2001 From: twiny Date: Mon, 21 Aug 2023 17:37:54 +0100 Subject: [PATCH 04/10] - added twiny/flare to signal quit & finished. 
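
twiny/flare is used here as a plain broadcast signal: workers select on Done() and any goroutine may call Cancel(). Conceptually, such a notifier is just a channel closed through sync.Once. The sketch below shows the idea; it is not the actual flare implementation, only an equivalent shape inferred from how Done() and Cancel() are used in this diff.

package main

import "sync"

// notifier broadcasts a one-time signal to any number of listeners.
type notifier struct {
	once sync.Once
	done chan struct{}
}

func newNotifier() *notifier {
	return &notifier{done: make(chan struct{})}
}

// Cancel fires the signal; repeated calls are safe no-ops.
func (n *notifier) Cancel() { n.once.Do(func() { close(n.done) }) }

// Done returns a channel that is closed once Cancel has been called.
func (n *notifier) Done() <-chan struct{} { return n.done }

func main() {
	quit := newNotifier()
	go quit.Cancel()
	<-quit.Done() // unblocks once Cancel has fired
}

Keeping two notifiers (finished for the natural end of a crawl, quit for an external stop) lets Read unblock on either condition without threading a context through every component.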
--- crawler/crawler.go | 96 +++++++++++++++++++--------------------------- go.mod | 1 + go.sum | 2 + 3 files changed, 43 insertions(+), 56 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 831d163..372a109 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -4,7 +4,6 @@ import ( "context" "net/url" "os" - "os/signal" "strings" "sync" "sync/atomic" @@ -13,12 +12,12 @@ import ( "github.com/twiny/wbot/plugin/fetcher" "github.com/twiny/wbot/plugin/store" + "github.com/twiny/flare" + clog "github.com/charmbracelet/log" ) type ( - Reader func(*wbot.Response) error - Crawler struct { wg sync.WaitGroup @@ -37,19 +36,17 @@ type ( queue chan *wbot.Request stream chan *wbot.Response - termLog *clog.Logger - - finished <-chan struct{} + finished flare.Notifier + quit flare.Notifier - ctx context.Context - cancel context.CancelFunc + termLog *clog.Logger } + + Reader func(*wbot.Response) error ) func New(opts ...Option) *Crawler { - ctx, cancel := context.WithCancel(context.Background()) - - options := clog.Options{ + logOpt := clog.Options{ TimeFormat: "2006-01-02 15:04:05", Level: clog.DebugLevel, Prefix: "[WBot]", @@ -71,10 +68,10 @@ func New(opts ...Option) *Crawler { queue: make(chan *wbot.Request, 1024), stream: make(chan *wbot.Response, 1024), - termLog: clog.NewWithOptions(os.Stdout, options), + termLog: clog.NewWithOptions(os.Stdout, logOpt), - ctx: ctx, - cancel: cancel, + finished: flare.New(), + quit: flare.New(), } for _, opt := range opts { @@ -84,11 +81,9 @@ func New(opts ...Option) *Crawler { c.wg.Add(c.cfg.parallel) c.termLog.Infof("starting %d workers...", c.cfg.parallel) for i := 0; i < c.cfg.parallel; i++ { - go c.crawl() + go c.crawler() } - go c.shutdown() - return c } @@ -110,9 +105,11 @@ func (c *Crawler) Crawl(links ...string) { if len(targets) == 0 { c.termLog.Errorf("no valid links found") - c.cancel() + c.finished.Cancel() + c.quit.Cancel() c.wg.Wait() - c.close() + close(c.queue) + close(c.stream) return } @@ -127,8 +124,10 @@ func (c *Crawler) Crawl(links ...string) { func (c *Crawler) Read(fn Reader) error { for { select { - case <-c.ctx.Done(): - return c.ctx.Err() + case <-c.quit.Done(): + return nil + case <-c.finished.Done(): + return nil case resp := <-c.stream: if err := fn(resp); err != nil { return err @@ -136,8 +135,13 @@ func (c *Crawler) Read(fn Reader) error { } } } -func (c *Crawler) Done() <-chan struct{} { - return c.ctx.Done() +func (c *Crawler) Close() { + c.termLog.Infof("done: crawled %d links", c.counter) + c.finished.Cancel() + c.quit.Cancel() + c.wg.Wait() + c.store.Close() + c.fetcher.Close() } func (c *Crawler) start(target *url.URL) { @@ -168,30 +172,36 @@ func (c *Crawler) start(target *url.URL) { atomic.AddInt32(&c.counter, 1) c.queue <- req } -func (c *Crawler) crawl() { +func (c *Crawler) crawler() { defer c.wg.Done() + c.termLog.Debugf("worker started") for { select { - case <-c.ctx.Done(): + case <-c.quit.Done(): + c.termLog.Debugf("worker quit") + return + case <-c.finished.Done(): + c.termLog.Debugf("worker finished") return + // continue case req := <-c.queue: if req.Depth > c.cfg.maxDepth { atomic.AddInt32(&c.counter, -1) - // panic: close of closed channel if c.counter == 0 { c.termLog.Infof("done: crawled %d links", c.counter) - c.cancel() - c.close() + c.finished.Cancel() + c.quit.Cancel() + return } - c.termLog.Errorf("max depth reached: %s, counter: %d, queue: %d", first64Chars(req.URL.String()), c.counter, len(c.queue)) + // c.termLog.Errorf("max depth reached: %s, counter: %d, queue: %d", 
first64Chars(req.URL.String()), c.counter, len(c.queue)) continue } c.limiter.wait(req.URL) - resp, err := c.fetcher.Fetch(c.ctx, req) + resp, err := c.fetcher.Fetch(context.TODO(), req) if err != nil { // todo: log atomic.AddInt32(&c.counter, -1) @@ -266,32 +276,6 @@ func (c *Crawler) crawl() { } } } -func (c *Crawler) shutdown() { - ctx, done := signal.NotifyContext(c.ctx, os.Interrupt) - defer done() - - <-ctx.Done() - c.termLog.Infof("closing...") - - go func() { - nctx, ndone := signal.NotifyContext(context.Background(), os.Interrupt) - defer ndone() - - <-nctx.Done() - c.termLog.Errorf("force shutdown.. good bye!") - os.Exit(0) - }() - - c.cancel() - c.wg.Wait() - c.close() -} -func (c *Crawler) close() { - close(c.queue) - close(c.stream) - c.store.Close() - c.fetcher.Close() -} func first64Chars(s string) string { if len(s) <= 64 { diff --git a/go.mod b/go.mod index a4e49fe..a444329 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/charmbracelet/log v0.2.3 github.com/temoto/robotstxt v1.1.2 + github.com/twiny/flare v0.1.0 github.com/twiny/flog/v2 v2.0.0 github.com/twiny/poxa v0.1.0 github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac diff --git a/go.sum b/go.sum index be46d47..7f1fd67 100644 --- a/go.sum +++ b/go.sum @@ -50,6 +50,8 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= +github.com/twiny/flare v0.1.0 h1:bq50IXYNUpiJULoIXXwerL1gwr+KBz49ayYgQo/CqnY= +github.com/twiny/flare v0.1.0/go.mod h1:Rlzkek5PDlOGFue015tC7fe/ROyeSx3hqy+jfAZGezQ= github.com/twiny/flog/v2 v2.0.0 h1:uHyWg1/q1P5zi/5jOiZyFZBfUZwd416v+9J+fnIBMZY= github.com/twiny/flog/v2 v2.0.0/go.mod h1:ujV9tY/hkP9302AgmXcpA/ds1/prAymijvIoxF9zzbM= github.com/twiny/poxa v0.1.0 h1:NMM1ZeRfGFVOz60NjHR4r78pQYkq09VyOjKjdkhkWsE= From 00f5415282f3c06b830474052e38425edc3864e6 Mon Sep 17 00:00:00 2001 From: twiny Date: Sat, 26 Aug 2023 23:24:43 +0100 Subject: [PATCH 05/10] - update. 
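
Among other changes, this revision normalizes every link into a ParsedURL whose hash becomes the visited-set key. Below is a simplified, self-contained sketch of that normalization; the real hashLink in this diff also path-unescapes before hashing, which is omitted here.

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"net/url"
	"strings"
)

// hashLink drops the scheme and "www." prefix and trims the trailing
// slash, so the same page reached over http or https, with or without
// www, dedupes to a single key.
func hashLink(raw string) (string, error) {
	u, err := url.Parse(raw)
	if err != nil {
		return "", err
	}
	u.Scheme = ""
	u.Host = strings.TrimPrefix(u.Host, "www.")
	clean := strings.TrimPrefix(strings.TrimRight(u.String(), "/"), "//")
	sum := sha256.Sum256([]byte(clean))
	return hex.EncodeToString(sum[:]), nil
}

func main() {
	a, _ := hashLink("http://www.example.com/a/")
	b, _ := hashLink("https://example.com/a")
	fmt.Println(a == b) // true: both normalize to "example.com/a"
}

With keys built this way, the in-memory store reduces to a mutex-guarded map keyed by ParsedURL.Hash, as the updated plugin/store/in_memory.go shows.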
--- crawler/crawler.go | 232 ++++++++++++++++++---------------- crawler/filter.go | 21 +-- crawler/limiter.go | 11 +- go.mod | 3 + go.sum | 15 +++ plugin/fetcher/http_client.go | 21 ++- plugin/queue/queue.go | 76 +++++++++++ plugin/store/in_memory.go | 18 +-- utilities.go | 52 ++++++++ utilities_test.go | 39 ++++++ wbot.go | 107 +++++++++------- 11 files changed, 408 insertions(+), 187 deletions(-) create mode 100644 plugin/queue/queue.go create mode 100644 utilities.go create mode 100644 utilities_test.go diff --git a/crawler/crawler.go b/crawler/crawler.go index 372a109..b102e4e 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -2,7 +2,7 @@ package crawler import ( "context" - "net/url" + "fmt" "os" "strings" "sync" @@ -10,6 +10,7 @@ import ( "github.com/twiny/wbot" "github.com/twiny/wbot/plugin/fetcher" + "github.com/twiny/wbot/plugin/queue" "github.com/twiny/wbot/plugin/store" "github.com/twiny/flare" @@ -17,6 +18,15 @@ import ( clog "github.com/charmbracelet/log" ) +type CrawlState int32 + +const ( + CrawlQuit CrawlState = iota + CrawlFinished + CrawlInProgress + CrawlPaused +) + type ( Crawler struct { wg sync.WaitGroup @@ -26,6 +36,7 @@ type ( fetcher wbot.Fetcher store wbot.Store logger wbot.Logger + queue wbot.Queue metrics wbot.MetricsMonitor filter *filter @@ -33,16 +44,14 @@ type ( robot *robortManager counter int32 - queue chan *wbot.Request stream chan *wbot.Response + errors chan error - finished flare.Notifier - quit flare.Notifier + state CrawlState + quit flare.Notifier termLog *clog.Logger } - - Reader func(*wbot.Response) error ) func New(opts ...Option) *Crawler { @@ -59,93 +68,112 @@ func New(opts ...Option) *Crawler { fetcher: fetcher.NewHTTPClient(), store: store.NewInMemoryStore(), // logger: newFileLogger(), + queue: queue.NewInMemoryQueue(), // metrics: newMetricsMonitor(), filter: newFilter(), limiter: newRateLimiter(), // robot: newRobotsManager(), - queue: make(chan *wbot.Request, 1024), - stream: make(chan *wbot.Response, 1024), + stream: make(chan *wbot.Response, 16), + errors: make(chan error, 16), termLog: clog.NewWithOptions(os.Stdout, logOpt), - finished: flare.New(), - quit: flare.New(), + quit: flare.New(), } for _, opt := range opts { opt(c) } - c.wg.Add(c.cfg.parallel) - c.termLog.Infof("starting %d workers...", c.cfg.parallel) - for i := 0; i < c.cfg.parallel; i++ { - go c.crawler() - } - return c } -func (c *Crawler) SetOption(opts ...Option) { - for _, opt := range opts { - opt(c) - } -} -func (c *Crawler) Crawl(links ...string) { - var targets []*url.URL +func (c *Crawler) Start(links ...string) error { + var targets []*wbot.ParsedURL for _, link := range links { - target, err := wbot.ValidURL(link) + target, err := wbot.NewURL(link) if err != nil { - c.termLog.Errorf("start: %s", err.Error()) + c.errors <- fmt.Errorf("start: %w", err) continue } targets = append(targets, target) } if len(targets) == 0 { - c.termLog.Errorf("no valid links found") - c.finished.Cancel() c.quit.Cancel() c.wg.Wait() - close(c.queue) close(c.stream) - return + return fmt.Errorf("no links to crawl") } c.termLog.Infof("crawling %d links...", len(targets)) for _, target := range targets { - c.start(target) + c.wg.Add(1) + go c.start(target) + } + + c.wg.Add(c.cfg.parallel) + c.termLog.Infof("starting %d workers...", c.cfg.parallel) + for i := 0; i < c.cfg.parallel; i++ { + go c.crawler() } c.wg.Wait() + return nil } -func (c *Crawler) Read(fn Reader) error { - for { - select { - case <-c.quit.Done(): - return nil - case <-c.finished.Done(): - return nil - case 
resp := <-c.stream: - if err := fn(resp); err != nil { - return err +func (c *Crawler) OnReponse(fn func(*wbot.Response) error) { + c.wg.Add(1) + go func() { + defer c.wg.Done() + + for { + select { + case <-c.quit.Done(): + return + case resp := <-c.stream: + if err := fn(resp); err != nil { + c.errors <- err + } } } - } + }() +} +func (c *Crawler) OnError(fn func(err error)) { + c.wg.Add(1) + + go func() { + defer c.wg.Done() + + for { + select { + case <-c.quit.Done(): + return + case err := <-c.errors: + fn(err) + } + } + }() +} +func (c *Crawler) Stats() map[string]any { + return map[string]any{} } func (c *Crawler) Close() { - c.termLog.Infof("done: crawled %d links", c.counter) - c.finished.Cancel() + c.termLog.Debugf("closing...") c.quit.Cancel() c.wg.Wait() c.store.Close() c.fetcher.Close() } -func (c *Crawler) start(target *url.URL) { - // first request +func (c *Crawler) start(target *wbot.ParsedURL) { + defer func() { + c.wg.Done() + atomic.AddInt32(&c.counter, 1) + }() + param := &wbot.Param{ MaxBodySize: c.cfg.maxBodySize, UserAgent: c.cfg.userAgents.Next(), @@ -155,78 +183,65 @@ func (c *Crawler) start(target *url.URL) { param.Proxy = c.cfg.proxies.Next() } - hostname, err := wbot.Hostname(target.Hostname()) - if err != nil { - // todo: log - c.termLog.Errorf("hostname -> invalid url: %s", target) - return + req := &wbot.Request{ + Target: target, + Param: param, + Depth: 0, } - req := &wbot.Request{ - BaseHost: hostname, - URL: target, - Depth: 1, - Param: param, + if err := c.queue.Push(context.TODO(), req); err != nil { + c.errors <- fmt.Errorf("push: %w", err) + return } - atomic.AddInt32(&c.counter, 1) - c.queue <- req } func (c *Crawler) crawler() { defer c.wg.Done() - c.termLog.Debugf("worker started") for { select { case <-c.quit.Done(): - c.termLog.Debugf("worker quit") + c.termLog.Debugf("quit") + c.queue.Close() // must close queue before quit return - case <-c.finished.Done(): - c.termLog.Debugf("worker finished") - return - // continue - case req := <-c.queue: + default: + func() { + defer atomic.AddInt32(&c.counter, -1) + }() + + req, err := c.queue.Pop(context.TODO()) + if err != nil { + // atomic.AddInt32(&c.counter, -1) + // c.termLog.Errorf("pop: %s", err.Error()) + continue + } + if req.Depth > c.cfg.maxDepth { - atomic.AddInt32(&c.counter, -1) + // atomic.AddInt32(&c.counter, -1) - if c.counter == 0 { - c.termLog.Infof("done: crawled %d links", c.counter) - c.finished.Cancel() + if c.queue.Len()+atomic.LoadInt32(&c.counter) == 0 { c.quit.Cancel() - return + fmt.Println("queue len:", c.queue.Len()) + continue } - // c.termLog.Errorf("max depth reached: %s, counter: %d, queue: %d", first64Chars(req.URL.String()), c.counter, len(c.queue)) + continue } - c.limiter.wait(req.URL) + c.limiter.wait(req.Target) resp, err := c.fetcher.Fetch(context.TODO(), req) if err != nil { // todo: log - atomic.AddInt32(&c.counter, -1) - c.termLog.Errorf("fetcher -> %s", err.Error()) + // atomic.AddInt32(&c.counter, -1) + c.errors <- fmt.Errorf("fetch: %w", err) continue } atomic.AddInt32(&req.Depth, 1) - for _, link := range resp.NextURLs { - u, err := req.ResolveURL(link) - if err != nil { - // c.termLog.Errorf("crwal: resolve url %s -> %s", link, err.Error()) - continue - } - - hostname, err := wbot.Hostname(u.Hostname()) - if err != nil { - // todo: log - // c.termLog.Errorf("hostname -> %s", err.Error()) - continue - } - - if !strings.Contains(u.Hostname(), hostname) { - // todo: log - // c.termLog.Errorf("invalid hostname: %s", u) + for _, target := range resp.NextURLs { 
+ if !strings.Contains(target.URL.Host, req.Target.Root) { + c.errors <- fmt.Errorf("hostname check: %w", err) continue } @@ -235,32 +250,32 @@ func (c *Crawler) crawler() { // continue // } - if !c.filter.allow(u) { - // todo: log - // c.termLog.Errorf("filter -> %s", req.URL) + if !c.filter.allow(target) { + c.errors <- fmt.Errorf("allow check: %w", err) continue } - if visited, err := c.store.HasVisited(context.TODO(), u.String()); visited { + if visited, err := c.store.HasVisited(context.TODO(), target); visited { if err != nil { - // todo: log - // c.termLog.Errorf("has visited -> %s", err.Error()) - // continue + c.errors <- fmt.Errorf("has visited 1: %w", err) + continue } - // todo: log - // c.termLog.Errorf("already visited: %s", req.URL) + c.errors <- fmt.Errorf("has visited 2: %w", err) continue } nextReq := &wbot.Request{ - BaseHost: hostname, - URL: u, - Depth: req.Depth, - Param: req.Param, + Target: target, + Depth: req.Depth, + Param: req.Param, + } + + if err := c.queue.Push(context.TODO(), nextReq); err != nil { + c.errors <- fmt.Errorf("push: %w", err) + continue } atomic.AddInt32(&c.counter, 1) - c.queue <- nextReq } // if c.log != nil { @@ -270,9 +285,14 @@ func (c *Crawler) crawler() { // stream c.stream <- resp - atomic.AddInt32(&c.counter, -1) + // atomic.AddInt32(&c.counter, -1) - c.termLog.Infof("crawled: %s, depth: %d, counter: %d, queue: %d", first64Chars(req.URL.String()), req.Depth, c.counter, len(c.queue)) + c.termLog.Infof("crawled: %s, depth: %d, counter: %d, queue: %d", first64Chars(req.Target.URL.String()), req.Depth, c.counter, c.queue.Len()) + + if c.queue.Len()+atomic.LoadInt32(&c.counter) == 0 { + c.queue.Close() // must close queue before quit + continue + } } } } diff --git a/crawler/filter.go b/crawler/filter.go index eb9356f..7d004c6 100644 --- a/crawler/filter.go +++ b/crawler/filter.go @@ -1,14 +1,13 @@ package crawler import ( - "net/url" "regexp" "github.com/twiny/wbot" ) var ( - badExtensions = regexp.MustCompile(`\.(png|jpg|jpeg|gif|ico|eps|pdf|iso|mp3|mp4|zip|aif|mpa|wav|wma|7z|deb|pkg|rar|rpm|bin|dmg|dat|tar|exe|ps|psd|svg|tif|tiff|pps|ppt|pptx|xls|xlsx|wmv|doc|docx|txt|mov|mpl)$`) + badExtensions = regexp.MustCompile(`\.(png|jpg|jpeg|gif|ico|eps|pdf|iso|mp3|mp4|zip|aif|mpa|wav|wma|7z|deb|pkg|rar|rpm|bin|dmg|dat|tar|exe|ps|psd|svg|tif|tiff|pps|ppt|pptx|xls|xlsx|wmv|doc|docx|txt|mov|mpl|css|js)$`) ) type ( @@ -28,18 +27,12 @@ func newFilter(rules ...*wbot.FilterRule) *filter { return f } - -func (f *filter) allow(link *url.URL) bool { - hostname, err := wbot.Hostname(link.String()) - if err != nil { - // review: double check this case - } - - if badExtensions.MatchString(link.String()) { +func (f *filter) allow(u *wbot.ParsedURL) bool { + if badExtensions.MatchString(u.URL.Path) { return false } - rule, found := f.rules[hostname] + rule, found := f.rules[u.Root] if !found { // check if there is a wildcard rule rule, found = f.rules["*"] @@ -49,16 +42,16 @@ func (f *filter) allow(link *url.URL) bool { } for _, pattern := range rule.Disallow { - if pattern.MatchString(link.String()) { + if pattern.MatchString(u.URL.String()) { return false } } for _, pattern := range rule.Allow { - if pattern.MatchString(link.String()) { + if pattern.MatchString(u.URL.String()) { return true } } - return false // review: default deny + return false // default deny } diff --git a/crawler/limiter.go b/crawler/limiter.go index dcab51e..eb7cf0d 100644 --- a/crawler/limiter.go +++ b/crawler/limiter.go @@ -1,7 +1,6 @@ package crawler import ( - "net/url" "strconv" 
"strings" "time" @@ -50,14 +49,8 @@ func newRateLimiter(limits ...*wbot.RateLimit) *rateLimiter { return rl } -func (l *rateLimiter) wait(link *url.URL) { - hostname, err := wbot.Hostname(link.Hostname()) - if err != nil { - // case err, play safe. - hostname = "*" - } - - limit, found := l.table[hostname] +func (l *rateLimiter) wait(u *wbot.ParsedURL) { + limit, found := l.table[u.Root] if !found { limit = l.table["*"] } diff --git a/go.mod b/go.mod index a444329..d88a154 100644 --- a/go.mod +++ b/go.mod @@ -18,6 +18,8 @@ require ( github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/benbjohnson/clock v1.3.0 // indirect github.com/charmbracelet/lipgloss v0.7.1 // indirect + github.com/go-echarts/go-echarts/v2 v2.2.7 // indirect + github.com/go-echarts/statsview v0.3.4 // indirect github.com/go-logfmt/logfmt v0.6.0 // indirect github.com/lucasb-eyer/go-colorful v1.2.0 // indirect github.com/mattn/go-isatty v0.0.18 // indirect @@ -25,6 +27,7 @@ require ( github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.15.2 // indirect github.com/rivo/uniseg v0.2.0 // indirect + github.com/rs/cors v1.9.0 // indirect golang.org/x/net v0.12.0 // indirect golang.org/x/sys v0.10.0 // indirect golang.org/x/text v0.11.0 // indirect diff --git a/go.sum b/go.sum index 7f1fd67..72105f2 100644 --- a/go.sum +++ b/go.sum @@ -17,6 +17,11 @@ github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtM github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-echarts/go-echarts/v2 v2.2.3/go.mod h1:6TOomEztzGDVDkOSCFBq3ed7xOYfbOqhaBzD0YV771A= +github.com/go-echarts/go-echarts/v2 v2.2.7 h1:mtFAuoqQ7McdlKrJ0gLexwxMPT7yoscDDhULNwPOxBk= +github.com/go-echarts/go-echarts/v2 v2.2.7/go.mod h1:VEeyPT5Odx/UHeuxtIAHGu2+87MWGA5OBaZ120NFi/w= +github.com/go-echarts/statsview v0.3.4 h1:CCuytRAutdnF901NrR4BzSjHXjUp8OyA3/iopgG/1/Y= +github.com/go-echarts/statsview v0.3.4/go.mod h1:AehKjL9cTFMeIo5QdV8sQO43vFmfY65X5GMWa3XMciY= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -28,6 +33,9 @@ github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v50 v50.2.0/go.mod h1:VBY8FB6yPIjrtKhozXv4FQupxKLS6H4m6xFZlT43q8Q= github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/mattn/go-isatty v0.0.18 h1:DOKFKCQ7FNG2L1rbrmstDN4QVRdS89Nkh85u68Uwp98= @@ -44,8 +52,12 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.2.0 
h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= +github.com/rs/cors v1.9.0 h1:l9HGsTsHJcvW14Nk7J9KFz8bzeAWXn3CG6bgt7LsrAE= +github.com/rs/cors v1.9.0/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= @@ -121,5 +133,8 @@ google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp0 google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go index a401ba4..ee6ebdd 100644 --- a/plugin/fetcher/http_client.go +++ b/plugin/fetcher/http_client.go @@ -73,7 +73,7 @@ func (f *defaultHTTPClient) fetch(req *wbot.Request) (*wbot.Response, error) { resp, err := f.client.Do(&http.Request{ Method: http.MethodGet, - URL: req.URL, + URL: req.Target.URL, Header: header, Proto: "HTTP/1.1", ProtoMajor: 1, @@ -93,11 +93,26 @@ func (f *defaultHTTPClient) fetch(req *wbot.Request) (*wbot.Response, error) { resp.Body.Close() + links := wbot.FindLinks(body) + + var nextURLs []*wbot.ParsedURL + for _, link := range links { + absURL, err := req.ResolveURL(link) + if err != nil { + continue + } + parsedURL, err := wbot.NewURL(absURL.String()) + if err != nil { + continue + } + nextURLs = append(nextURLs, parsedURL) + } + return &wbot.Response{ - URL: req.URL, + URL: req.Target, Status: resp.StatusCode, Body: body, - NextURLs: wbot.FindLinks(body), + NextURLs: nextURLs, Depth: req.Depth, }, nil } diff --git a/plugin/queue/queue.go b/plugin/queue/queue.go new file mode 100644 index 0000000..bb1fa4f --- /dev/null +++ b/plugin/queue/queue.go @@ -0,0 +1,76 @@ +package queue + +import ( + "context" + "fmt" + "sync" + + "github.com/twiny/wbot" +) + +var ( + ErrQueueClosed = fmt.Errorf("queue is closed") +) + +type defaultInMemoryQueue struct { + mu sync.Mutex + list []*wbot.Request + cond *sync.Cond + closed bool +} + +func NewInMemoryQueue() wbot.Queue { + queue := &defaultInMemoryQueue{ + list: make([]*wbot.Request, 0, 4096), + } + queue.cond = sync.NewCond(&queue.mu) + return queue +} + +func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) error { + q.mu.Lock() + defer q.mu.Unlock() + + if q.closed { + return ErrQueueClosed + } + + q.list = 
append(q.list, req) + q.cond.Broadcast() + + return nil +} + +func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) { + q.mu.Lock() + defer q.mu.Unlock() + + for len(q.list) == 0 && !q.closed { + q.cond.Wait() + } + + if q.closed && len(q.list) == 0 { + return nil, ErrQueueClosed + } + + req := q.list[0] + q.list = q.list[1:] + return req, nil +} + +func (q *defaultInMemoryQueue) Len() int32 { + q.mu.Lock() + defer q.mu.Unlock() + + return int32(len(q.list)) +} + +func (q *defaultInMemoryQueue) Close() error { + q.mu.Lock() + defer q.mu.Unlock() + + q.closed = true + q.cond.Broadcast() + + return nil +} diff --git a/plugin/store/in_memory.go b/plugin/store/in_memory.go index a5092d8..9d8e755 100644 --- a/plugin/store/in_memory.go +++ b/plugin/store/in_memory.go @@ -9,7 +9,7 @@ import ( type ( defaultInMemoryStore struct { - mu sync.RWMutex + mu sync.Mutex table map[string]bool } ) @@ -19,23 +19,19 @@ func NewInMemoryStore() wbot.Store { table: make(map[string]bool), } } -func (s *defaultInMemoryStore) HasVisited(ctx context.Context, link string) (bool, error) { - s.mu.RLock() - defer s.mu.RUnlock() +func (s *defaultInMemoryStore) HasVisited(ctx context.Context, link *wbot.ParsedURL) (bool, error) { + s.mu.Lock() + defer s.mu.Unlock() - hash, err := wbot.HashLink(link) - if err != nil { - return false, err - } - - _, found := s.table[hash] + _, found := s.table[link.Hash] if !found { - s.table[hash] = true + s.table[link.Hash] = true return false, nil } return found, nil } func (s *defaultInMemoryStore) Close() error { + clear(s.table) return nil } diff --git a/utilities.go b/utilities.go new file mode 100644 index 0000000..93ad490 --- /dev/null +++ b/utilities.go @@ -0,0 +1,52 @@ +package wbot + +import ( + "fmt" + "net/url" + "strings" + + "github.com/weppos/publicsuffix-go/publicsuffix" +) + +var tlds = map[string]bool{ + "ac": true, "ae": true, "aero": true, "af": true, "ag": true, "am": true, + "as": true, "asia": true, "at": true, "au": true, "ax": true, "be": true, + "bg": true, "bi": true, "biz": true, "bj": true, "br": true, "by": true, + "ca": true, "cat": true, "cc": true, "cl": true, "cn": true, "co": true, + "com": true, "coop": true, "cx": true, "de": true, "dk": true, "dm": true, + "dz": true, "edu": true, "ee": true, "eu": true, "fi": true, "fo": true, + "fr": true, "ge": true, "gl": true, "gov": true, "gs": true, "hk": true, + "hr": true, "hu": true, "id": true, "ie": true, "in": true, "info": true, + "int": true, "io": true, "ir": true, "is": true, "je": true, "jobs": true, + "kg": true, "kr": true, "la": true, "lu": true, "lv": true, "ly": true, + "ma": true, "md": true, "me": true, "mk": true, "mobi": true, "ms": true, + "mu": true, "mx": true, "name": true, "net": true, "nf": true, "ng": true, + "no": true, "nu": true, "nz": true, "org": true, "pl": true, "pr": true, + "pro": true, "pw": true, "ro": true, "ru": true, "sc": true, "se": true, + "sg": true, "sh": true, "si": true, "sk": true, "sm": true, "st": true, + "so": true, "su": true, "tc": true, "tel": true, "tf": true, "th": true, + "tk": true, "tl": true, "tm": true, "tn": true, "travel": true, "tw": true, + "tv": true, "tz": true, "ua": true, "uk": true, "us": true, "uz": true, + "vc": true, "ve": true, "vg": true, "ws": true, "xxx": true, "rs": true, +} + +func Hostname(link string) (string, error) { + u, err := url.Parse(link) + if err != nil { + return "", fmt.Errorf("failed to parse URL: %w", err) + } + + // Extract domain and TLD using publicsuffix-go + domain, err := 
publicsuffix.Domain(u.Hostname()) + if err != nil { + return "", fmt.Errorf("failed to extract domain: %w", err) + } + + // Ensure that the extracted TLD is in our allowed list + tld := domain[strings.LastIndex(domain, ".")+1:] + if !tlds[tld] { + return "", fmt.Errorf("invalid TLD: %s", tld) + } + + return domain, nil +} diff --git a/utilities_test.go b/utilities_test.go new file mode 100644 index 0000000..c787682 --- /dev/null +++ b/utilities_test.go @@ -0,0 +1,39 @@ +package wbot + +import "testing" + +func TestHostname(t *testing.T) { + validURLs := []struct { + input string + expected string + }{ + {"http://www.google.com", "google.com"}, + {"https://sub.domain.google.com", "google.com"}, + {"http://beta.moon.facebook.com", "facebook.com"}, + // ... Add more valid test cases here + } + + invalidURLs := []string{ + "http://www.google.invalidTLD", + "https://example.com.xxy", + "ftp://example.site", // assuming "site" is not in your TLDs map + // ... Add more invalid test cases here + } + + for _, tt := range validURLs { + got, err := Hostname(tt.input) + if err != nil { + t.Errorf("Hostname(%q) returned unexpected error: %v", tt.input, err) + } + if got != tt.expected { + t.Errorf("Hostname(%q) = %q; want %q", tt.input, got, tt.expected) + } + } + + for _, url := range invalidURLs { + _, err := Hostname(url) + if err == nil { + t.Errorf("Hostname(%q) expected to return an error, but got none", url) + } + } +} diff --git a/wbot.go b/wbot.go index efa2164..690f809 100644 --- a/wbot.go +++ b/wbot.go @@ -22,7 +22,14 @@ type ( } Store interface { - HasVisited(ctx context.Context, link string) (bool, error) + HasVisited(ctx context.Context, u *ParsedURL) (bool, error) + Close() error + } + + Queue interface { + Push(ctx context.Context, req *Request) error + Pop(ctx context.Context) (*Request, error) + Len() int32 Close() error } @@ -32,39 +39,43 @@ type ( } MetricsMonitor interface { - IncrementTotalRequests() - IncrementSuccessfulRequests() - IncrementFailedRequests() - IncrementRetries() - IncrementRedirects() + IncTotalRequests() + IncSuccessfulRequests() + IncFailedRequests() + IncRetries() + IncRedirects() - IncrementTotalPages() - IncrementCrawledPages() - IncrementSkippedPages() - IncrementParsedLinks() + IncTotalPages() + IncCrawledPages() + IncSkippedPages() + IncParsedLinks() - IncrementClientErrors() - IncrementServerErrors() + IncClientErrors() + IncServerErrors() } Request struct { - ID string - BaseHost string - URL *url.URL - Depth int32 - Param *Param + Target *ParsedURL + Param *Param + Depth int32 } Response struct { - URL *url.URL + URL *ParsedURL Status int Body []byte - NextURLs []string + NextURLs []*ParsedURL Depth int32 ElapsedTime time.Duration Err error } + ParsedURL struct { + Hash string + Root string + URL *url.URL + } + Param struct { Proxy string UserAgent string @@ -101,7 +112,7 @@ func (r *Request) ResolveURL(u string) (*url.URL, error) { return nil, fmt.Errorf("url is a fragment") } - absURL, err := r.URL.Parse(u) + absURL, err := r.Target.URL.Parse(u) if err != nil { return nil, err } @@ -145,12 +156,41 @@ func FindLinks(body []byte) (hrefs []string) { return hrefs } -func HashLink(link string) (string, error) { - parsedLink, err := url.Parse(link) +func NewURL(raw string) (*ParsedURL, error) { + u, err := url.Parse(raw) if err != nil { - return "", err + return nil, err } + if u.Scheme != "http" && u.Scheme != "https" { + return nil, fmt.Errorf("invalid scheme: %s", u.Scheme) + } + + // Extract domain and TLD using publicsuffix-go + domain, err := 
publicsuffix.Domain(u.Hostname())
+	if err != nil {
+		return nil, fmt.Errorf("failed to extract domain: %w", err)
+	}
+
+	// Ensure that the extracted TLD is in our allowed list
+	tld := domain[strings.LastIndex(domain, ".")+1:]
+	if !tlds[tld] {
+		return nil, fmt.Errorf("invalid TLD: %s", tld)
+	}
+
+	hash, err := hashLink(*u)
+	if err != nil {
+		return nil, fmt.Errorf("invalid hash: %s", hash)
+	}
+
+	return &ParsedURL{
+		Hash: hash,
+		Root: domain,
+		URL:  u,
+	}, nil
+}
+
+func hashLink(parsedLink url.URL) (string, error) {
 	parsedLink.Scheme = ""
 
 	parsedLink.Host = strings.TrimPrefix(parsedLink.Host, "www.")
@@ -170,24 +210,3 @@ func HashLink(link string) (string, error) {
 
 	return hex.EncodeToString(hasher.Sum(nil)), nil
 }
-
-func Hostname(link string) (string, error) {
-	hostname, err := publicsuffix.Domain(link)
-	if err != nil {
-		return "", fmt.Errorf("failed to get domain: %w", err)
-	}
-	return hostname, nil
-}
-
-func ValidURL(raw string) (*url.URL, error) {
-	u, err := url.Parse(raw)
-	if err != nil {
-		return nil, err
-	}
-
-	if u.Scheme != "http" && u.Scheme != "https" {
-		return nil, fmt.Errorf("invalid scheme: %s", u.Scheme)
-	}
-
-	return u, nil
-}

From 0bb5cb641080cf96cfa5d99b6e2bb082f3eca625 Mon Sep 17 00:00:00 2001
From: twiny
Date: Wed, 14 Feb 2024 20:59:47 +0100
Subject: [PATCH 06/10] fix(crawler): crawl gets stuck at defer stop

---
 .gitignore                    |   3 +-
 Makefile                      |   6 +
 README.md                     |  10 +-
 crawler/config.go             |  18 +--
 crawler/crawler.go            | 226 ++++++++++++++++------------------
 crawler/option.go             |   5 -
 crawler/robot.go              |  36 ++++--
 go.mod                        |  24 ++--
 go.sum                        |  44 +++----
 plugin/fetcher/http_client.go |  52 ++++----
 plugin/logger/logger.go       |  57 ---------
 plugin/monitor/monitor.go     |  22 ++--
 plugin/queue/queue.go         |  23 ++--
 plugin/store/in_memory.go     |   3 +
 wbot.go                       |  26 ++--
 15 files changed, 242 insertions(+), 313 deletions(-)
 create mode 100644 Makefile
 delete mode 100644 plugin/logger/logger.go

diff --git a/.gitignore b/.gitignore
index 7b5ae83..04be717 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 tests/
-.idea.md
\ No newline at end of file
+.idea.md
+*.*prof
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7c8dbea
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+clean:
+start:
+init:
+build:
+usage:
+.PHONY: clean start init build usage
\ No newline at end of file
diff --git a/README.md b/README.md
index 314f725..229172b 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,18 @@
-## WBot
+# WBot
 
 A configurable, thread-safe web crawler that provides a minimal interface for crawling and downloading web pages.
 
-### Features:
+## Features
+
 - Clean minimal API.
 - Configurable: MaxDepth, MaxBodySize, Rate Limit, Parallelism, User Agent & Proxy rotation.
 - Memory-efficient, thread-safe.
 - Provides built-in interfaces: Fetcher, Store, Queue & a Logger.
 
+## [Examples & API](https://github.com/twiny/wbot/wiki)
-### [Examples & API](https://github.com/twiny/wbot/wiki)
 
+## TODO
-### TODO
+
 - [ ] Add support for robots.txt.
 - [ ] Add test cases.
 - [ ] Implement `Fetch` using Chromedp.
 - [ ] Add more examples.
 - [ ] Add documentation.
 
 ### Bugs
+
 Bugs or suggestions? Please visit the [issue tracker](https://github.com/twiny/wbot/issues).
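
For orientation, here is a minimal usage sketch of the API as it stands after this patch. It is assembled only from signatures visible in these diffs (New, OnReponse, OnError, Start, Stop); the import paths follow the repository layout, and the seed URL and handler bodies are illustrative.

package main

import (
	"log"

	"github.com/twiny/wbot"
	"github.com/twiny/wbot/crawler"
)

func main() {
	bot := crawler.New()
	defer bot.Stop()

	// Handlers are registered before Start, which blocks until the
	// crawl finishes or Stop is called.
	bot.OnReponse(func(resp *wbot.Response) {
		log.Printf("%d %s", resp.Status, resp.URL.URL)
	})
	bot.OnError(func(err error) {
		log.Printf("crawl error: %v", err)
	})

	if err := bot.Start("https://example.com"); err != nil {
		log.Fatal(err)
	}
}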
diff --git a/crawler/config.go b/crawler/config.go index 7173c39..af45e97 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -41,29 +41,23 @@ func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config if len(userAgents) > 0 { uaList := poxa.NewSpinner(userAgents...) - if uaList == nil { - // can be ignored. + if uaList != nil { + conf.userAgents = uaList } - - conf.userAgents = uaList } if len(referrers) > 0 { refList := poxa.NewSpinner(referrers...) - if refList == nil { - // can be ignored. + if refList != nil { + conf.referrers = refList } - - conf.referrers = refList } if len(proxies) > 0 { proxyList := poxa.NewSpinner(proxies...) - if proxyList == nil { - // can be ignored. + if proxyList != nil { + conf.proxies = proxyList } - - conf.proxies = proxyList } return conf diff --git a/crawler/crawler.go b/crawler/crawler.go index b102e4e..b995b91 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -2,140 +2,148 @@ package crawler import ( "context" + "errors" "fmt" - "os" "strings" "sync" "sync/atomic" "github.com/twiny/wbot" "github.com/twiny/wbot/plugin/fetcher" + "github.com/twiny/wbot/plugin/monitor" "github.com/twiny/wbot/plugin/queue" "github.com/twiny/wbot/plugin/store" "github.com/twiny/flare" - - clog "github.com/charmbracelet/log" ) -type CrawlState int32 - const ( - CrawlQuit CrawlState = iota - CrawlFinished - CrawlInProgress - CrawlPaused + crawlInProgress int32 = iota + crawlFinished ) type ( Crawler struct { - wg sync.WaitGroup - + wg *sync.WaitGroup cfg *config fetcher wbot.Fetcher store wbot.Store - logger wbot.Logger queue wbot.Queue metrics wbot.MetricsMonitor filter *filter limiter *rateLimiter - robot *robortManager - - counter int32 - stream chan *wbot.Response - errors chan error + robot *robotManager - state CrawlState - quit flare.Notifier + stream chan *wbot.Response + errors chan error - termLog *clog.Logger + once *sync.Once + status int32 + quit flare.Notifier } ) func New(opts ...Option) *Crawler { - logOpt := clog.Options{ - TimeFormat: "2006-01-02 15:04:05", - Level: clog.DebugLevel, - Prefix: "[WBot]", - ReportTimestamp: true, - } - c := &Crawler{ + wg: new(sync.WaitGroup), cfg: newConfig(-1, nil, nil, nil), fetcher: fetcher.NewHTTPClient(), store: store.NewInMemoryStore(), - // logger: newFileLogger(), - queue: queue.NewInMemoryQueue(), - // metrics: newMetricsMonitor(), + queue: queue.NewInMemoryQueue(), + metrics: monitor.NewmetricsMonitor(), filter: newFilter(), limiter: newRateLimiter(), - // robot: newRobotsManager(), + robot: newRobotManager(false), - stream: make(chan *wbot.Response, 16), - errors: make(chan error, 16), + stream: make(chan *wbot.Response, 1024), + errors: make(chan error, 1024), - termLog: clog.NewWithOptions(os.Stdout, logOpt), - - quit: flare.New(), + once: new(sync.Once), + status: crawlFinished, + quit: flare.New(), } for _, opt := range opts { opt(c) } + // this routine waits for quit signal + go func() { + <-c.quit.Done() + fmt.Println("Crawler is shutting down") + atomic.StoreInt32(&c.status, crawlFinished) + + close(c.stream) + close(c.errors) + c.wg.Wait() + c.queue.Close() + c.store.Close() + c.fetcher.Close() + }() + return c } func (c *Crawler) Start(links ...string) error { - var targets []*wbot.ParsedURL + var ( + targets []*wbot.ParsedURL + errs []error + ) for _, link := range links { target, err := wbot.NewURL(link) if err != nil { - c.errors <- fmt.Errorf("start: %w", err) + errs = append(errs, err) continue } targets = append(targets, target) } - if len(targets) == 0 { + if 
len(errs) > 0 { c.quit.Cancel() - c.wg.Wait() - close(c.stream) - return fmt.Errorf("no links to crawl") + return fmt.Errorf("invalid targets: %v", errors.Join(errs...)) } - c.termLog.Infof("crawling %d links...", len(targets)) + if len(targets) == 0 { + c.quit.Cancel() + return fmt.Errorf("no valid targets: %v", errs) + } + atomic.StoreInt32(&c.status, crawlInProgress) for _, target := range targets { c.wg.Add(1) - go c.start(target) + go func(t *wbot.ParsedURL) { + if err := c.start(t); err != nil { + c.errors <- fmt.Errorf("start: %w", err) + } + }(target) } - c.wg.Add(c.cfg.parallel) - c.termLog.Infof("starting %d workers...", c.cfg.parallel) for i := 0; i < c.cfg.parallel; i++ { - go c.crawler() + c.wg.Add(1) + go c.crawler(i) } + fmt.Println("Crawler is running") c.wg.Wait() + fmt.Println("Crawler has stopped") + return nil } -func (c *Crawler) OnReponse(fn func(*wbot.Response) error) { +func (c *Crawler) OnReponse(fn func(*wbot.Response)) { c.wg.Add(1) go func() { defer c.wg.Done() - for { select { case <-c.quit.Done(): return - case resp := <-c.stream: - if err := fn(resp); err != nil { - c.errors <- err + case resp, ok := <-c.stream: + if ok { + fn(resp) } } } @@ -143,16 +151,16 @@ func (c *Crawler) OnReponse(fn func(*wbot.Response) error) { } func (c *Crawler) OnError(fn func(err error)) { c.wg.Add(1) - go func() { defer c.wg.Done() - for { select { case <-c.quit.Done(): return - case err := <-c.errors: - fn(err) + case err, ok := <-c.errors: + if ok { + fn(err) + } } } }() @@ -160,19 +168,12 @@ func (c *Crawler) OnError(fn func(err error)) { func (c *Crawler) Stats() map[string]any { return map[string]any{} } -func (c *Crawler) Close() { - c.termLog.Debugf("closing...") +func (c *Crawler) Stop() { c.quit.Cancel() - c.wg.Wait() - c.store.Close() - c.fetcher.Close() } -func (c *Crawler) start(target *wbot.ParsedURL) { - defer func() { - c.wg.Done() - atomic.AddInt32(&c.counter, 1) - }() +func (c *Crawler) start(target *wbot.ParsedURL) error { + defer c.wg.Done() param := &wbot.Param{ MaxBodySize: c.cfg.maxBodySize, @@ -190,41 +191,35 @@ func (c *Crawler) start(target *wbot.ParsedURL) { } if err := c.queue.Push(context.TODO(), req); err != nil { - c.errors <- fmt.Errorf("push: %w", err) - return + return fmt.Errorf("push: %w", err) } + fmt.Printf("Crawling %s\n", target.URL.String()) + + return nil } -func (c *Crawler) crawler() { +func (c *Crawler) crawler(id int) { defer c.wg.Done() for { select { case <-c.quit.Done(): - c.termLog.Debugf("quit") - c.queue.Close() // must close queue before quit + fmt.Printf("worker %d is stopping\n", id) return default: - func() { - defer atomic.AddInt32(&c.counter, -1) - }() - - req, err := c.queue.Pop(context.TODO()) - if err != nil { - // atomic.AddInt32(&c.counter, -1) - // c.termLog.Errorf("pop: %s", err.Error()) - continue + if atomic.LoadInt32(&c.status) == crawlFinished { + c.quit.Cancel() + return } - if req.Depth > c.cfg.maxDepth { - // atomic.AddInt32(&c.counter, -1) - - if c.queue.Len()+atomic.LoadInt32(&c.counter) == 0 { - c.quit.Cancel() - fmt.Println("queue len:", c.queue.Len()) - continue - } + if c.queue.IsDone() { + c.quit.Cancel() + return + } + req, err := c.queue.Pop(context.TODO()) + if err != nil { + c.errors <- fmt.Errorf("pop: %w", err) continue } @@ -232,8 +227,6 @@ func (c *Crawler) crawler() { resp, err := c.fetcher.Fetch(context.TODO(), req) if err != nil { - // todo: log - // atomic.AddInt32(&c.counter, -1) c.errors <- fmt.Errorf("fetch: %w", err) continue } @@ -241,7 +234,8 @@ func (c *Crawler) crawler() { 
atomic.AddInt32(&req.Depth, 1) for _, target := range resp.NextURLs { if !strings.Contains(target.URL.Host, req.Target.Root) { - c.errors <- fmt.Errorf("hostname check: %w", err) + // can be ignored - better add log level + // c.errors <- fmt.Errorf("hostname check: %s", target.URL.String()) continue } @@ -251,16 +245,18 @@ func (c *Crawler) crawler() { // } if !c.filter.allow(target) { - c.errors <- fmt.Errorf("allow check: %w", err) + // can be ignored - better add log level + // c.errors <- fmt.Errorf("allow check: %s", target.URL.String()) continue } if visited, err := c.store.HasVisited(context.TODO(), target); visited { if err != nil { - c.errors <- fmt.Errorf("has visited 1: %w", err) + c.errors <- fmt.Errorf("store: %w", err) continue } - c.errors <- fmt.Errorf("has visited 2: %w", err) + // can be ignored - better add log level + // c.errors <- fmt.Errorf("URL %s has been visited", target.URL.String()) continue } @@ -274,38 +270,26 @@ func (c *Crawler) crawler() { c.errors <- fmt.Errorf("push: %w", err) continue } - - atomic.AddInt32(&c.counter, 1) } - // if c.log != nil { - // rep := newReport(resp, nil) - // c.log.Send(rep) - // } - - // stream - c.stream <- resp - // atomic.AddInt32(&c.counter, -1) - - c.termLog.Infof("crawled: %s, depth: %d, counter: %d, queue: %d", first64Chars(req.Target.URL.String()), req.Depth, c.counter, c.queue.Len()) - - if c.queue.Len()+atomic.LoadInt32(&c.counter) == 0 { - c.queue.Close() // must close queue before quit + if req.Depth > c.cfg.maxDepth { + c.queue.Cancel() // todo: better way to stop the queue + // c.quit.Cancel() continue } + + c.stream <- resp } } } - -func first64Chars(s string) string { - if len(s) <= 64 { - return s - } - - runes := []rune(s) - if len(runes) <= 64 { - return s - } - - return string(runes[:64]) +func (c *Crawler) exit() { + c.once.Do(func() { + atomic.StoreInt32(&c.status, crawlFinished) + c.queue.Cancel() + c.queue.Close() + c.store.Close() + c.fetcher.Close() + close(c.stream) + close(c.errors) + }) } diff --git a/crawler/option.go b/crawler/option.go index 26255eb..8b614da 100644 --- a/crawler/option.go +++ b/crawler/option.go @@ -49,8 +49,3 @@ func WithStore(store wbot.Store) Option { c.store = store } } -func WithLogger(logger wbot.Logger) Option { - return func(c *Crawler) { - c.logger = logger - } -} diff --git a/crawler/robot.go b/crawler/robot.go index eeba530..63cefa6 100644 --- a/crawler/robot.go +++ b/crawler/robot.go @@ -8,20 +8,34 @@ const ( robotstxtPath = "/robots.txt" ) -type ( - robortManager struct { - robots map[string]*robotstxt.RobotsData +type robotManager struct { + followRobots bool + robots map[string]*robotstxt.RobotsData +} + +func newRobotManager(follow bool) *robotManager { + return &robotManager{ + followRobots: follow, + robots: make(map[string]*robotstxt.RobotsData), } -) +} -func NewRobotManager() *robortManager { - return &robortManager{ - robots: make(map[string]*robotstxt.RobotsData), +func (rm *robotManager) AddRobotsTxt(hostname string, body []byte) error { + data, err := robotstxt.FromBytes(body) + if err != nil { + return err // Return the error if parsing fails. } + + rm.robots[hostname] = data + return nil } +func (rm *robotManager) Allowed(userAgent, url string) bool { + hostname := url // Simplification; use proper URL parsing in production. 
-// func (rm *robortManager) AddRobotsTxt(hostname string, statusCode int, body []byte) error { -// } + robotsData, exists := rm.robots[hostname] + if !exists { + return true + } -// func (rm *robortManager) Allowed(userAgent, path string) bool { -// } + return robotsData.TestAgent(url, userAgent) +} diff --git a/go.mod b/go.mod index d88a154..7494eeb 100644 --- a/go.mod +++ b/go.mod @@ -1,13 +1,13 @@ module github.com/twiny/wbot -go 1.21.0 +go 1.21.5 require ( github.com/PuerkitoBio/goquery v1.8.1 - github.com/charmbracelet/log v0.2.3 + github.com/go-echarts/statsview v0.3.4 + github.com/pkg/profile v1.7.0 github.com/temoto/robotstxt v1.1.2 github.com/twiny/flare v0.1.0 - github.com/twiny/flog/v2 v2.0.0 github.com/twiny/poxa v0.1.0 github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac github.com/weppos/publicsuffix-go v0.30.1 @@ -15,20 +15,12 @@ require ( require ( github.com/andybalholm/cascadia v1.3.1 // indirect - github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect github.com/benbjohnson/clock v1.3.0 // indirect - github.com/charmbracelet/lipgloss v0.7.1 // indirect - github.com/go-echarts/go-echarts/v2 v2.2.7 // indirect - github.com/go-echarts/statsview v0.3.4 // indirect - github.com/go-logfmt/logfmt v0.6.0 // indirect - github.com/lucasb-eyer/go-colorful v1.2.0 // indirect - github.com/mattn/go-isatty v0.0.18 // indirect - github.com/mattn/go-runewidth v0.0.14 // indirect - github.com/muesli/reflow v0.3.0 // indirect - github.com/muesli/termenv v0.15.2 // indirect - github.com/rivo/uniseg v0.2.0 // indirect - github.com/rs/cors v1.9.0 // indirect + github.com/felixge/fgprof v0.9.3 // indirect + github.com/go-echarts/go-echarts/v2 v2.2.3 // indirect + github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect + github.com/rs/cors v1.7.0 // indirect + github.com/stretchr/testify v1.8.4 // indirect golang.org/x/net v0.12.0 // indirect - golang.org/x/sys v0.10.0 // indirect golang.org/x/text v0.11.0 // indirect ) diff --git a/go.sum b/go.sum index 72105f2..a0c51df 100644 --- a/go.sum +++ b/go.sum @@ -4,26 +4,22 @@ github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAc github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= -github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= -github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A= github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= -github.com/charmbracelet/lipgloss v0.7.1 h1:17WMwi7N1b1rVWOjMT+rCh7sQkvDU75B2hbZpc5Kc1E= -github.com/charmbracelet/lipgloss v0.7.1/go.mod h1:yG0k3giv8Qj8edTCbbg6AlQ5e8KNWpFujkNawKNhE2c= -github.com/charmbracelet/log v0.2.3 h1:YVmBhJtpGL7nW/nlf5u+SEloU8XYljxozGzZpgwIvhs= -github.com/charmbracelet/log v0.2.3/go.mod h1:ZApwwzDbbETVTIRTk7724yQRJAXIktt98yGVMMaa3y8= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod 
h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g= +github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw= +github.com/go-echarts/go-echarts/v2 v2.2.3 h1:H8oPdUpzuiV2K8S4xYZa1JRNjP3U0h7HVqvhPrmCk1A= github.com/go-echarts/go-echarts/v2 v2.2.3/go.mod h1:6TOomEztzGDVDkOSCFBq3ed7xOYfbOqhaBzD0YV771A= -github.com/go-echarts/go-echarts/v2 v2.2.7 h1:mtFAuoqQ7McdlKrJ0gLexwxMPT7yoscDDhULNwPOxBk= -github.com/go-echarts/go-echarts/v2 v2.2.7/go.mod h1:VEeyPT5Odx/UHeuxtIAHGu2+87MWGA5OBaZ120NFi/w= github.com/go-echarts/statsview v0.3.4 h1:CCuytRAutdnF901NrR4BzSjHXjUp8OyA3/iopgG/1/Y= github.com/go-echarts/statsview v0.3.4/go.mod h1:AehKjL9cTFMeIo5QdV8sQO43vFmfY65X5GMWa3XMciY= -github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= -github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= @@ -33,39 +29,30 @@ github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-github/v50 v50.2.0/go.mod h1:VBY8FB6yPIjrtKhozXv4FQupxKLS6H4m6xFZlT43q8Q= github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= +github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y= +github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= +github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= -github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= -github.com/mattn/go-isatty v0.0.18 h1:DOKFKCQ7FNG2L1rbrmstDN4QVRdS89Nkh85u68Uwp98= -github.com/mattn/go-isatty v0.0.18/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-runewidth v0.0.12/go.mod h1:RAqKPSqVFrSLVXbA8x7dzmKdmGzieGRCM46jaSJTDAk= -github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= -github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= -github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= -github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= -github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= -github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= +github.com/pkg/profile 
v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= +github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/rivo/uniseg v0.1.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= -github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= -github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= -github.com/rs/cors v1.9.0 h1:l9HGsTsHJcvW14Nk7J9KFz8bzeAWXn3CG6bgt7LsrAE= -github.com/rs/cors v1.9.0/go.mod h1:XyqrcTp5zjWr1wsJ8PIRZssZ8b/WMcMf71DJnit4EMU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/twiny/flare v0.1.0 h1:bq50IXYNUpiJULoIXXwerL1gwr+KBz49ayYgQo/CqnY= github.com/twiny/flare v0.1.0/go.mod h1:Rlzkek5PDlOGFue015tC7fe/ROyeSx3hqy+jfAZGezQ= -github.com/twiny/flog/v2 v2.0.0 h1:uHyWg1/q1P5zi/5jOiZyFZBfUZwd416v+9J+fnIBMZY= -github.com/twiny/flog/v2 v2.0.0/go.mod h1:ujV9tY/hkP9302AgmXcpA/ds1/prAymijvIoxF9zzbM= github.com/twiny/poxa v0.1.0 h1:NMM1ZeRfGFVOz60NjHR4r78pQYkq09VyOjKjdkhkWsE= github.com/twiny/poxa v0.1.0/go.mod h1:zTPmnK5Ta+Ro+HL1R/LREGg3LNqs/bpNcEWlUipKl7A= github.com/twiny/ratelimit v0.0.0-20220509163414-256d3376b0ac h1:nT+8DFvrU5Nu3Be2bK7LooU8AslFJeypQoAF+wm1CM0= @@ -104,7 +91,6 @@ golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go index ee6ebdd..e2d23ff 100644 --- a/plugin/fetcher/http_client.go +++ b/plugin/fetcher/http_client.go @@ -1,11 +1,13 @@ package fetcher import ( + "bytes" "context" "io" "net" "net/http" "net/url" + "sync" "time" "github.com/twiny/wbot" @@ -13,47 +15,53 @@ import ( type ( defaultHTTPClient struct { - client *http.Client + client *http.Client + bufferPool *sync.Pool } ) func NewHTTPClient() 
wbot.Fetcher { + var ( + fn = func() any { + return new(bytes.Buffer) + } + ) + return &defaultHTTPClient{ client: &http.Client{ Jar: http.DefaultClient.Jar, Timeout: 10 * time.Second, }, + bufferPool: &sync.Pool{ + New: fn, + }, } } func (f *defaultHTTPClient) Fetch(ctx context.Context, req *wbot.Request) (*wbot.Response, error) { - type ( - fetchResult struct { - result *wbot.Response - err error - } + var ( + respCh = make(chan *wbot.Response, 1) + fetchErr error ) - var ch = make(chan fetchResult, 1) - go func() { resp, err := f.fetch(req) if err != nil { - ch <- fetchResult{nil, err} + fetchErr = err return } - ch <- fetchResult{resp, nil} + respCh <- resp }() for { select { case <-ctx.Done(): return nil, ctx.Err() - case resp := <-ch: - if resp.err != nil { - return nil, resp.err + case resp := <-respCh: + if fetchErr != nil { + return nil, fetchErr } - return resp.result, nil + return resp, nil } } } @@ -82,18 +90,20 @@ func (f *defaultHTTPClient) fetch(req *wbot.Request) (*wbot.Response, error) { if err != nil { return nil, err } + defer resp.Body.Close() - // Limit response body reading - bodyReader := io.LimitReader(resp.Body, req.Param.MaxBodySize) + buf := f.bufferPool.Get().(*bytes.Buffer) + buf.Reset() + defer f.bufferPool.Put(buf) - body, err := io.ReadAll(bodyReader) - if err != nil { + // Limit response body reading + if _, err := io.CopyN(buf, resp.Body, req.Param.MaxBodySize); err != nil && err != io.EOF { return nil, err } - resp.Body.Close() + bytes := buf.Bytes() - links := wbot.FindLinks(body) + links := wbot.FindLinks(bytes) var nextURLs []*wbot.ParsedURL for _, link := range links { @@ -111,7 +121,7 @@ func (f *defaultHTTPClient) fetch(req *wbot.Request) (*wbot.Response, error) { return &wbot.Response{ URL: req.Target, Status: resp.StatusCode, - Body: body, + Body: bytes, NextURLs: nextURLs, Depth: req.Depth, }, nil diff --git a/plugin/logger/logger.go b/plugin/logger/logger.go deleted file mode 100644 index 4388cba..0000000 --- a/plugin/logger/logger.go +++ /dev/null @@ -1,57 +0,0 @@ -package logger - -import ( - "time" - - "github.com/twiny/flog/v2" - "github.com/twiny/wbot" -) - -type ( - defaultLogger struct { - l *flog.Logger - } -) - -func NewFileLogger(prefix string) (*defaultLogger, error) { - logger, err := flog.NewLogger(prefix, 10, 10) - if err != nil { - return nil, err - } - - return &defaultLogger{ - l: logger, - }, nil -} - -func (l *defaultLogger) Write(log *wbot.Log) error { - f := []flog.Field{ - flog.NewField("request_url", log.RequestURL), - flog.NewField("status", log.Status), - flog.NewField("depth", log.Depth), - flog.NewField("timestamp", log.Timestamp.Format(time.RFC3339)), - flog.NewField("response_time", log.ResponseTime.String()), - flog.NewField("content_size", log.ContentSize), - } - - if log.UserAgent != "" { - f = append(f, flog.NewField("user_agent", log.UserAgent)) - } - - if log.RedirectURL != "" { - f = append(f, flog.NewField("redirect_url", log.RedirectURL)) - } - - if log.Err != nil { - f = append(f, flog.NewField("error", log.Err.Error())) - l.l.Error(log.Err.Error(), f...) - return nil - } - - l.l.Info("page", f...) 
- return nil -} -func (l *defaultLogger) Close() error { - l.l.Close() - return nil -} diff --git a/plugin/monitor/monitor.go b/plugin/monitor/monitor.go index 9ed5497..18835b7 100644 --- a/plugin/monitor/monitor.go +++ b/plugin/monitor/monitor.go @@ -24,36 +24,36 @@ func NewmetricsMonitor() *metricsMonitor { return &metricsMonitor{} } -func (m *metricsMonitor) IncrementTotalRequests() { +func (m *metricsMonitor) IncTotalRequests() { atomic.AddInt64(&m.totalRequests, 1) } -func (m *metricsMonitor) IncrementSuccessfulRequests() { +func (m *metricsMonitor) IncSuccessfulRequests() { atomic.AddInt64(&m.successfulRequests, 1) } -func (m *metricsMonitor) IncrementFailedRequests() { +func (m *metricsMonitor) IncFailedRequests() { atomic.AddInt64(&m.failedRequests, 1) } -func (m *metricsMonitor) IncrementRetries() { +func (m *metricsMonitor) IncRetries() { atomic.AddInt64(&m.retries, 1) } -func (m *metricsMonitor) IncrementRedirects() { +func (m *metricsMonitor) IncRedirects() { atomic.AddInt64(&m.redirects, 1) } -func (m *metricsMonitor) IncrementTotalPages() { +func (m *metricsMonitor) IncTotalPages() { atomic.AddInt64(&m.totalPages, 1) } -func (m *metricsMonitor) IncrementCrawledPages() { +func (m *metricsMonitor) IncCrawledPages() { atomic.AddInt64(&m.crawledPages, 1) } -func (m *metricsMonitor) IncrementSkippedPages() { +func (m *metricsMonitor) IncSkippedPages() { atomic.AddInt64(&m.skippedPages, 1) } -func (m *metricsMonitor) IncrementParsedLinks() { +func (m *metricsMonitor) IncParsedLinks() { atomic.AddInt64(&m.parsedLinks, 1) } -func (m *metricsMonitor) IncrementClientErrors() { +func (m *metricsMonitor) IncClientErrors() { atomic.AddInt64(&m.clientErrors, 1) } -func (m *metricsMonitor) IncrementServerErrors() { +func (m *metricsMonitor) IncServerErrors() { atomic.AddInt64(&m.serverErrors, 1) } diff --git a/plugin/queue/queue.go b/plugin/queue/queue.go index bb1fa4f..27e6bfb 100644 --- a/plugin/queue/queue.go +++ b/plugin/queue/queue.go @@ -8,10 +8,6 @@ import ( "github.com/twiny/wbot" ) -var ( - ErrQueueClosed = fmt.Errorf("queue is closed") -) - type defaultInMemoryQueue struct { mu sync.Mutex list []*wbot.Request @@ -32,7 +28,7 @@ func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) erro defer q.mu.Unlock() if q.closed { - return ErrQueueClosed + return fmt.Errorf("queue is closed") } q.list = append(q.list, req) @@ -40,7 +36,6 @@ func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) erro return nil } - func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) { q.mu.Lock() defer q.mu.Unlock() @@ -50,21 +45,32 @@ func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) { } if q.closed && len(q.list) == 0 { - return nil, ErrQueueClosed + return nil, fmt.Errorf("queue is closed") } req := q.list[0] q.list = q.list[1:] return req, nil } - func (q *defaultInMemoryQueue) Len() int32 { q.mu.Lock() defer q.mu.Unlock() return int32(len(q.list)) } +func (q *defaultInMemoryQueue) IsDone() bool { + q.mu.Lock() + defer q.mu.Unlock() + return q.closed && len(q.list) == 0 +} +func (q *defaultInMemoryQueue) Cancel() { + q.mu.Lock() + defer q.mu.Unlock() + + q.closed = true + q.cond.Broadcast() +} func (q *defaultInMemoryQueue) Close() error { q.mu.Lock() defer q.mu.Unlock() @@ -72,5 +78,6 @@ func (q *defaultInMemoryQueue) Close() error { q.closed = true q.cond.Broadcast() + clear(q.list) return nil } diff --git a/plugin/store/in_memory.go b/plugin/store/in_memory.go index 9d8e755..5f0a4c2 100644 --- 
a/plugin/store/in_memory.go +++ b/plugin/store/in_memory.go @@ -32,6 +32,9 @@ func (s *defaultInMemoryStore) HasVisited(ctx context.Context, link *wbot.Parsed return found, nil } func (s *defaultInMemoryStore) Close() error { + s.mu.Lock() + defer s.mu.Unlock() + clear(s.table) return nil } diff --git a/wbot.go b/wbot.go index 690f809..689c3ad 100644 --- a/wbot.go +++ b/wbot.go @@ -30,11 +30,8 @@ type ( Push(ctx context.Context, req *Request) error Pop(ctx context.Context) (*Request, error) Len() int32 - Close() error - } - - Logger interface { - Write(ctx context.Context, log *Log) error + Cancel() + IsDone() bool Close() error } @@ -93,18 +90,6 @@ type ( Hostname string Rate string } - - Log struct { - RequestURL string - Status int - Depth int32 - Err error - Timestamp time.Time - ResponseTime time.Duration - ContentSize int64 - UserAgent string - RedirectURL string - } ) func (r *Request) ResolveURL(u string) (*url.URL, error) { @@ -121,6 +106,13 @@ func (r *Request) ResolveURL(u string) (*url.URL, error) { return absURL, nil } +func (u *ParsedURL) String() string { + var link = u.URL.String() + if len(link) > 64 { + return link[:64] + } + return link +} func FindLinks(body []byte) (hrefs []string) { doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body)) From 75e9bc48b6bd3881f9de6243ede722e6e118e7cc Mon Sep 17 00:00:00 2001 From: twiny Date: Sat, 17 Feb 2024 15:31:23 +0100 Subject: [PATCH 07/10] fix(crawler): get stuck at defer stop - update gitignore and add timeout to HTTP client. - reverted to simple queue. --- .gitignore | 4 +- crawler/config.go | 2 + crawler/crawler.go | 136 +++++++++++++++------------------- plugin/fetcher/http_client.go | 23 +++++- plugin/queue/queue.go | 58 +++++---------- wbot.go | 3 +- 6 files changed, 104 insertions(+), 122 deletions(-) diff --git a/.gitignore b/.gitignore index 04be717..e3b34f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +bin/ tests/ .idea.md -*.*prof \ No newline at end of file +*.*prof +.vscode/ \ No newline at end of file diff --git a/crawler/config.go b/crawler/config.go index af45e97..c76004c 100644 --- a/crawler/config.go +++ b/crawler/config.go @@ -19,6 +19,7 @@ type ( parallel int maxDepth int32 maxBodySize int64 + timeout time.Duration userAgents poxa.Spinner[string] referrers poxa.Spinner[string] proxies poxa.Spinner[string] @@ -34,6 +35,7 @@ func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config parallel: runtime.NumCPU(), maxDepth: maxDepth, maxBodySize: defaultMaxBodySize, + timeout: defaultTimeout, userAgents: poxa.NewSpinner(defaultUserAgent), referrers: poxa.NewSpinner(defaultReferrer), proxies: nil, diff --git a/crawler/crawler.go b/crawler/crawler.go index b995b91..a4e6884 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -2,24 +2,24 @@ package crawler import ( "context" - "errors" "fmt" + "os" + "os/signal" "strings" "sync" "sync/atomic" + "github.com/twiny/flare" "github.com/twiny/wbot" "github.com/twiny/wbot/plugin/fetcher" "github.com/twiny/wbot/plugin/monitor" "github.com/twiny/wbot/plugin/queue" "github.com/twiny/wbot/plugin/store" - - "github.com/twiny/flare" ) const ( - crawlInProgress int32 = iota - crawlFinished + crawlRunning = 0 + crawlStopped = 1 ) type ( @@ -39,20 +39,23 @@ type ( stream chan *wbot.Response errors chan error - once *sync.Once status int32 - quit flare.Notifier + flare flare.Notifier + + ctx context.Context + stop context.CancelFunc } ) func New(opts ...Option) *Crawler { + ctx, stop := signal.NotifyContext(context.Background(), 
os.Interrupt) c := &Crawler{ wg: new(sync.WaitGroup), cfg: newConfig(-1, nil, nil, nil), fetcher: fetcher.NewHTTPClient(), store: store.NewInMemoryStore(), - queue: queue.NewInMemoryQueue(), + queue: queue.NewInMemoryQueue(2048), metrics: monitor.NewmetricsMonitor(), filter: newFilter(), @@ -62,9 +65,11 @@ func New(opts ...Option) *Crawler { stream: make(chan *wbot.Response, 1024), errors: make(chan error, 1024), - once: new(sync.Once), - status: crawlFinished, - quit: flare.New(), + status: crawlStopped, + flare: flare.New(), + + ctx: ctx, + stop: stop, } for _, opt := range opts { @@ -73,16 +78,18 @@ func New(opts ...Option) *Crawler { // this routine waits for quit signal go func() { - <-c.quit.Done() + <-c.ctx.Done() fmt.Println("Crawler is shutting down") - atomic.StoreInt32(&c.status, crawlFinished) - close(c.stream) - close(c.errors) - c.wg.Wait() + c.flare.Cancel() + c.queue.Close() c.store.Close() c.fetcher.Close() + + c.wg.Wait() + close(c.stream) + close(c.errors) }() return c @@ -103,34 +110,25 @@ func (c *Crawler) Start(links ...string) error { } if len(errs) > 0 { - c.quit.Cancel() - return fmt.Errorf("invalid targets: %v", errors.Join(errs...)) + return fmt.Errorf("invalid links: %v", errs) } if len(targets) == 0 { - c.quit.Cancel() - return fmt.Errorf("no valid targets: %v", errs) + return fmt.Errorf("no valid links") } - atomic.StoreInt32(&c.status, crawlInProgress) for _, target := range targets { - c.wg.Add(1) - go func(t *wbot.ParsedURL) { - if err := c.start(t); err != nil { - c.errors <- fmt.Errorf("start: %w", err) - } - }(target) + c.start(target) } + c.status = crawlRunning + + c.wg.Add(c.cfg.parallel) for i := 0; i < c.cfg.parallel; i++ { - c.wg.Add(1) go c.crawler(i) } - fmt.Println("Crawler is running") c.wg.Wait() - fmt.Println("Crawler has stopped") - return nil } func (c *Crawler) OnReponse(fn func(*wbot.Response)) { @@ -139,7 +137,9 @@ func (c *Crawler) OnReponse(fn func(*wbot.Response)) { defer c.wg.Done() for { select { - case <-c.quit.Done(): + case <-c.ctx.Done(): + return + case <-c.flare.Done(): return case resp, ok := <-c.stream: if ok { @@ -155,7 +155,9 @@ func (c *Crawler) OnError(fn func(err error)) { defer c.wg.Done() for { select { - case <-c.quit.Done(): + case <-c.ctx.Done(): + return + case <-c.flare.Done(): return case err, ok := <-c.errors: if ok { @@ -169,15 +171,14 @@ func (c *Crawler) Stats() map[string]any { return map[string]any{} } func (c *Crawler) Stop() { - c.quit.Cancel() + c.stop() } -func (c *Crawler) start(target *wbot.ParsedURL) error { - defer c.wg.Done() - +func (c *Crawler) start(target *wbot.ParsedURL) { param := &wbot.Param{ MaxBodySize: c.cfg.maxBodySize, UserAgent: c.cfg.userAgents.Next(), + Timeout: c.cfg.timeout, } if c.cfg.proxies != nil { @@ -190,48 +191,52 @@ func (c *Crawler) start(target *wbot.ParsedURL) error { Depth: 0, } - if err := c.queue.Push(context.TODO(), req); err != nil { - return fmt.Errorf("push: %w", err) + if err := c.queue.Push(c.ctx, req); err != nil { + c.errors <- fmt.Errorf("push: %w", err) + return } - - fmt.Printf("Crawling %s\n", target.URL.String()) - - return nil } func (c *Crawler) crawler(id int) { defer c.wg.Done() for { select { - case <-c.quit.Done(): - fmt.Printf("worker %d is stopping\n", id) + case <-c.ctx.Done(): return default: - if atomic.LoadInt32(&c.status) == crawlFinished { - c.quit.Cancel() + if atomic.LoadInt32(&c.status) == crawlStopped && c.queue.Len() == 0 { + c.flare.Cancel() return } - if c.queue.IsDone() { - c.quit.Cancel() - return - } - - req, err := 
c.queue.Pop(context.TODO()) + req, err := c.queue.Pop(c.ctx) if err != nil { c.errors <- fmt.Errorf("pop: %w", err) continue } + // if the next response will exceed the max depth, + // we signal the crawler to stop + if atomic.LoadInt32(&req.Depth) > c.cfg.maxDepth-1 { + atomic.StoreInt32(&c.status, crawlStopped) + } + c.limiter.wait(req.Target) - resp, err := c.fetcher.Fetch(context.TODO(), req) + resp, err := c.fetcher.Fetch(c.ctx, req) if err != nil { c.errors <- fmt.Errorf("fetch: %w", err) continue } + c.stream <- resp + atomic.AddInt32(&req.Depth, 1) + + if atomic.LoadInt32(&req.Depth) > c.cfg.maxDepth { + continue + } + for _, target := range resp.NextURLs { if !strings.Contains(target.URL.Host, req.Target.Root) { // can be ignored - better add log level @@ -250,7 +255,7 @@ func (c *Crawler) crawler(id int) { continue } - if visited, err := c.store.HasVisited(context.TODO(), target); visited { + if visited, err := c.store.HasVisited(c.ctx, target); visited { if err != nil { c.errors <- fmt.Errorf("store: %w", err) continue @@ -266,30 +271,11 @@ func (c *Crawler) crawler(id int) { Param: req.Param, } - if err := c.queue.Push(context.TODO(), nextReq); err != nil { + if err := c.queue.Push(c.ctx, nextReq); err != nil { c.errors <- fmt.Errorf("push: %w", err) continue } } - - if req.Depth > c.cfg.maxDepth { - c.queue.Cancel() // todo: better way to stop the queue - // c.quit.Cancel() - continue - } - - c.stream <- resp } } } -func (c *Crawler) exit() { - c.once.Do(func() { - atomic.StoreInt32(&c.status, crawlFinished) - c.queue.Cancel() - c.queue.Close() - c.store.Close() - c.fetcher.Close() - close(c.stream) - close(c.errors) - }) -} diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go index e2d23ff..9878de2 100644 --- a/plugin/fetcher/http_client.go +++ b/plugin/fetcher/http_client.go @@ -31,6 +31,20 @@ func NewHTTPClient() wbot.Fetcher { client: &http.Client{ Jar: http.DefaultClient.Jar, Timeout: 10 * time.Second, + Transport: &http.Transport{ + DialContext: (&net.Dialer{ + Timeout: 10 * time.Second, + KeepAlive: 10 * time.Second, + DualStack: true, + }).DialContext, + ForceAttemptHTTP2: true, + MaxIdleConns: 100, // Default: 100 + MaxIdleConnsPerHost: 2, // Default: 2 + IdleConnTimeout: 10 * time.Second, + TLSHandshakeTimeout: 5 * time.Second, + ExpectContinueTimeout: 1 * time.Second, + // DisableKeepAlives: true, + }, }, bufferPool: &sync.Pool{ New: fn, @@ -44,6 +58,9 @@ func (f *defaultHTTPClient) Fetch(ctx context.Context, req *wbot.Request) (*wbot fetchErr error ) + fctx, done := context.WithTimeout(ctx, req.Param.Timeout) + defer done() + go func() { resp, err := f.fetch(req) if err != nil { @@ -55,8 +72,8 @@ func (f *defaultHTTPClient) Fetch(ctx context.Context, req *wbot.Request) (*wbot for { select { - case <-ctx.Done(): - return nil, ctx.Err() + case <-fctx.Done(): + return nil, fctx.Err() case resp := <-respCh: if fetchErr != nil { return nil, fetchErr @@ -147,6 +164,6 @@ func newHTTPTransport(purl string) *http.Transport { IdleConnTimeout: 10 * time.Second, TLSHandshakeTimeout: 5 * time.Second, ExpectContinueTimeout: 1 * time.Second, - // DisableKeepAlives: true, + DisableKeepAlives: false, } } diff --git a/plugin/queue/queue.go b/plugin/queue/queue.go index 27e6bfb..fcb4fbe 100644 --- a/plugin/queue/queue.go +++ b/plugin/queue/queue.go @@ -8,31 +8,29 @@ import ( "github.com/twiny/wbot" ) +/* +read first page add requests to queue +if request depth is exceeded return +*/ type defaultInMemoryQueue struct { - mu sync.Mutex - list []*wbot.Request - cond 
*sync.Cond - closed bool + mu *sync.RWMutex + list []*wbot.Request } -func NewInMemoryQueue() wbot.Queue { - queue := &defaultInMemoryQueue{ - list: make([]*wbot.Request, 0, 4096), +func NewInMemoryQueue(size int) wbot.Queue { + q := &defaultInMemoryQueue{ + mu: new(sync.RWMutex), + list: make([]*wbot.Request, 0, size), } - queue.cond = sync.NewCond(&queue.mu) - return queue + + return q } func (q *defaultInMemoryQueue) Push(ctx context.Context, req *wbot.Request) error { q.mu.Lock() defer q.mu.Unlock() - if q.closed { - return fmt.Errorf("queue is closed") - } - q.list = append(q.list, req) - q.cond.Broadcast() return nil } @@ -40,44 +38,22 @@ func (q *defaultInMemoryQueue) Pop(ctx context.Context) (*wbot.Request, error) { q.mu.Lock() defer q.mu.Unlock() - for len(q.list) == 0 && !q.closed { - q.cond.Wait() - } - - if q.closed && len(q.list) == 0 { - return nil, fmt.Errorf("queue is closed") + if len(q.list) == 0 { + return nil, fmt.Errorf("queue is empty") } req := q.list[0] q.list = q.list[1:] + return req, nil } func (q *defaultInMemoryQueue) Len() int32 { - q.mu.Lock() - defer q.mu.Unlock() + q.mu.RLock() + defer q.mu.RUnlock() return int32(len(q.list)) } -func (q *defaultInMemoryQueue) IsDone() bool { - q.mu.Lock() - defer q.mu.Unlock() - - return q.closed && len(q.list) == 0 -} -func (q *defaultInMemoryQueue) Cancel() { - q.mu.Lock() - defer q.mu.Unlock() - - q.closed = true - q.cond.Broadcast() -} func (q *defaultInMemoryQueue) Close() error { - q.mu.Lock() - defer q.mu.Unlock() - - q.closed = true - q.cond.Broadcast() - clear(q.list) return nil } diff --git a/wbot.go b/wbot.go index 689c3ad..3173c26 100644 --- a/wbot.go +++ b/wbot.go @@ -30,8 +30,6 @@ type ( Push(ctx context.Context, req *Request) error Pop(ctx context.Context) (*Request, error) Len() int32 - Cancel() - IsDone() bool Close() error } @@ -78,6 +76,7 @@ type ( UserAgent string Referer string MaxBodySize int64 + Timeout time.Duration } FilterRule struct { From c23a5c1f380a3d94ff4e27cccf2d93b6f476b0f4 Mon Sep 17 00:00:00 2001 From: twiny Date: Sat, 17 Feb 2024 19:07:30 +0100 Subject: [PATCH 08/10] feat: added logger - added zerolog logger. - removed OnError Method. 
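With `OnError` removed, failures now surface through the crawler's embedded zerolog logger rather than an error channel. A minimal sketch of the resulting wiring, using the method names as of this patch (`Start`/`Stop`) and the new `WithLogLevel` option; the seed URL is only an example:

```go
package main

import (
	"fmt"
	"log"

	"github.com/rs/zerolog"
	"github.com/twiny/wbot"
	"github.com/twiny/wbot/crawler"
)

func main() {
	// Fetch, push, and pop errors are logged by the crawler itself;
	// the caller only decides how verbose that logging should be.
	bot := crawler.New(
		crawler.WithLogLevel(zerolog.WarnLevel),
	)
	defer bot.Stop()

	bot.OnReponse(func(resp *wbot.Response) {
		fmt.Printf("crawled: %s\n", resp.URL.String())
	})

	if err := bot.Start("https://crawler-test.com/"); err != nil {
		log.Fatal(err)
	}
}
```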
--- crawler/crawler.go | 66 ++++++++++++++++++++----------------------- crawler/option.go | 11 ++++++++ go.mod | 7 +++-- go.sum | 17 +++++++++-- plugin/store/bbolt.go | 58 ------------------------------------- 5 files changed, 62 insertions(+), 97 deletions(-) delete mode 100644 plugin/store/bbolt.go diff --git a/crawler/crawler.go b/crawler/crawler.go index a4e6884..6bd3aa8 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -8,7 +8,9 @@ import ( "strings" "sync" "sync/atomic" + "time" + "github.com/rs/zerolog" "github.com/twiny/flare" "github.com/twiny/wbot" "github.com/twiny/wbot/plugin/fetcher" @@ -37,17 +39,27 @@ type ( robot *robotManager stream chan *wbot.Response - errors chan error status int32 flare flare.Notifier + logger zerolog.Logger + ctx context.Context stop context.CancelFunc } ) func New(opts ...Option) *Crawler { + cw := zerolog.ConsoleWriter{ + Out: os.Stdout, + TimeFormat: time.RFC3339, + NoColor: false, + } + zerolog.SetGlobalLevel(zerolog.TraceLevel) + + logger := zerolog.New(cw).With().Timestamp().Logger() + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt) c := &Crawler{ wg: new(sync.WaitGroup), @@ -63,10 +75,10 @@ func New(opts ...Option) *Crawler { robot: newRobotManager(false), stream: make(chan *wbot.Response, 1024), - errors: make(chan error, 1024), status: crawlStopped, flare: flare.New(), + logger: logger, ctx: ctx, stop: stop, @@ -79,17 +91,15 @@ func New(opts ...Option) *Crawler { // this routine waits for quit signal go func() { <-c.ctx.Done() - fmt.Println("Crawler is shutting down") + c.logger.Info().Msgf("Crawler is shutting down") c.flare.Cancel() - c.queue.Close() c.store.Close() c.fetcher.Close() c.wg.Wait() close(c.stream) - close(c.errors) }() return c @@ -123,6 +133,8 @@ func (c *Crawler) Start(links ...string) error { c.status = crawlRunning + c.logger.Info().Msgf("Starting crawler with %d links", len(targets)) + c.wg.Add(c.cfg.parallel) for i := 0; i < c.cfg.parallel; i++ { go c.crawler(i) @@ -149,24 +161,6 @@ func (c *Crawler) OnReponse(fn func(*wbot.Response)) { } }() } -func (c *Crawler) OnError(fn func(err error)) { - c.wg.Add(1) - go func() { - defer c.wg.Done() - for { - select { - case <-c.ctx.Done(): - return - case <-c.flare.Done(): - return - case err, ok := <-c.errors: - if ok { - fn(err) - } - } - } - }() -} func (c *Crawler) Stats() map[string]any { return map[string]any{} } @@ -192,7 +186,7 @@ func (c *Crawler) start(target *wbot.ParsedURL) { } if err := c.queue.Push(c.ctx, req); err != nil { - c.errors <- fmt.Errorf("push: %w", err) + c.logger.Err(err).Msgf("pop") return } } @@ -209,9 +203,15 @@ func (c *Crawler) crawler(id int) { return } + // not elegant, but if the queue is empty, we wait for a while + if atomic.LoadInt32(&c.status) == crawlRunning && c.queue.Len() == 0 { + <-time.After(1 * time.Second) + continue + } + req, err := c.queue.Pop(c.ctx) if err != nil { - c.errors <- fmt.Errorf("pop: %w", err) + c.logger.Err(err).Msgf("pop") continue } @@ -225,10 +225,12 @@ func (c *Crawler) crawler(id int) { resp, err := c.fetcher.Fetch(c.ctx, req) if err != nil { - c.errors <- fmt.Errorf("fetch: %w", err) + c.logger.Err(err).Any("target", req.Target.String()).Msgf("fetch") continue } + c.logger.Debug().Msgf("Fetched: %s", resp.URL.String()) + c.stream <- resp atomic.AddInt32(&req.Depth, 1) @@ -237,10 +239,9 @@ func (c *Crawler) crawler(id int) { continue } + // logging here will just flood the logs for _, target := range resp.NextURLs { if !strings.Contains(target.URL.Host, req.Target.Root) { - // can 
be ignored - better add log level - // c.errors <- fmt.Errorf("hostname check: %s", target.URL.String()) continue } @@ -250,18 +251,13 @@ func (c *Crawler) crawler(id int) { // } if !c.filter.allow(target) { - // can be ignored - better add log level - // c.errors <- fmt.Errorf("allow check: %s", target.URL.String()) continue } if visited, err := c.store.HasVisited(c.ctx, target); visited { if err != nil { - c.errors <- fmt.Errorf("store: %w", err) - continue + c.logger.Err(err).Msgf("store") } - // can be ignored - better add log level - // c.errors <- fmt.Errorf("URL %s has been visited", target.URL.String()) continue } @@ -272,7 +268,7 @@ func (c *Crawler) crawler(id int) { } if err := c.queue.Push(c.ctx, nextReq); err != nil { - c.errors <- fmt.Errorf("push: %w", err) + c.logger.Err(err).Any("target", target.String()).Msgf("push") continue } } diff --git a/crawler/option.go b/crawler/option.go index 8b614da..979625d 100644 --- a/crawler/option.go +++ b/crawler/option.go @@ -1,6 +1,7 @@ package crawler import ( + "github.com/rs/zerolog" "github.com/twiny/poxa" "github.com/twiny/wbot" ) @@ -49,3 +50,13 @@ func WithStore(store wbot.Store) Option { c.store = store } } +func WithQueue(queue wbot.Queue) Option { + return func(c *Crawler) { + c.queue = queue + } +} +func WithLogLevel(level zerolog.Level) Option { + return func(c *Crawler) { + c.logger = c.logger.Level(level) + } +} diff --git a/go.mod b/go.mod index 7494eeb..48eab8d 100644 --- a/go.mod +++ b/go.mod @@ -1,11 +1,12 @@ module github.com/twiny/wbot -go 1.21.5 +go 1.22.0 require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/go-echarts/statsview v0.3.4 github.com/pkg/profile v1.7.0 + github.com/rs/zerolog v1.32.0 github.com/temoto/robotstxt v1.1.2 github.com/twiny/flare v0.1.0 github.com/twiny/poxa v0.1.0 @@ -19,8 +20,10 @@ require ( github.com/felixge/fgprof v0.9.3 // indirect github.com/go-echarts/go-echarts/v2 v2.2.3 // indirect github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.19 // indirect github.com/rs/cors v1.7.0 // indirect - github.com/stretchr/testify v1.8.4 // indirect golang.org/x/net v0.12.0 // indirect + golang.org/x/sys v0.12.0 // indirect golang.org/x/text v0.11.0 // indirect ) diff --git a/go.sum b/go.sum index a0c51df..af4c519 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,7 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I= +github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -20,6 +21,7 @@ github.com/go-echarts/go-echarts/v2 v2.2.3 h1:H8oPdUpzuiV2K8S4xYZa1JRNjP3U0h7HVq github.com/go-echarts/go-echarts/v2 v2.2.3/go.mod h1:6TOomEztzGDVDkOSCFBq3ed7xOYfbOqhaBzD0YV771A= github.com/go-echarts/statsview v0.3.4 h1:CCuytRAutdnF901NrR4BzSjHXjUp8OyA3/iopgG/1/Y= github.com/go-echarts/statsview v0.3.4/go.mod h1:AehKjL9cTFMeIo5QdV8sQO43vFmfY65X5GMWa3XMciY= 
+github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= @@ -35,20 +37,28 @@ github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1: github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= +github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA= github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik= github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= +github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= +github.com/rs/zerolog v1.32.0 h1:keLypqrlIjaFsbmJOBdB/qvyF8KEtCWHwobLp5l/mQ0= +github.com/rs/zerolog v1.32.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/twiny/flare v0.1.0 h1:bq50IXYNUpiJULoIXXwerL1gwr+KBz49ayYgQo/CqnY= @@ -88,10 +98,13 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
 golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
 golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
diff --git a/plugin/store/bbolt.go b/plugin/store/bbolt.go
deleted file mode 100644
index b193d71..0000000
--- a/plugin/store/bbolt.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package store
-
-// var (
-//     prefix = "store"
-// )
-
-// // BStore
-// type BStore struct {
-//     prefix string
-//     db     *bbolt.DB
-// }
-
-// // NewBStore
-// func NewBStore(db *bbolt.DB) (wbot.Store, error) {
-//     // create bucket for store
-//     if err := db.Update(func(tx *bbolt.Tx) error {
-//         _, err := tx.CreateBucketIfNotExists([]byte(prefix))
-//         return err
-//     }); err != nil {
-//         return nil, err
-//     }
-
-//     return &BStore{
-//         prefix: prefix,
-//         db:     db,
-//     }, nil
-// }
-
-// // Visited
-// func (bs *BStore) Visited(link string) bool {
-//     sum := sha256.Sum224([]byte(link))
-
-//     //
-//     key := strings.Join([]string{
-//         bs.prefix,
-//         hex.EncodeToString(sum[:]),
-//     }, "_")
-
-//     return bs.db.Update(func(tx *bbolt.Tx) error {
-//         bu := tx.Bucket([]byte(prefix))
-
-//         d := bu.Get([]byte(key))
-//         // if d == nil means not found
-//         if d == nil {
-//             if err := bu.Put([]byte(key), []byte(link)); err != nil {
-//                 return err
-//             }
-//             return nil
-//         }
-
-//         return fmt.Errorf("visited")
-//     }) != nil
-// }
-
-// // Close
-// func (bs *BStore) Close() error {
-//     return bs.db.Close()
-// }

From 298762976691593d23797c9538128941166f116e Mon Sep 17 00:00:00 2001
From: twiny
Date: Sat, 17 Feb 2024 20:18:05 +0100
Subject: [PATCH 09/10] feat: fixing crawler

- refactor metrics monitoring and add new metrics

---
 crawler/crawler.go            | 51 ++++++++++++++++-----------
 plugin/fetcher/http_client.go | 12 +++----
 plugin/metrics/metrics.go     | 55 ++++++++++++++++++++++++++++
 plugin/monitor/monitor.go     | 59 -----------------------------
 wbot.go                       | 13 +++-----
 5 files changed, 98 insertions(+), 92 deletions(-)
 create mode 100644 plugin/metrics/metrics.go
 delete mode 100644 plugin/monitor/monitor.go

diff --git a/crawler/crawler.go b/crawler/crawler.go
index 6bd3aa8..d8388bd 100644
--- a/crawler/crawler.go
+++ b/crawler/crawler.go
@@ -14,7 +14,7 @@ import (
     "github.com/twiny/flare"
     "github.com/twiny/wbot"
     "github.com/twiny/wbot/plugin/fetcher"
-    "github.com/twiny/wbot/plugin/monitor"
+    "github.com/twiny/wbot/plugin/metrics"
     "github.com/twiny/wbot/plugin/queue"
     "github.com/twiny/wbot/plugin/store"
 )
@@ -68,7 +68,7 @@ func New(opts ...Option) *Crawler {
     fetcher: fetcher.NewHTTPClient(),
     store:   store.NewInMemoryStore(),
     queue:   queue.NewInMemoryQueue(2048),
-    metrics: monitor.NewmetricsMonitor(),
+    metrics: metrics.NewMetricsMonitor(),

     filter:  newFilter(),
     limiter: newRateLimiter(),
@@ -105,11 +105,12 @@ func New(opts ...Option) *Crawler {
     return c
 }

-func (c *Crawler) Start(links ...string) error {
+func (c *Crawler) Run(links ...string) error {
     var (
         targets []*wbot.ParsedURL
         errs    []error
     )
+
     for _, link := range links {
         target, err := wbot.NewURL(link)
         if err != nil {
@@ -128,7 +129,7 @@ func (c *Crawler) Start(links ...string) error {
     }

     for _, target := range targets {
-        c.start(target)
+        c.add(target)
     }

     c.status = crawlRunning
@@ -137,7 +138,7 @@ func (c *Crawler) Start(links ...string) error {
     c.wg.Add(c.cfg.parallel)
     for i := 0; i < c.cfg.parallel; i++ {
-        go c.crawler(i)
+        go c.crawl(i)
     }

     c.wg.Wait()
@@ -161,14 +162,14 @@ func (c *Crawler) OnReponse(fn func(*wbot.Response)) {
         }
     }()
 }
-func (c *Crawler) Stats() map[string]any {
-    return map[string]any{}
+func (c *Crawler) Metrics() map[string]int64 {
+    return c.metrics.Metrics()
 }
-func (c *Crawler) Stop() {
+func (c *Crawler) Shutdown() {
     c.stop()
 }
-func (c *Crawler) start(target *wbot.ParsedURL) {
+func (c *Crawler) add(target *wbot.ParsedURL) {
     param := &wbot.Param{
         MaxBodySize: c.cfg.maxBodySize,
         UserAgent:   c.cfg.userAgents.Next(),
@@ -190,7 +191,7 @@ func (c *Crawler) start(target *wbot.ParsedURL) {
         return
     }
 }
-func (c *Crawler) crawler(id int) {
+func (c *Crawler) crawl(id int) {
     defer c.wg.Done()

     for {
@@ -203,7 +204,7 @@ func (c *Crawler) crawler(id int) {
             return
         }

-        // not elegant, but if the queue is empty, we wait for a while
+        // not elegant, but just to give the queue some time to fill up
         if atomic.LoadInt32(&c.status) == crawlRunning && c.queue.Len() == 0 {
             <-time.After(1 * time.Second)
             continue
@@ -214,6 +215,7 @@ func (c *Crawler) crawler(id int) {
             c.logger.Err(err).Msgf("pop")
             continue
         }
+        c.metrics.IncTotalRequests()

         // if the next response will exceed the max depth,
         // we signal the crawler to stop
@@ -225,32 +227,40 @@ func (c *Crawler) crawler(id int) {
         resp, err := c.fetcher.Fetch(c.ctx, req)
         if err != nil {
+            c.metrics.IncFailedRequests()
             c.logger.Err(err).Any("target", req.Target.String()).Msgf("fetch")
             continue
         }
-        c.logger.Debug().Msgf("Fetched: %s", resp.URL.String())
-
         c.stream <- resp
+        c.metrics.IncSuccessfulRequests()

-        atomic.AddInt32(&req.Depth, 1)
+        c.logger.Debug().Any("target", req.Target.String()).Msgf("fetched")

-        if atomic.LoadInt32(&req.Depth) > c.cfg.maxDepth {
+        // increment the depth for the next requests
+        nextDepth := atomic.AddInt32(&req.Depth, 1)
+
+        if nextDepth > c.cfg.maxDepth {
             continue
         }

         // logging here will just flood the logs
         for _, target := range resp.NextURLs {
+            c.metrics.IncTotalLink()
+
             if !strings.Contains(target.URL.Host, req.Target.Root) {
+                c.metrics.IncSkippedLink()
                 continue
             }

-            // if !c.robot.Allowed(req.Param.UserAgent, req.URL.String()) {
-            //     // todo: log
-            //     continue
-            // }
+            if !c.robot.Allowed(req.Param.UserAgent, req.Target.URL.String()) {
+                c.metrics.IncSkippedLink()
+                // todo: log
+                continue
+            }

             if !c.filter.allow(target) {
+                c.metrics.IncSkippedLink()
                 continue
             }

                 if err != nil {
                     c.logger.Err(err).Msgf("store")
                 }
+                c.metrics.IncDuplicatedLink()
                 continue
             }

                 c.logger.Err(err).Any("target", target.String()).Msgf("push")
                 continue
             }
+
+            c.metrics.IncCrawledLink()
         }
     }
 }
diff --git a/plugin/fetcher/http_client.go b/plugin/fetcher/http_client.go
index 9878de2..9c0b2f0 100644
--- a/plugin/fetcher/http_client.go
+++ b/plugin/fetcher/http_client.go
@@ -158,12 +158,12 @@ func newHTTPTransport(purl string) *http.Transport {
             KeepAlive: 10 * time.Second,
             DualStack: true,
         }).DialContext,
-        ForceAttemptHTTP2:     true,
-        MaxIdleConns:          100, // Default: 100
-        MaxIdleConnsPerHost:   2,   // Default: 2
+        ForceAttemptHTTP2:     false,
+        MaxIdleConns:          10, // Default: 100
+        MaxIdleConnsPerHost:   5,  // Default: 2
         IdleConnTimeout:       10 * time.Second,
-        TLSHandshakeTimeout:   5 * time.Second,
-        ExpectContinueTimeout: 1 * time.Second,
-        DisableKeepAlives:     false,
+        TLSHandshakeTimeout:   2 * time.Second,
+        ExpectContinueTimeout: 2 * time.Second,
+        DisableKeepAlives:     true,
     }
 }
diff --git a/plugin/metrics/metrics.go b/plugin/metrics/metrics.go
new file mode 100644
index 0000000..b612b80
--- /dev/null
+++ b/plugin/metrics/metrics.go
@@ -0,0 +1,55 @@
+package metrics
+
+import (
+    "sync/atomic"
+)
+
+type (
+    metricsMonitor struct {
+        totalRequests      int64
+        successfulRequests int64
+        failedRequests     int64
+
+        totalLink      int64
+        crawledLink    int64
+        skippedLink    int64
+        duplicatedLink int64
+    }
+)
+
+func NewMetricsMonitor() *metricsMonitor {
+    return &metricsMonitor{}
+}
+
+func (m *metricsMonitor) IncTotalRequests() {
+    atomic.AddInt64(&m.totalRequests, 1)
+}
+func (m *metricsMonitor) IncSuccessfulRequests() {
+    atomic.AddInt64(&m.successfulRequests, 1)
+}
+func (m *metricsMonitor) IncFailedRequests() {
+    atomic.AddInt64(&m.failedRequests, 1)
+}
+func (m *metricsMonitor) IncTotalLink() {
+    atomic.AddInt64(&m.totalLink, 1)
+}
+func (m *metricsMonitor) IncCrawledLink() {
+    atomic.AddInt64(&m.crawledLink, 1)
+}
+func (m *metricsMonitor) IncSkippedLink() {
+    atomic.AddInt64(&m.skippedLink, 1)
+}
+func (m *metricsMonitor) IncDuplicatedLink() {
+    atomic.AddInt64(&m.duplicatedLink, 1)
+}
+func (m *metricsMonitor) Metrics() map[string]int64 {
+    return map[string]int64{
+        "total_requests":      atomic.LoadInt64(&m.totalRequests),
+        "successful_requests": atomic.LoadInt64(&m.successfulRequests),
+        "failed_requests":     atomic.LoadInt64(&m.failedRequests),
+        "total_link":          atomic.LoadInt64(&m.totalLink),
+        "crawled_link":        atomic.LoadInt64(&m.crawledLink),
+        "skipped_link":        atomic.LoadInt64(&m.skippedLink),
+        "duplicated_link":     atomic.LoadInt64(&m.duplicatedLink),
+    }
+}
diff --git a/plugin/monitor/monitor.go b/plugin/monitor/monitor.go
deleted file mode 100644
index 18835b7..0000000
--- a/plugin/monitor/monitor.go
+++ /dev/null
@@ -1,59 +0,0 @@
-package monitor
-
-import (
-    "sync/atomic"
-)
-
-type (
-    metricsMonitor struct {
-        totalRequests      int64
-        successfulRequests int64
-        failedRequests     int64
-        retries            int64
-        redirects          int64
-        totalPages         int64
-        crawledPages       int64
-        skippedPages       int64
-        parsedLinks        int64
-        clientErrors       int64
-        serverErrors       int64
-    }
-)
-
-func NewmetricsMonitor() *metricsMonitor {
-    return &metricsMonitor{}
-}
-
-func (m *metricsMonitor) IncTotalRequests() {
-    atomic.AddInt64(&m.totalRequests, 1)
-}
-func (m *metricsMonitor) IncSuccessfulRequests() {
-    atomic.AddInt64(&m.successfulRequests, 1)
-}
-func (m *metricsMonitor) IncFailedRequests() {
-    atomic.AddInt64(&m.failedRequests, 1)
-}
-func (m *metricsMonitor) IncRetries() {
-    atomic.AddInt64(&m.retries, 1)
-}
-func (m *metricsMonitor) IncRedirects() {
-    atomic.AddInt64(&m.redirects, 1)
-}
-func (m *metricsMonitor) IncTotalPages() {
-    atomic.AddInt64(&m.totalPages, 1)
-}
-func (m *metricsMonitor) IncCrawledPages() {
-    atomic.AddInt64(&m.crawledPages, 1)
-}
-func (m *metricsMonitor) IncSkippedPages() {
-    atomic.AddInt64(&m.skippedPages, 1)
-}
-func (m *metricsMonitor) IncParsedLinks() {
-    atomic.AddInt64(&m.parsedLinks, 1)
-}
-func (m *metricsMonitor) IncClientErrors() {
-    atomic.AddInt64(&m.clientErrors, 1)
-}
-func (m *metricsMonitor) IncServerErrors() {
-    atomic.AddInt64(&m.serverErrors, 1)
-}
diff --git a/wbot.go b/wbot.go
index 3173c26..e189f2c 100644
--- a/wbot.go
+++ b/wbot.go
@@ -37,16 +37,13 @@ type (
         IncTotalRequests()
         IncSuccessfulRequests()
         IncFailedRequests()
-        IncRetries()
-        IncRedirects()
-        IncTotalPages()
-        IncCrawledPages()
-        IncSkippedPages()
-        IncParsedLinks()
+        IncTotalLink()
+        IncCrawledLink()
+        IncSkippedLink()
+        IncDuplicatedLink()

-        IncClientErrors()
-        IncServerErrors()
+        Metrics() map[string]int64
     }

     Request struct {

From 09a2730930912a751cb408ed5075cd7358e087f2 Mon Sep 17 00:00:00 2001
From: twiny
Date: Sun, 18 Feb 2024 01:05:32 +0100
Subject: [PATCH 10/10] content: update readme

- updated readme.
- added .dev extension.
- fixed mod tidy.

---
 README.md         | 57 +++++++++++++++++++++++++++++++++++++------
 crawler/config.go |  2 +-
 go.mod            | 13 ++++-------
 go.sum            | 30 +++++-------------------
 utilities.go      |  4 ++--
 5 files changed, 63 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index 229172b..5212bb0 100644
--- a/README.md
+++ b/README.md
@@ -9,15 +9,58 @@ A configurable, thread-safe web crawler, provides a minimal interface for crawli
 - Memory-efficient, thread-safe.
 - Provides built-in interface: Fetcher, Store, Queue & a Logger.

-## [Examples & API](https://github.com/twiny/wbot/wiki)
+## API

-## TODO
+WBot provides a minimal API for crawling web pages.

-- [ ] Add support for robots.txt.
-- [ ] Add test cases.
-- [ ] Implement `Fetch` using Chromedp.
-- [ ] Add more examples.
-- [ ] Add documentation.
+```go
+Run(links ...string) error
+OnReponse(fn func(*wbot.Response))
+Metrics() map[string]int64
+Shutdown()
+```
+
+## Usage
+
+```go
+package main
+
+import (
+    "fmt"
+    "log"
+
+    "github.com/rs/zerolog"
+    "github.com/twiny/wbot"
+    "github.com/twiny/wbot/crawler"
+)
+
+func main() {
+    bot := crawler.New(
+        crawler.WithParallel(50),
+        crawler.WithMaxDepth(5),
+        crawler.WithRateLimit(&wbot.RateLimit{
+            Hostname: "*",
+            Rate:     "10/1s",
+        }),
+        crawler.WithLogLevel(zerolog.DebugLevel),
+    )
+    defer bot.Shutdown()
+
+    // read responses
+    bot.OnReponse(func(resp *wbot.Response) {
+        fmt.Printf("crawled: %s\n", resp.URL.String())
+    })
+
+    if err := bot.Run(
+        "https://crawler-test.com/",
+    ); err != nil {
+        log.Fatal(err)
+    }
+
+    log.Printf("finished crawling\n")
+}
+
+```

 ### Bugs
diff --git a/crawler/config.go b/crawler/config.go
index c76004c..1a7de4b 100644
--- a/crawler/config.go
+++ b/crawler/config.go
@@ -9,7 +9,7 @@ import (
 const (
     defaultReferrer    = "https://www.google.com/search"
-    defaultUserAgent   = "WBot/0.1.6 (+https://github.com/twiny/wbot)"
+    defaultUserAgent   = "WBot/v0.2.0 (+https://github.com/twiny/wbot)"
     defaultTimeout     = 10 * time.Second
     defaultMaxBodySize = int64(1024 * 1024 * 5) // 5MB
 )
diff --git a/go.mod b/go.mod
index 48eab8d..cc34cf0 100644
--- a/go.mod
+++ b/go.mod
@@ -4,8 +4,6 @@ go 1.22.0
 require (
     github.com/PuerkitoBio/goquery v1.8.1
-    github.com/go-echarts/statsview v0.3.4
-    github.com/pkg/profile v1.7.0
     github.com/rs/zerolog v1.32.0
     github.com/temoto/robotstxt v1.1.2
     github.com/twiny/flare v0.1.0
@@ -17,13 +15,10 @@ require (
 require (
     github.com/andybalholm/cascadia v1.3.1 // indirect
     github.com/benbjohnson/clock v1.3.0 // indirect
-    github.com/felixge/fgprof v0.9.3 // indirect
-    github.com/go-echarts/go-echarts/v2 v2.2.3 // indirect
-    github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect
     github.com/mattn/go-colorable v0.1.13 // indirect
     github.com/mattn/go-isatty v0.0.19 // indirect
-    github.com/rs/cors v1.7.0 // indirect
-    golang.org/x/net v0.12.0 // indirect
-    golang.org/x/sys v0.12.0 // indirect
-    golang.org/x/text v0.11.0 // indirect
+    github.com/stretchr/testify v1.8.0 // indirect
+    golang.org/x/net v0.17.0 // indirect
+    golang.org/x/sys v0.13.0 // indirect
+    golang.org/x/text v0.13.0 // indirect
 )
diff --git a/go.sum b/go.sum
index af4c519..5ca2f34 100644
--- a/go.sum
+++ b/go.sum
@@ -7,20 +7,11 @@ github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEq
 github.com/benbjohnson/clock v1.3.0 h1:ip6w0uFQkncKQ979AypyG0ER7mqUSBdKLOgAle/AT8A=
 github.com/benbjohnson/clock v1.3.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
 github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0=
-github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
-github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
-github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
 github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I=
 github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=
-github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw=
-github.com/go-echarts/go-echarts/v2 v2.2.3 h1:H8oPdUpzuiV2K8S4xYZa1JRNjP3U0h7HVqvhPrmCk1A=
-github.com/go-echarts/go-echarts/v2 v2.2.3/go.mod h1:6TOomEztzGDVDkOSCFBq3ed7xOYfbOqhaBzD0YV771A=
-github.com/go-echarts/statsview v0.3.4 h1:CCuytRAutdnF901NrR4BzSjHXjUp8OyA3/iopgG/1/Y=
-github.com/go-echarts/statsview v0.3.4/go.mod h1:AehKjL9cTFMeIo5QdV8sQO43vFmfY65X5GMWa3XMciY=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
 github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
 github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
@@ -31,31 +22,20 @@ github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN
 github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/go-github/v50 v50.2.0/go.mod h1:VBY8FB6yPIjrtKhozXv4FQupxKLS6H4m6xFZlT43q8Q=
 github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU=
-github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y=
-github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg=
-github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w=
-github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
-github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
-github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
 github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
 github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
 github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pkg/profile v1.7.0 h1:hnbDkaNWPCLMO9wGLdBFTIZvzDrDfBM2072E1S9gJkA=
-github.com/pkg/profile v1.7.0/go.mod h1:8Uer0jas47ZQMJ7VD+OHknK4YDY07LPUC6dEvqDjvNo=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik=
-github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU=
 github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
 github.com/rs/zerolog v1.32.0 h1:keLypqrlIjaFsbmJOBdB/qvyF8KEtCWHwobLp5l/mQ0=
 github.com/rs/zerolog v1.32.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
-github.com/stretchr/testify v1.6.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
@@ -85,8 +65,9 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
 golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
 golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
 golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
-golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50=
 golang.org/x/net v0.12.0/go.mod h1:zEVYFnQC7m/vmpQFELhcD1EWkZlX69l4oqgmer6hfKA=
+golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
+golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
 golang.org/x/oauth2 v0.6.0/go.mod h1:ycmewcwgD4Rpr3eZJLSB4Kyyljb3qDh40vJ8STE5HKw=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -103,8 +84,9 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.12.0 h1:CM0HF96J0hcLAwsHPJZjfdNzs0gftsLfgKt57wWHJ0o=
 golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
+golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
 golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
@@ -119,8 +101,9 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
 golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
-golang.org/x/text v0.11.0 h1:LAntKIrcmeSKERyiOh0XMV39LXS8IE9UL2yP7+f5ij4=
 golang.org/x/text v0.11.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
 golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
@@ -133,7 +116,6 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ
 google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
 google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/utilities.go b/utilities.go
index 93ad490..4918895 100644
--- a/utilities.go
+++ b/utilities.go
@@ -13,8 +13,8 @@ var tlds = map[string]bool{
     "as": true, "asia": true, "at": true, "au": true, "ax": true, "be": true,
     "bg": true, "bi": true, "biz": true, "bj": true, "br": true, "by": true,
     "ca": true, "cat": true, "cc": true, "cl": true, "cn": true, "co": true,
-    "com": true, "coop": true, "cx": true, "de": true, "dk": true, "dm": true,
-    "dz": true, "edu": true, "ee": true, "eu": true, "fi": true, "fo": true,
+    "com": true, "coop": true, "cx": true, "de": true, "dev": true, "dk": true,
+    "dm": true, "dz": true, "edu": true, "ee": true, "eu": true, "fi": true, "fo": true,
     "fr": true, "ge": true, "gl": true, "gov": true, "gs": true, "hk": true,
     "hr": true, "hu": true, "id": true, "ie": true, "in": true, "info": true,
     "int": true, "io": true, "ir": true, "is": true, "je": true, "jobs": true,
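
For reference, a minimal sketch (not part of the patches above) of reading the counters this series introduces, building on the README example from PATCH 10/10. It assumes `Run` blocks until the crawl finishes, as the `c.wg.Wait()` in `crawler/crawler.go` suggests, so `Metrics()` is read after it returns; the key names come from `plugin/metrics/metrics.go`.

```go
package main

import (
    "fmt"
    "log"

    "github.com/twiny/wbot"
    "github.com/twiny/wbot/crawler"
)

func main() {
    // Sketch only: a small crawl whose sole purpose is to inspect the
    // counters exposed by the new Metrics() method.
    bot := crawler.New(crawler.WithMaxDepth(2))
    defer bot.Shutdown()

    // Drain responses so the crawl is never blocked on its stream.
    bot.OnReponse(func(resp *wbot.Response) {})

    if err := bot.Run("https://crawler-test.com/"); err != nil {
        log.Fatal(err)
    }

    // Counters defined in plugin/metrics/metrics.go: total_requests,
    // successful_requests, failed_requests, total_link, crawled_link,
    // skipped_link, duplicated_link.
    for name, value := range bot.Metrics() {
        fmt.Printf("%s: %d\n", name, value)
    }
}
```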