Documentation
Index ¶
- Constants
- Variables
- func ContextWithLogger(ctx context.Context, logger logging.Logger) context.Context
- func GetLoggerFromContext(ctx context.Context) logging.Logger
- func WithCache(cache Cacher) func(*ScrapeMate) error
- func WithConcurrency(concurrency int) func(*ScrapeMate) error
- func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
- func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error
- func WithFailed() func(*ScrapeMate) error
- func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error
- func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error
- func WithInitJob(job IJob) func(*ScrapeMate) error
- func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
- func WithLogger(log logging.Logger) func(*ScrapeMate) error
- type BrowserPage
- type Cacher
- type CsvCapable
- type HTMLParser
- type HTTPFetcher
- type IJob
- type Job
- func (j *Job) BrowserActions(_ context.Context, page BrowserPage) Response
- func (j *Job) DoCheckResponse(resp *Response) bool
- func (j *Job) DoScreenshot() bool
- func (j *Job) GetBody() []byte
- func (j *Job) GetCacheKey() string
- func (j *Job) GetFullURL() string
- func (j *Job) GetHeaders() map[string]string
- func (j *Job) GetID() string
- func (j *Job) GetMaxRetries() int
- func (j *Job) GetMaxRetryDelay() time.Duration
- func (j *Job) GetMethod() string
- func (j *Job) GetParentID() string
- func (j *Job) GetPriority() int
- func (j *Job) GetRetryPolicy() RetryPolicy
- func (j *Job) GetTimeout() time.Duration
- func (j *Job) GetURL() string
- func (j *Job) GetURLParams() map[string]string
- func (j *Job) Process(_ context.Context, _ *Response) (any, []IJob, error)
- func (j *Job) ProcessOnFetchError() bool
- func (j *Job) String() string
- func (j *Job) UseInResults() bool
- type JobProvider
- type Locator
- type PageResponse
- type Proxy
- type ProxyRotator
- type Response
- type Result
- type ResultWriter
- type RetryPolicy
- type ScrapeMate
- func (s *ScrapeMate) Close() error
- func (s *ScrapeMate) Concurrency() int
- func (s *ScrapeMate) DoJob(ctx context.Context, job IJob) (result any, next []IJob, err error)
- func (s *ScrapeMate) Done() <-chan struct{}
- func (s *ScrapeMate) Err() error
- func (s *ScrapeMate) Failed() <-chan IJob
- func (s *ScrapeMate) Results() <-chan Result
- func (s *ScrapeMate) Start() error
- type WaitUntilState
Constants ¶
const (
	// DefaultUserAgent is the default user agent scrapemate uses
	DefaultUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36"
	// RetryJob retries a job
	RetryJob = 0
	// DiscardJob discards the job in case crawling fails
	DiscardJob = 1
	// RefreshIP refreshes the IP and then retries the job
	RefreshIP = 2
	// StopScraping exits scraping completely when an error happens
	StopScraping = 3
	// DefaultMaxRetryDelay is the default maximum delay between two consecutive retries
	DefaultMaxRetryDelay = 2 * time.Second
	// PriorityHigh high priority
	PriorityHigh = 0
	// PriorityMedium medium priority
	PriorityMedium = 1
	// PriorityLow low priority
	PriorityLow = 2
)
Variables ¶
var (
	// ErrorNoJobProvider is returned when no job provider is set during initialization
	ErrorNoJobProvider = errors.New("no job provider set")
	// ErrorExitSignal is returned when scrapemate exits because of a system interrupt
	ErrorExitSignal = errors.New("exit signal received")
	// ErrorNoLogger is returned when you try to initialize with a nil logger
	ErrorNoLogger = errors.New("no logger set")
	// ErrorNoContext is returned when you try to initialize with a nil context
	ErrorNoContext = errors.New("no context set")
	// ErrorConcurrency is returned when you try to initialize with concurrency < 1
	ErrorConcurrency = errors.New("concurrency must be greater than 0")
	// ErrorNoHTMLFetcher is returned when you try to initialize with a nil HTTPFetcher
	ErrorNoHTMLFetcher = errors.New("no http fetcher set")
	// ErrorNoHTMLParser is returned when you try to initialize with a nil HTMLParser
	ErrorNoHTMLParser = errors.New("no html parser set")
	// ErrorNoCacher is returned when you try to initialize with a nil Cacher
	ErrorNoCacher = errors.New("no cacher set")
	// ErrorNotCsvCapable is returned when you try to write a csv file without csv-capable Data
	ErrorNotCsvCapable = errors.New("not csv capable")
	// ErrInactivityTimeout is returned when the system exits because of inactivity
	ErrInactivityTimeout = errors.New("inactivity timeout")
)
Functions ¶
func ContextWithLogger ¶ added in v0.4.3
ContextWithLogger returns a new context with the logger
func GetLoggerFromContext ¶ added in v0.2.2
GetLoggerFromContext returns a logger from the context or a default logger
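As an illustration (myLogger is any value satisfying logging.Logger and is not part of the package), the two helpers are typically used together:

// Attach a logger to a context.
ctx := scrapemate.ContextWithLogger(context.Background(), myLogger)

// Later, e.g. inside a job's Process method, recover it.
// GetLoggerFromContext falls back to a default logger if none was attached.
log := scrapemate.GetLoggerFromContext(ctx)
_ = log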
func WithCache ¶ added in v0.1.1
func WithCache(cache Cacher) func(*ScrapeMate) error
WithCache sets the cache for the scrapemate
func WithConcurrency ¶
func WithConcurrency(concurrency int) func(*ScrapeMate) error
WithConcurrency sets the concurrency for the scrapemate
func WithContext ¶
func WithContext(ctx context.Context, cancelFn context.CancelCauseFunc) func(*ScrapeMate) error
WithContext sets the context for the scrapemate
func WithExitBecauseOfInactivity ¶ added in v0.5.0
func WithExitBecauseOfInactivity(duration time.Duration) func(*ScrapeMate) error
WithExitBecauseOfInactivity makes scrapemate exit with ErrInactivityTimeout when there is no job activity for the given duration
func WithFailed ¶
func WithFailed() func(*ScrapeMate) error
WithFailed sets the failed jobs channel for the scrapemate
func WithHTMLParser ¶ added in v0.4.0
func WithHTMLParser(parser HTMLParser) func(*ScrapeMate) error
WithHTMLParser sets the html parser for the scrapemate
func WithHTTPFetcher ¶ added in v0.4.0
func WithHTTPFetcher(client HTTPFetcher) func(*ScrapeMate) error
WithHTTPFetcher sets the http fetcher for the scrapemate
func WithInitJob ¶ added in v0.4.3
func WithInitJob(job IJob) func(*ScrapeMate) error
WithInitJob sets the first job to be processed. It will be processed before the jobs from the job provider. It is useful if you want to start the scraper with a specific job instead of the first one from the job provider. A real use case is when you want to obtain some cookies before starting the scraping process (e.g. a login). Important: the results from this job will be discarded!
func WithJobProvider ¶
func WithJobProvider(provider JobProvider) func(*ScrapeMate) error
WithJobProvider sets the job provider for the scrapemate
func WithLogger ¶
func WithLogger(log logging.Logger) func(*ScrapeMate) error
WithLogger sets the logger for the scrapemate
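Taken together, a minimal construction might look like the following sketch. The provider, fetcher, and parser values are assumptions standing in for any JobProvider, HTTPFetcher, and HTMLParser implementations (for example the ones under the adapters directory):

ctx, cancel := context.WithCancelCause(context.Background())

mate, err := scrapemate.New(
	scrapemate.WithContext(ctx, cancel),
	scrapemate.WithJobProvider(provider), // provider: your JobProvider implementation
	scrapemate.WithHTTPFetcher(fetcher),  // fetcher: your HTTPFetcher implementation
	scrapemate.WithHTMLParser(parser),    // parser: your HTMLParser implementation
	scrapemate.WithConcurrency(4),
	scrapemate.WithExitBecauseOfInactivity(time.Minute),
)
if err != nil {
	// a nil option value or a missing job provider surfaces here, e.g. ErrorNoJobProvider
	panic(err)
}
_ = mate

Running the scraper and draining the results channel is shown in the sketch under Start below.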
Types ¶
type BrowserPage ¶ added in v1.0.0
type BrowserPage interface {
// Goto navigates to a URL and waits for the specified state.
// Returns the page response with status code, headers, and body.
Goto(url string, waitUntil WaitUntilState) (*PageResponse, error)
// URL returns the current page URL.
URL() string
// Content returns the full HTML content of the page.
Content() (string, error)
// Reload reloads the current page.
Reload(waitUntil WaitUntilState) error
// Screenshot takes a screenshot of the page.
// If fullPage is true, captures the entire scrollable page.
Screenshot(fullPage bool) ([]byte, error)
// Eval executes JavaScript in the page context and returns the result.
Eval(js string, args ...any) (any, error)
// WaitForURL waits until the page URL matches the given pattern.
WaitForURL(url string, timeout time.Duration) error
// WaitForSelector waits for an element matching the selector to appear.
WaitForSelector(selector string, timeout time.Duration) error
// WaitForTimeout waits for the specified duration.
// Note: This is generally discouraged in favor of waiting for specific conditions.
WaitForTimeout(timeout time.Duration)
// Locator creates a locator for finding elements matching the selector.
Locator(selector string) Locator
// Close closes the page and releases resources.
Close() error
// Unwrap returns the underlying page object (e.g., playwright.Page or *rod.Page).
// This allows users to access library-specific features when needed.
Unwrap() any
}
BrowserPage is an abstraction over browser page implementations. It provides a common interface for browser automation libraries such as Playwright and go-rod.
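A hedged sketch of a job overriding BrowserActions with this interface; myBrowserJob (assumed to embed Job) and the selector are illustrative only:

func (j *myBrowserJob) BrowserActions(_ context.Context, page scrapemate.BrowserPage) scrapemate.Response {
	var resp scrapemate.Response

	pageResp, err := page.Goto(j.GetFullURL(), scrapemate.WaitUntilDOMContentLoaded)
	if err != nil {
		resp.Error = err
		return resp
	}

	// Wait for an element that signals the content is ready (selector is hypothetical).
	if err := page.WaitForSelector("div.results", 10*time.Second); err != nil {
		resp.Error = err
		return resp
	}

	body, err := page.Content()
	if err != nil {
		resp.Error = err
		return resp
	}

	resp.URL = pageResp.URL
	resp.StatusCode = pageResp.StatusCode
	resp.Headers = pageResp.Headers
	resp.Body = []byte(body)

	return resp
}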
type Cacher ¶ added in v0.1.1
type Cacher interface {
Close() error
Get(ctx context.Context, key string) (Response, error)
Set(ctx context.Context, key string, value *Response) error
}
Cacher is an interface for caches
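A minimal in-memory sketch that satisfies Cacher; a real implementation would add eviction and likely persist to disk or an external store, and the cache-miss error below is arbitrary:

type memoryCache struct {
	mu    sync.RWMutex
	items map[string]scrapemate.Response
}

func newMemoryCache() *memoryCache {
	return &memoryCache{items: make(map[string]scrapemate.Response)}
}

func (c *memoryCache) Get(_ context.Context, key string) (scrapemate.Response, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	resp, ok := c.items[key]
	if !ok {
		return scrapemate.Response{}, errors.New("cache miss")
	}
	return resp, nil
}

func (c *memoryCache) Set(_ context.Context, key string, value *scrapemate.Response) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.items[key] = *value
	return nil
}

func (c *memoryCache) Close() error { return nil }

It can then be plugged in with WithCache(newMemoryCache()).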
type CsvCapable ¶ added in v0.2.1
CsvCapable is an interface for types that can be converted to csv. It is used to convert the Data of a Result to csv.
type HTMLParser ¶ added in v0.4.0
HTMLParser is an interface for html parsers
type HTTPFetcher ¶ added in v0.4.0
HTTPFetcher is an interface for http fetchers
type IJob ¶
type IJob interface {
fmt.Stringer
// GetID returns the unique identifier of the job.
GetID() string
// GetParentID returns the parent id of the job
GetParentID() string
// GetMethod returns the http method to use
GetMethod() string
// GetBody returns the body of the request
GetBody() []byte
// GetURL returns the url to request
GetURL() string
// GetHeaders returns the headers to use
GetHeaders() map[string]string
// GetURLParams returns the url params to use
GetURLParams() map[string]string
// GetFullURL returns the full url to request
// it includes the url params
GetFullURL() string
// GetTimeout returns the timeout of the job
GetTimeout() time.Duration
// GetPriority returns the priority of the job
GetPriority() int
// DoCheckResponse checks the response of the job
DoCheckResponse(resp *Response) bool
// GetRetryPolicy returns the retry policy to apply when the response is rejected
GetRetryPolicy() RetryPolicy
// GetMaxRetries returns the max retries of the job
GetMaxRetries() int
// Process processes the job
Process(ctx context.Context, resp *Response) (any, []IJob, error)
// GetMaxRetryDelay returns the delay to wait before retrying
GetMaxRetryDelay() time.Duration
// BrowserActions performs actions in the browser when a browser-based fetcher is used
BrowserActions(ctx context.Context, page BrowserPage) Response
// DoScreenshot reports whether a screenshot of the page should be taken
// Only works if the scraper uses jsfetcher
DoScreenshot() bool
// GetCacheKey returns the key to use for caching
GetCacheKey() string
// UseInResults returns true if the job should be used in the results
UseInResults() bool
// ProcessOnFetchError returns true if the job should be processed even if the fetch failed
ProcessOnFetchError() bool
}
IJob is a job to be processed by the scrapemate
type Job ¶
type Job struct {
// ID is an identifier for the job
ID string
// ParentID is the parent id of the job
ParentID string
// Method can be one valid HTTP method
Method string
// Body is the request's body
Body []byte
// URL is the url to send the request to
URL string
// Headers is the map of headers to use in HTTP
Headers map[string]string
// URLParams are the url parameters to use in the query string
URLParams map[string]string
// Timeout is the timeout of that job. By timeout we mean the time
// it takes to finish a single crawl
Timeout time.Duration
// Priority is a number indicating the priority. By convention, the lower
// the number the higher the priority
Priority int
// MaxRetries defines the maximum number of retries when a job fails
MaxRetries int
// CheckResponse is a function that takes as an input a Response and returns:
// true: when the response is to be accepted
// false: when the response is to be rejected
// By default a response is accepted if the status code is 200
CheckResponse func(resp *Response) bool
// RetryPolicy can be one of:
// RetryJob: retry the job until it succeeds
// DiscardJob: discard rejected responses and do not retry the job
// RefreshIP: similar to RetryJob, with one important difference:
// the IP is refreshed before the job is retried.
RetryPolicy RetryPolicy
// MaxRetryDelay caps the retry backoff. By default, when a job is rejected it is retried
// with an exponential backoff up to MaxRetries times. If the sleep time between retries
// exceeds MaxRetryDelay, it is capped to that value. (Default is 2 seconds)
MaxRetryDelay time.Duration
// TakeScreenshot if true takes a screenshot of the page
TakeScreenshot bool
Response Response
}
Job is the base job implementation; custom jobs typically embed it and override only the methods they need, most often Process.
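A hedged sketch of that pattern; titleJob, the URL, and the field values are illustrative:

type titleJob struct {
	scrapemate.Job
}

// Process overrides the embedded Job's default and turns the response body into the job's data.
func (j *titleJob) Process(_ context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	data := string(resp.Body)

	// Returning a non-empty slice here enqueues follow-up jobs.
	var next []scrapemate.IJob

	return data, next, nil
}

// Only the fields you care about need to be set when constructing the job.
var seed = &titleJob{
	Job: scrapemate.Job{
		ID:         "seed-1",
		Method:     http.MethodGet,
		URL:        "https://example.com",
		Timeout:    10 * time.Second,
		MaxRetries: 2,
		Priority:   scrapemate.PriorityHigh,
	},
}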
func (*Job) BrowserActions ¶
func (j *Job) BrowserActions(_ context.Context, page BrowserPage) Response
BrowserActions is the function that will be executed in the browser. This default implementation just returns the response; override it to perform actions in the browser.
func (*Job) DoCheckResponse ¶
DoCheckResponse checks the response of the job
func (*Job) DoScreenshot ¶
DoScreenshot reports whether a screenshot of the page should be taken. It is here since it is a common use case.
func (*Job) GetCacheKey ¶ added in v0.1.1
GetCacheKey returns the key to use for caching
func (*Job) GetFullURL ¶ added in v0.2.2
GetFullURL returns the full url to request, including the url params
func (*Job) GetHeaders ¶
GetHeaders returns the headers to use
func (*Job) GetMaxRetries ¶
GetMaxRetries returns the max retries of the job
func (*Job) GetMaxRetryDelay ¶
GetMaxRetryDelay returns the maximum delay to wait before retrying
func (*Job) GetParentID ¶ added in v0.5.1
GetParentID returns the parent id of the job
func (*Job) GetPriority ¶
GetPriority returns the priority of the job
func (*Job) GetRetryPolicy ¶
func (j *Job) GetRetryPolicy() RetryPolicy
GetRetryPolicy returns the retry policy to apply when the response is rejected
func (*Job) GetTimeout ¶
GetTimeout returns the timeout of the job
func (*Job) GetURLParams ¶ added in v0.4.0
GetURLParams returns the url params to use
func (*Job) ProcessOnFetchError ¶ added in v0.5.3
ProcessOnFetchError returns true if the job should be processed even if the fetch failed
func (*Job) UseInResults ¶ added in v0.2.1
UseInResults returns true if the job should be used in the results
type JobProvider ¶
type JobProvider interface {
Jobs(ctx context.Context) (<-chan IJob, <-chan error)
// Push pushes a job to the job provider
Push(ctx context.Context, job IJob) error
}
JobProvider is an interface for job providers. A job provider is a service that provides jobs to scrapemate; scrapemate calls the job provider to get jobs.
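A minimal channel-backed sketch that satisfies JobProvider; a production provider would typically persist jobs and honor priorities, neither of which is shown here:

type memoryProvider struct {
	jobs chan scrapemate.IJob
}

func newMemoryProvider(buffer int) *memoryProvider {
	return &memoryProvider{jobs: make(chan scrapemate.IJob, buffer)}
}

// Jobs streams pushed jobs to scrapemate until the context is cancelled.
func (p *memoryProvider) Jobs(ctx context.Context) (<-chan scrapemate.IJob, <-chan error) {
	out := make(chan scrapemate.IJob)
	errc := make(chan error, 1)

	go func() {
		defer close(out)
		for {
			select {
			case <-ctx.Done():
				errc <- ctx.Err()
				return
			case job := <-p.jobs:
				select {
				case out <- job:
				case <-ctx.Done():
					errc <- ctx.Err()
					return
				}
			}
		}
	}()

	return out, errc
}

// Push queues a job for the workers to pick up.
func (p *memoryProvider) Push(ctx context.Context, job scrapemate.IJob) error {
	select {
	case p.jobs <- job:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}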
type Locator ¶ added in v1.0.0
type Locator interface {
// Click clicks on the first matching element.
Click(timeout time.Duration) error
// Count returns the number of matching elements.
Count() (int, error)
// First returns a locator for the first matching element.
First() Locator
}
Locator represents an element locator for finding elements on the page.
type PageResponse ¶ added in v1.0.0
type PageResponse struct {
// URL is the final URL after any redirects.
URL string
// StatusCode is the HTTP status code of the response.
StatusCode int
// Headers contains the response headers.
Headers http.Header
// Body contains the response body bytes.
Body []byte
}
PageResponse contains the response information from a page navigation.
type ProxyRotator ¶ added in v0.7.0
type ProxyRotator interface {
RoundTrip(req *http.Request) (*http.Response, error)
Next() Proxy
Proxies() []string
}
ProxyRotator is an interface for proxy rotators
type Response ¶
type Response struct {
URL string
StatusCode int
Headers http.Header
Duration time.Duration
Body []byte
Error error
Meta map[string]any
Screenshot []byte
// Document is the parsed document
// if you don't set an html parser the document will be nil
// Since each html parser has its own document type
// the document is an interface
// You need to cast it to the type of the parser you are using
// For example if you are using goquery you need to cast it to *goquery.Document
// If you are using the stdlib parser net/html then it will be *html.Node
Document any
}
Response is the struct that is returned when crawling finishes
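For instance, with a goquery-based HTML parser configured, a Process implementation can cast Document as in this sketch (myJob is hypothetical):

func (j *myJob) Process(_ context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) {
	doc, ok := resp.Document.(*goquery.Document)
	if !ok {
		return nil, nil, errors.New("expected a *goquery.Document; is an HTML parser configured?")
	}

	// Extract the page title as the job's result.
	return doc.Find("title").Text(), nil, nil
}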
type ResultWriter ¶ added in v0.2.1
ResultWriter is an interface for result writers
type RetryPolicy ¶
type RetryPolicy int
RetryPolicy determines what happens when a response is rejected; see the RetryJob, DiscardJob, RefreshIP, and StopScraping constants.
type ScrapeMate ¶ added in v0.2.1
type ScrapeMate struct {
// contains filtered or unexported fields
}
ScrapeMate contains unexported fields
func New ¶
func New(options ...func(*ScrapeMate) error) (*ScrapeMate, error)
New creates a new scrapemate
func (*ScrapeMate) Close ¶ added in v0.7.1
func (s *ScrapeMate) Close() error
func (*ScrapeMate) Concurrency ¶ added in v0.2.1
func (s *ScrapeMate) Concurrency() int
Concurrency returns how many workers are running in parallel
func (*ScrapeMate) Done ¶ added in v0.2.1
func (s *ScrapeMate) Done() <-chan struct{}
Done returns a channel that's closed when the work is done
func (*ScrapeMate) Err ¶ added in v0.2.1
func (s *ScrapeMate) Err() error
Err returns the error that caused scrapemate's context cancellation
func (*ScrapeMate) Failed ¶ added in v0.2.1
func (s *ScrapeMate) Failed() <-chan IJob
Failed returns the channel that contains the jobs that failed. It's nil if you don't use the WithFailed option
func (*ScrapeMate) Results ¶ added in v0.2.1
func (s *ScrapeMate) Results() <-chan Result
Results returns a channel containing the results
func (*ScrapeMate) Start ¶ added in v0.2.1
func (s *ScrapeMate) Start() error
Start starts the scraper
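Start blocks until scraping stops, so it is usually run in its own goroutine while the results channel is drained. A hedged sketch of the consumption loop, using Done and Err to detect why the run ended:

func run(mate *scrapemate.ScrapeMate) error {
	go func() {
		_ = mate.Start() // blocks until scraping stops
	}()

	for {
		select {
		case result := <-mate.Results():
			_ = result // hand the result to a ResultWriter or process it here
		case <-mate.Done():
			// work is done or the context was cancelled; Err reports the cause
			err := mate.Err()
			if errors.Is(err, scrapemate.ErrInactivityTimeout) {
				err = nil // inactivity is an expected way to finish in this sketch
			}
			_ = mate.Close()
			return err
		}
	}
}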
type WaitUntilState ¶ added in v1.0.0
type WaitUntilState string
WaitUntilState represents when navigation is considered complete.
const (
	// WaitUntilLoad waits until the load event is fired.
	WaitUntilLoad WaitUntilState = "load"
	// WaitUntilDOMContentLoaded waits until the DOMContentLoaded event is fired.
	WaitUntilDOMContentLoaded WaitUntilState = "domcontentloaded"
	// WaitUntilNetworkIdle waits until there are no network connections for at least 500 ms.
	WaitUntilNetworkIdle WaitUntilState = "networkidle"
)
Source Files ¶
Directories ¶
| Path | Synopsis |
|---|---|
| adapters | |
| cmd/proxy-test | command |
| mock | Package mock is a generated GoMock package. |