1 Star 0 Fork 0

wiseai/goalng爬虫框架colly

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
colly.go 44.28 KB
一键复制 编辑 原始数据 按行查看 历史
WGH 提交于 2024-03-26 00:30 . Implement content sniffing for HTML parsing
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package colly implements an HTTP scraping framework
package colly
import (
"bytes"
"context"
"crypto/rand"
"encoding/json"
"errors"
"fmt"
"hash/fnv"
"io"
"log"
"net/http"
"net/http/cookiejar"
"net/url"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xmlquery"
"github.com/gocolly/colly/v2/debug"
"github.com/gocolly/colly/v2/storage"
"github.com/kennygrant/sanitize"
whatwgUrl "github.com/nlnwa/whatwg-url/url"
"github.com/temoto/robotstxt"
"google.golang.org/appengine/urlfetch"
)
// A CollectorOption sets an option on a Collector.
// NewCollector applies the options in the order they are passed, after Init
// has installed the default configuration.
type CollectorOption func(*Collector)
// Collector provides the scraper instance for a scraping job.
// Create one with NewCollector, which runs Init to install the defaults.
type Collector struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
// Headers holds custom request headers. They are copied into every request
// that is started without an explicit header set.
Headers *http.Header
// MaxDepth limits the recursion depth of visited URLs.
// Set it to 0 for infinite recursion (default).
MaxDepth int
// AllowedDomains is a domain whitelist. Entries are compared for exact
// equality with the hostname of the requested URL.
// Leave it blank to allow any domains to be visited
AllowedDomains []string
// DisallowedDomains is a domain blacklist. It is checked before
// AllowedDomains, using the same exact hostname comparison.
DisallowedDomains []string
// DisallowedURLFilters is a list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the
// request will be stopped. DisallowedURLFilters will
// be evaluated before URLFilters
// Leave it blank to allow any URLs to be visited
DisallowedURLFilters []*regexp.Regexp
// URLFilters is a list of regular expressions which restricts
// visiting URLs. When the list is not empty, a URL has to match one of
// the rules, otherwise the request is stopped with ErrNoURLFiltersMatch.
// DisallowedURLFilters will be evaluated before URLFilters
// Leave it blank to allow any URLs to be visited
URLFilters []*regexp.Regexp
// AllowURLRevisit allows multiple downloads of the same URL
AllowURLRevisit bool
// MaxBodySize is the limit of the retrieved response body in bytes.
// 0 means unlimited.
// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
MaxBodySize int
// CacheDir specifies a location where GET requests are cached as files.
// When it's not defined, caching is disabled.
CacheDir string
// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
// the target host's robots.txt file. See http://www.robotstxt.org/ for more
// information. Init sets it to true, so robots.txt is ignored by default.
IgnoreRobotsTxt bool
// Async turns on asynchronous network communication. Use Collector.Wait() to
// be sure all requests have been finished.
Async bool
// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
// to true to enable it.
ParseHTTPErrorResponse bool
// ID is the unique identifier of a collector. Init assigns it from a
// package-wide counter.
ID uint32
// DetectCharset can enable character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
DetectCharset bool
// redirectHandler allows control on how a redirect will be managed
// use c.SetRedirectHandler to set this value
redirectHandler func(req *http.Request, via []*http.Request) error
// CheckHead performs a HEAD request before every GET to pre-validate the response
CheckHead bool
// TraceHTTP enables capturing and reporting request performance for crawler tuning.
// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
TraceHTTP bool
// Context is the context that will be used for HTTP requests. You can set this
// to support clean cancellation of scraping.
Context context.Context
// MaxRequests limit the number of requests done by the instance.
// Set it to 0 for infinite requests (default).
MaxRequests uint32
// store records which requests have already been visited.
store storage.Storage
// debugger is the debugger attached with the Debugger option or SetDebugger.
debugger debug.Debugger
// robotsMap caches parsed robots.txt data per URL host; guarded by lock.
robotsMap map[string]*robotstxt.RobotsData
// Registered callbacks; the On* methods append to these slices under lock.
htmlCallbacks []*htmlCallbackContainer
xmlCallbacks []*xmlCallbackContainer
requestCallbacks []RequestCallback
responseCallbacks []ResponseCallback
responseHeadersCallbacks []ResponseHeadersCallback
errorCallbacks []ErrorCallback
scrapedCallbacks []ScrapedCallback
// requestCount is the number of Request values created so far. It is
// updated with sync/atomic and also provides the request IDs.
requestCount uint32
// responseCount is the number of responses that got past error handling
// in fetch. It is updated with sync/atomic.
responseCount uint32
// backend wraps the http.Client that performs the transfers.
backend *httpBackend
// wg tracks the fetches that are in flight; Wait blocks on it.
wg *sync.WaitGroup
// lock guards robotsMap and the callback slices.
lock *sync.RWMutex
}
// RequestCallback is the function type accepted by OnRequest.
type RequestCallback func(*Request)
// ResponseHeadersCallback is the function type accepted by OnResponseHeaders.
type ResponseHeadersCallback func(*Response)
// ResponseCallback is the function type accepted by OnResponse.
type ResponseCallback func(*Response)
// HTMLCallback is the function type accepted by OnHTML.
type HTMLCallback func(*HTMLElement)
// XMLCallback is the function type accepted by OnXML.
type XMLCallback func(*XMLElement)
// ErrorCallback is the function type accepted by OnError.
type ErrorCallback func(*Response, error)
// ScrapedCallback is the function type accepted by OnScraped.
type ScrapedCallback func(*Response)
// ProxyFunc is the function type of proxy setter functions: it receives the
// outgoing request and returns the proxy URL to use, or an error.
type ProxyFunc func(*http.Request) (*url.URL, error)
// AlreadyVisitedError reports that a URL was not fetched because it had
// been visited before.
//
// Visit returns it synchronously when the URL handed to Visit is already
// known as visited.
//
// If the already visited URL only shows up after redirects were followed,
// the error is delivered to the OnError callback and, unless Async mode is
// enabled, returned by Visit as well.
type AlreadyVisitedError struct {
	// Destination is the URL whose visit was attempted. After a redirect
	// it may differ from the URL that was passed to Visit.
	Destination *url.URL
}

// Error implements the error interface. The destination URL is rendered in
// quoted form, followed by " already visited".
func (e *AlreadyVisitedError) Error() string {
	const suffix = " already visited"
	return fmt.Sprintf("%q", e.Destination) + suffix
}
// htmlCallbackContainer pairs an OnHTML callback with the goquery selector
// it was registered for.
type htmlCallbackContainer struct {
Selector string
Function HTMLCallback
}
// xmlCallbackContainer pairs an OnXML callback with the xpath query it was
// registered for.
type xmlCallbackContainer struct {
Query string
Function XMLCallback
}
// cookieJarSerializer bundles a storage backend with a lock.
// NOTE(review): presumably an http.CookieJar implementation persisting
// cookies in store; its methods are not part of this section, confirm there.
type cookieJarSerializer struct {
store storage.Storage
lock *sync.RWMutex
}
// collectorCounter is the source of collector IDs: Init increments it
// atomically and stores the new value in Collector.ID.
var collectorCounter uint32
// The key type is unexported to prevent collisions with context keys defined in
// other packages.
type key int
// ProxyURLKey is the context key for the request proxy address.
// fetch reads the value stored under this key from the request context as a
// string and copies it to Request.ProxyURL.
const ProxyURLKey key = iota
// Sentinel errors returned by the Collector.
var (
// ErrForbiddenDomain is the error thrown if visiting
// a domain which is listed in DisallowedDomains or, when AllowedDomains
// is not empty, is missing from AllowedDomains
ErrForbiddenDomain = errors.New("Forbidden domain")
// ErrMissingURL is the error type for missing URL errors
ErrMissingURL = errors.New("Missing URL")
// ErrMaxDepth is the error type for exceeding max depth
ErrMaxDepth = errors.New("Max depth limit reached")
// ErrForbiddenURL is the error thrown if visiting
// a URL which matches one of the DisallowedURLFilters
ErrForbiddenURL = errors.New("ForbiddenURL")
// ErrNoURLFiltersMatch is the error thrown if visiting
// a URL which is not matched by any of the (non-empty) URLFilters
ErrNoURLFiltersMatch = errors.New("No URLFilters match")
// ErrRobotsTxtBlocked is the error type for robots.txt errors
ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
// ErrNoCookieJar is the error type for missing cookie jar
ErrNoCookieJar = errors.New("Cookie jar is not available")
// ErrNoPattern is the error type for LimitRules without patterns
ErrNoPattern = errors.New("No pattern defined in LimitRule")
// ErrEmptyProxyURL is the error type for empty Proxy URL list
ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
// ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer.
ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
// ErrQueueFull is the error returned when the queue is full
ErrQueueFull = errors.New("Queue MaxSize reached")
// ErrMaxRequests is the error returned when exceeding max requests
ErrMaxRequests = errors.New("Max Requests limit reached")
// ErrRetryBodyUnseekable is the error when retry with not seekable body
ErrRetryBodyUnseekable = errors.New("Retry Body Unseekable")
)
// envMap maps a setting name to the function that applies the setting's
// string value to a Collector.
//
// Boolean settings are interpreted with isYesString. Numeric settings that
// fail to parse are ignored and leave the Collector untouched.
// DISABLE_COOKIES drops the cookie jar whatever its value is, and
// FOLLOW_REDIRECTS only has an effect when its value is not a "yes" string.
// NOTE(review): presumably consumed by parseSettingsFromEnv, which is not
// part of this section — confirm how the keys map to environment variables.
var envMap = map[string]func(*Collector, string){
	"ALLOWED_DOMAINS": func(col *Collector, raw string) {
		col.AllowedDomains = strings.Split(raw, ",")
	},
	"CACHE_DIR": func(col *Collector, raw string) {
		col.CacheDir = raw
	},
	"DETECT_CHARSET": func(col *Collector, raw string) {
		col.DetectCharset = isYesString(raw)
	},
	"DISABLE_COOKIES": func(col *Collector, _ string) {
		col.backend.Client.Jar = nil
	},
	"DISALLOWED_DOMAINS": func(col *Collector, raw string) {
		col.DisallowedDomains = strings.Split(raw, ",")
	},
	"IGNORE_ROBOTSTXT": func(col *Collector, raw string) {
		col.IgnoreRobotsTxt = isYesString(raw)
	},
	"FOLLOW_REDIRECTS": func(col *Collector, raw string) {
		if isYesString(raw) {
			return
		}
		// Make the client hand back the redirect response itself
		// instead of following it.
		col.redirectHandler = func(*http.Request, []*http.Request) error {
			return http.ErrUseLastResponse
		}
	},
	"MAX_BODY_SIZE": func(col *Collector, raw string) {
		if n, err := strconv.Atoi(raw); err == nil {
			col.MaxBodySize = n
		}
	},
	"MAX_DEPTH": func(col *Collector, raw string) {
		if n, err := strconv.Atoi(raw); err == nil {
			col.MaxDepth = n
		}
	},
	"MAX_REQUESTS": func(col *Collector, raw string) {
		// Base 0: the number's prefix selects the base.
		if n, err := strconv.ParseUint(raw, 0, 32); err == nil {
			col.MaxRequests = uint32(n)
		}
	},
	"PARSE_HTTP_ERROR_RESPONSE": func(col *Collector, raw string) {
		col.ParseHTTPErrorResponse = isYesString(raw)
	},
	"TRACE_HTTP": func(col *Collector, raw string) {
		col.TraceHTTP = isYesString(raw)
	},
	"USER_AGENT": func(col *Collector, raw string) {
		col.UserAgent = raw
	},
}
// urlParser is the shared WHATWG URL parser; scrape runs every request URL
// through it before handing the result to net/url.
// NOTE(review): WithPercentEncodeSinglePercentSign presumably makes a lone
// "%" get percent-encoded rather than rejected — confirm against whatwg-url.
var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
// NewCollector returns a Collector that carries the defaults set by Init,
// then the given options applied in order, and finally whatever
// parseSettingsFromEnv changes. Because parseSettingsFromEnv runs last, its
// settings win over the options passed here.
func NewCollector(options ...CollectorOption) *Collector {
	c := new(Collector)
	c.Init()
	for i := range options {
		options[i](c)
	}
	c.parseSettingsFromEnv()
	return c
}
// UserAgent returns an option that replaces the User-Agent string the
// Collector sends with its requests.
func UserAgent(ua string) CollectorOption {
	return func(c *Collector) { c.UserAgent = ua }
}
// Headers returns an option that installs the given name/value pairs as the
// Collector's custom request headers. The header set is built when the
// option is applied, so every Collector receives its own http.Header.
func Headers(headers map[string]string) CollectorOption {
	return func(c *Collector) {
		custom := http.Header{}
		for name, val := range headers {
			custom.Add(name, val)
		}
		c.Headers = &custom
	}
}
// MaxDepth returns an option that caps the recursion depth of visited URLs.
// A depth of 0 disables the limit.
func MaxDepth(depth int) CollectorOption {
	return func(c *Collector) { c.MaxDepth = depth }
}
// MaxRequests returns an option that caps the number of requests the
// Collector performs. Pass 0 for an unlimited number of requests (default).
func MaxRequests(max uint32) CollectorOption {
	return func(c *Collector) { c.MaxRequests = max }
}
// AllowedDomains returns an option that sets the Collector's domain
// whitelist. The slice is stored as passed, without copying.
func AllowedDomains(domains ...string) CollectorOption {
	return func(c *Collector) { c.AllowedDomains = domains }
}
// ParseHTTPErrorResponse returns an option that makes the Collector parse
// responses that carry an HTTP error status as well.
func ParseHTTPErrorResponse() CollectorOption {
	return func(c *Collector) { c.ParseHTTPErrorResponse = true }
}
// DisallowedDomains returns an option that sets the Collector's domain
// blacklist. The slice is stored as passed, without copying.
func DisallowedDomains(domains ...string) CollectorOption {
	return func(c *Collector) { c.DisallowedDomains = domains }
}
// DisallowedURLFilters returns an option that sets the regular expressions
// used to reject URLs: a request is stopped as soon as its URL matches any
// of them.
func DisallowedURLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(c *Collector) { c.DisallowedURLFilters = filters }
}
// URLFilters returns an option that sets the regular expressions used to
// admit URLs: with a non-empty list, a request is only let through when its
// URL matches one of them.
func URLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(c *Collector) { c.URLFilters = filters }
}
// AllowURLRevisit returns an option that lets the Collector download the
// same URL more than once.
func AllowURLRevisit() CollectorOption {
	return func(c *Collector) { c.AllowURLRevisit = true }
}
// MaxBodySize returns an option that limits, in bytes, how much of a
// response body is retrieved.
func MaxBodySize(sizeInBytes int) CollectorOption {
	return func(c *Collector) { c.MaxBodySize = sizeInBytes }
}
// CacheDir returns an option that sets the directory in which GET requests
// are cached as files.
func CacheDir(path string) CollectorOption {
	return func(c *Collector) { c.CacheDir = path }
}
// IgnoreRobotsTxt returns an option that makes the Collector disregard the
// restrictions of the target host's robots.txt file. Init already enables
// this by default.
func IgnoreRobotsTxt() CollectorOption {
	return func(c *Collector) { c.IgnoreRobotsTxt = true }
}
// TraceHTTP returns an option that makes the Collector gather request trace
// data and expose it through Response.Trace.
func TraceHTTP() CollectorOption {
	return func(c *Collector) { c.TraceHTTP = true }
}
// StdlibContext returns an option that sets the context.Context attached to
// the Collector's HTTP requests. Use it to cancel scraping cleanly.
func StdlibContext(ctx context.Context) CollectorOption {
	return func(c *Collector) { c.Context = ctx }
}
// ID returns an option that overrides the Collector's unique identifier,
// replacing the one assigned by Init.
func ID(id uint32) CollectorOption {
	return func(c *Collector) { c.ID = id }
}
// Async returns an option that switches asynchronous network requests on.
// When arguments are given, the first one decides whether Async is enabled;
// without arguments it is enabled.
func Async(a ...bool) CollectorOption {
	return func(c *Collector) {
		enabled := true
		if len(a) > 0 {
			enabled = a[0]
		}
		c.Async = enabled
	}
}
// DetectCharset returns an option that turns on character encoding
// detection for non-utf8 response bodies that do not declare a charset.
// This feature uses https://github.com/saintfish/chardet
func DetectCharset() CollectorOption {
	return func(c *Collector) { c.DetectCharset = true }
}
// Debugger returns an option that initializes d and attaches it to the
// Collector. d.Init runs each time the option is applied.
func Debugger(d debug.Debugger) CollectorOption {
	attach := func(c *Collector) {
		d.Init()
		c.debugger = d
	}
	return attach
}
// CheckHead returns an option that makes Visit issue a HEAD request ahead of
// every GET to pre-validate the response.
func CheckHead() CollectorOption {
	return func(c *Collector) { c.CheckHead = true }
}
// Init resets the Collector to its default configuration and (re)creates
// its private state: in-memory visit storage, an HTTP backend with a fresh
// cookie jar, the wait group, the lock and the robots.txt cache. It also
// assigns the next collector ID.
func (c *Collector) Init() {
	// Request defaults.
	c.UserAgent = "colly - https://github.com/gocolly/colly/v2"
	c.Headers = nil
	c.MaxDepth, c.MaxRequests = 0, 0
	c.MaxBodySize = 10 * 1024 * 1024

	// Visit bookkeeping.
	c.store = &storage.InMemoryStorage{}
	c.store.Init()

	// HTTP backend. The error of cookiejar.New is dropped on purpose.
	// NOTE(review): cookiejar.New is not expected to fail for nil
	// options — confirm against the net/http/cookiejar documentation.
	c.backend = new(httpBackend)
	jar, _ := cookiejar.New(nil)
	c.backend.Init(jar)
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()

	// Synchronisation primitives and caches.
	c.wg = new(sync.WaitGroup)
	c.lock = new(sync.RWMutex)
	c.robotsMap = map[string]*robotstxt.RobotsData{}

	// Remaining defaults; robots.txt is ignored unless switched on.
	c.IgnoreRobotsTxt = true
	c.ID = atomic.AddUint32(&collectorCounter, 1)
	c.TraceHTTP = false
	c.Context = context.Background()
}
// Appengine swaps the Collector's backend http.Client for one provided by
// appengine/urlfetch. The cookie jar, redirect policy and timeout of the
// previous client are carried over to the new one.
// Use it when the scraper runs on Google App Engine. Example:
//
//	func startScraper(w http.ResponseWriter, r *http.Request) {
//		ctx := appengine.NewContext(r)
//		c := colly.NewCollector()
//		c.Appengine(ctx)
//		...
//		c.Visit("https://google.ca")
//	}
func (c *Collector) Appengine(ctx context.Context) {
	replacement := urlfetch.Client(ctx)
	previous := c.backend.Client
	replacement.Jar = previous.Jar
	replacement.CheckRedirect = previous.CheckRedirect
	replacement.Timeout = previous.Timeout
	c.backend.Client = replacement
}
// Visit starts the Collector's collecting job with a GET request to URL and
// runs the registered callbacks. When CheckHead is set, a HEAD request is
// sent first; if that fails its error is returned and the GET is skipped.
func (c *Collector) Visit(URL string) error {
	if !c.CheckHead {
		return c.scrape(URL, "GET", 1, nil, nil, nil, true)
	}
	if err := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); err != nil {
		return err
	}
	return c.scrape(URL, "GET", 1, nil, nil, nil, true)
}
// HasVisited reports whether the given URL has been visited. It delegates to
// checkHasVisited without any request data.
func (c *Collector) HasVisited(URL string) (bool, error) {
	visited, err := c.checkHasVisited(URL, nil)
	return visited, err
}
// HasPosted reports whether the given URL together with requestData has been
// visited. It is mostly useful to avoid repeating a POST with the same URL
// and body.
func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error) {
	posted, err := c.checkHasVisited(URL, requestData)
	return posted, err
}
// Head starts a collector job with a HEAD request to URL. Unlike Visit, it
// does not ask for the revisit check.
func (c *Collector) Head(URL string) error {
	const checkRevisit = false
	return c.scrape(URL, "HEAD", 1, nil, nil, nil, checkRevisit)
}
// Post starts a collector job with a POST request to URL; requestData is
// sent as form data. The registered callbacks are run as for Visit.
func (c *Collector) Post(URL string, requestData map[string]string) error {
	form := createFormReader(requestData)
	return c.scrape(URL, "POST", 1, form, nil, nil, true)
}
// PostRaw starts a collector job with a POST request to URL whose body is
// the raw bytes in requestData. The registered callbacks are run as for
// Visit.
func (c *Collector) PostRaw(URL string, requestData []byte) error {
	payload := bytes.NewReader(requestData)
	return c.scrape(URL, "POST", 1, payload, nil, nil, true)
}
// PostMultipart starts a collector job with a multipart/form-data POST
// request to URL, built from the raw binary values in requestData. It sends
// its own header set (Content-Type with a random boundary, plus the
// Collector's User-Agent) and runs the registered callbacks as for Visit.
func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()
	hdr := http.Header{
		"Content-Type": {"multipart/form-data; boundary=" + boundary},
		"User-Agent":   {c.UserAgent},
	}
	body := createMultipartReader(boundary, requestData)
	return c.scrape(URL, "POST", 1, body, nil, hdr, true)
}
// Request starts a collector job with a custom HTTP request for which the
// method, context, headers and request data can all be chosen.
// Pass nil for requestData, ctx or hdr when they are not needed.
// Valid methods:
//   - "GET"
//   - "HEAD"
//   - "POST"
//   - "PUT"
//   - "DELETE"
//   - "PATCH"
//   - "OPTIONS"
func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error {
	const initialDepth = 1
	return c.scrape(URL, method, initialDepth, requestData, ctx, hdr, true)
}
// SetDebugger initializes d and attaches it to the collector, replacing any
// debugger set before.
func (c *Collector) SetDebugger(d debug.Debugger) {
	// Initialize first so the collector only ever holds a ready debugger.
	d.Init()
	c.debugger = d
}
// UnmarshalRequest rebuilds a Request from its JSON-serialized form.
// The new Request is bound to this collector, gets a fresh ID from the
// collector's request counter and a new Context filled with the serialized
// key/value pairs. An error is returned when r is not valid JSON for a
// serialized request or when the stored URL cannot be parsed.
func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
	var sr serializableRequest
	if err := json.Unmarshal(r, &sr); err != nil {
		return nil, err
	}

	target, err := url.Parse(sr.URL)
	if err != nil {
		return nil, err
	}

	reqCtx := NewContext()
	for name, value := range sr.Ctx {
		reqCtx.Put(name, value)
	}

	restored := &Request{
		Method:    sr.Method,
		URL:       target,
		Depth:     sr.Depth,
		Body:      bytes.NewReader(sr.Body),
		Ctx:       reqCtx,
		ID:        atomic.AddUint32(&c.requestCount, 1),
		Headers:   &sr.Headers,
		collector: c,
	}
	return restored, nil
}
// scrape validates a single request and dispatches it to fetch.
//
// u is parsed with the WHATWG parser and then with net/url. A nil hdr is
// replaced by a copy of c.Headers, a User-Agent is added when missing, a
// seekable requestData is rewound, and the http.Request is bound to
// c.Context. requestCheck then decides whether the request may be sent.
//
// depth is the recursion depth that requestCheck compares with MaxDepth;
// checkRevisit enables the already-visited check. With c.Async set, fetch
// runs in a new goroutine and scrape returns nil immediately; otherwise the
// error of fetch is returned.
func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
parsedWhatwgURL, err := urlParser.Parse(u)
if err != nil {
return err
}
// Re-parse the serialized WHATWG URL to obtain the *url.URL used from here on.
parsedURL, err := url.Parse(parsedWhatwgURL.Href(false))
if err != nil {
return err
}
// Collector-wide headers only apply when the caller passed no header set.
if hdr == nil {
hdr = http.Header{}
if c.Headers != nil {
for k, v := range *c.Headers {
for _, value := range v {
hdr.Add(k, value)
}
}
}
}
// Direct map lookup, so only the exact key "User-Agent" counts as present.
// Note that a header set supplied by the caller is modified in place here.
if _, ok := hdr["User-Agent"]; !ok {
hdr.Set("User-Agent", c.UserAgent)
}
// Rewind a seekable body so that it is sent from its first byte.
if seeker, ok := requestData.(io.ReadSeeker); ok {
_, err := seeker.Seek(0, io.SeekStart)
if err != nil {
return err
}
}
req, err := http.NewRequest(method, parsedURL.String(), requestData)
if err != nil {
return err
}
req.Header = hdr
// The Go HTTP API ignores "Host" in the headers, preferring the client
// to use the Host field on Request.
if hostHeader := hdr.Get("Host"); hostHeader != "" {
req.Host = hostHeader
}
// note: once 1.13 is minimum supported Go version,
// replace this with http.NewRequestWithContext
req = req.WithContext(c.Context)
// req.GetBody may be nil; requestCheck treats that case specially.
if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil {
return err
}
u = parsedURL.String()
// Balanced by the deferred c.wg.Done() at the top of fetch.
c.wg.Add(1)
if c.Async {
go c.fetch(u, method, depth, requestData, ctx, hdr, req)
return nil
}
return c.fetch(u, method, depth, requestData, ctx, hdr, req)
}
// fetch performs the HTTP transfer for a request prepared by scrape and runs
// the registered callbacks on it, in this order: OnRequest,
// OnResponseHeaders (while the response is being received), OnError if the
// transfer failed, then OnResponse, OnHTML, OnXML and OnScraped.
//
// It releases the WaitGroup slot taken by scrape when it returns. A nil ctx
// is replaced by a new Context. It returns nil when an OnRequest callback
// aborted the request.
func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
defer c.wg.Done()
if ctx == nil {
ctx = NewContext()
}
// The request ID is taken from the collector-wide atomic request counter.
request := &Request{
URL: req.URL,
Headers: &req.Header,
Host: req.Host,
Ctx: ctx,
Depth: depth,
Method: method,
Body: requestData,
collector: c,
ID: atomic.AddUint32(&c.requestCount, 1),
}
if req.Header.Get("Accept") == "" {
req.Header.Set("Accept", "*/*")
}
c.handleOnRequest(request)
if request.abort {
return nil
}
// Applied after the OnRequest callbacks, so a Content-Type set by a callback wins.
if method == "POST" && req.Header.Get("Content-Type") == "" {
req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
}
var hTrace *HTTPTrace
if c.TraceHTTP {
hTrace = &HTTPTrace{}
req = hTrace.WithTrace(req)
}
origURL := req.URL
// checkHeadersFunc is handed to the backend and runs the OnResponseHeaders
// callbacks; returning false tells the backend that the request was aborted.
checkHeadersFunc := func(req *http.Request, statusCode int, headers http.Header) bool {
// Pointer comparison: a different *url.URL means the backend reports on
// another request than the original one (presumably after a redirect),
// so the Request is updated to describe it.
if req.URL != origURL {
request.URL = req.URL
request.Headers = &req.Header
}
c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers})
return !request.abort
}
response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir)
if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok {
request.ProxyURL = proxyURL
}
// handleOnError receives the transfer result; a non-nil return ends the fetch.
if err := c.handleOnError(response, err, request, ctx); err != nil {
return err
}
atomic.AddUint32(&c.responseCount, 1)
response.Ctx = ctx
response.Request = request
response.Trace = hTrace
err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding)
if err != nil {
return err
}
c.handleOnResponse(response)
err = c.handleOnHTML(response)
if err != nil {
c.handleOnError(response, err, request, ctx)
}
// NOTE(review): err is overwritten here, so an error from handleOnHTML is
// reported through handleOnError above but never returned; only the result
// of handleOnXML reaches the caller. Confirm this is intended.
err = c.handleOnXML(response)
if err != nil {
c.handleOnError(response, err, request, ctx)
}
c.handleOnScraped(response)
return err
}
// requestCheck decides whether a request may be sent.
//
// The checks run in this order and the first failure is returned:
//   - depth against MaxDepth (ErrMaxDepth),
//   - the number of requests made so far against MaxRequests (ErrMaxRequests),
//   - URL and domain filters (see checkFilters),
//   - robots.txt, unless the method is HEAD or IgnoreRobotsTxt is set,
//   - the revisit check, when checkRevisit is true and AllowURLRevisit is
//     not set (*AlreadyVisitedError, or an error from the storage).
//
// getBody, when non-nil, supplies a fresh copy of the request body so that it
// can be included in the request hash. A request that passes the revisit
// check is recorded as visited before the function returns.
func (c *Collector) requestCheck(parsedURL *url.URL, method string, getBody func() (io.ReadCloser, error), depth int, checkRevisit bool) error {
	u := parsedURL.String()
	if c.MaxDepth > 0 && c.MaxDepth < depth {
		return ErrMaxDepth
	}
	// requestCount is incremented with atomic.AddUint32, possibly from
	// concurrent fetch goroutines in Async mode, so it has to be read
	// atomically as well; a plain read here is a data race.
	if c.MaxRequests > 0 && atomic.LoadUint32(&c.requestCount) >= c.MaxRequests {
		return ErrMaxRequests
	}
	if err := c.checkFilters(u, parsedURL.Hostname()); err != nil {
		return err
	}
	if method != "HEAD" && !c.IgnoreRobotsTxt {
		if err := c.checkRobots(parsedURL); err != nil {
			return err
		}
	}
	if !checkRevisit || c.AllowURLRevisit {
		return nil
	}

	// TODO weird behaviour, it allows CheckHead to work correctly,
	// but it should probably better be solved with
	// "check-but-not-save" flag or something
	if method != "GET" && getBody == nil {
		return nil
	}

	var body io.ReadCloser
	if getBody != nil {
		var err error
		body, err = getBody()
		if err != nil {
			return err
		}
		defer body.Close()
	}

	uHash := requestHash(u, body)
	visited, err := c.store.IsVisited(uHash)
	if err != nil {
		return err
	}
	if visited {
		return &AlreadyVisitedError{parsedURL}
	}
	// NOTE(review): IsVisited followed by Visited is not one atomic step, so
	// two concurrent requests for the same URL may both pass — confirm
	// whether the storage is expected to handle that.
	return c.store.Visited(uHash)
}
// checkFilters applies the URL and domain rules to a request target.
// It returns ErrForbiddenURL when URL matches DisallowedURLFilters,
// ErrNoURLFiltersMatch when URLFilters is non-empty and URL does not match
// it, ErrForbiddenDomain when domain is rejected by isDomainAllowed, and nil
// otherwise. The rules are evaluated in that order.
func (c *Collector) checkFilters(URL, domain string) error {
	switch {
	case len(c.DisallowedURLFilters) > 0 && isMatchingFilter(c.DisallowedURLFilters, []byte(URL)):
		return ErrForbiddenURL
	case len(c.URLFilters) > 0 && !isMatchingFilter(c.URLFilters, []byte(URL)):
		return ErrNoURLFiltersMatch
	case !c.isDomainAllowed(domain):
		return ErrForbiddenDomain
	}
	return nil
}
// isDomainAllowed reports whether domain may be visited. A domain listed in
// DisallowedDomains is always rejected. Otherwise it is accepted when
// AllowedDomains is empty or contains it. Comparisons are exact string
// matches.
func (c *Collector) isDomainAllowed(domain string) bool {
	for i := range c.DisallowedDomains {
		if c.DisallowedDomains[i] == domain {
			return false
		}
	}
	// An empty (or nil) whitelist admits every domain that is not blacklisted.
	allowed := len(c.AllowedDomains) == 0
	for i := 0; !allowed && i < len(c.AllowedDomains); i++ {
		allowed = c.AllowedDomains[i] == domain
	}
	return allowed
}
// checkRobots checks u against the robots.txt rules of its host.
// The parsed robots.txt data is cached per host in robotsMap; on a cache
// miss the file is downloaded with the backend's HTTP client and stored.
// It returns ErrRobotsTxtBlocked when the group that applies to the
// Collector's UserAgent disallows the escaped path (plus the re-encoded
// query, if any). It returns nil when the URL is allowed or no group applies,
// and the underlying error when robots.txt cannot be fetched or parsed.
func (c *Collector) checkRobots(u *url.URL) error {
	c.lock.RLock()
	robot, cached := c.robotsMap[u.Host]
	c.lock.RUnlock()

	if !cached {
		// no robots file cached
		robotsURL := u.Scheme + "://" + u.Host + "/robots.txt"
		resp, err := c.backend.Client.Get(robotsURL)
		if err != nil {
			return err
		}
		defer resp.Body.Close()

		robot, err = robotstxt.FromResponse(resp)
		if err != nil {
			return err
		}
		c.lock.Lock()
		c.robotsMap[u.Host] = robot
		c.lock.Unlock()
	}

	group := robot.FindGroup(c.UserAgent)
	if group == nil {
		return nil
	}

	target := u.EscapedPath()
	if u.RawQuery != "" {
		target += "?" + u.Query().Encode()
	}
	if group.Test(target) {
		return nil
	}
	return ErrRobotsTxtBlocked
}
// String returns a one-line summary of the collector for debugging: the
// number of requests and responses so far and how many OnRequest, OnHTML,
// OnResponse and OnError callbacks are registered.
func (c *Collector) String() string {
	const summary = "Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d"

	requests := atomic.LoadUint32(&c.requestCount)
	responses := atomic.LoadUint32(&c.responseCount)
	return fmt.Sprintf(
		summary,
		requests,
		responses,
		len(c.requestCallbacks),
		len(c.htmlCallbacks),
		len(c.responseCallbacks),
		len(c.errorCallbacks),
	)
}
// Wait blocks until every collector job that has been started is finished.
// It is the way to wait for outstanding requests in Async mode.
func (c *Collector) Wait() {
	inFlight := c.wg
	inFlight.Wait()
}
// OnRequest registers f to be run on every request the Collector makes.
// Callbacks are kept in registration order.
func (c *Collector) OnRequest(f RequestCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()

	callbacks := c.requestCallbacks
	if callbacks == nil {
		callbacks = make([]RequestCallback, 0, 4)
	}
	c.requestCallbacks = append(callbacks, f)
}
// OnResponseHeaders registers f to be run on every response once its status
// and headers have arrived but before the body is read.
//
// As in OnRequest, Request.Abort can be called to abort the transfer. That
// is handy when, for instance, every hyperlink is followed but file
// downloads should be avoided.
//
// Be aware that using this will prevent HTTP/1.1 connection reuse, because
// closing the connection right away is the only way to abort a download.
// HTTP/2 is not affected, since a single stream inside the connection can
// be closed there.
func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()

	c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f)
}
// OnResponse registers f to be run on every response. Callbacks are kept in
// registration order.
func (c *Collector) OnResponse(f ResponseCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()

	callbacks := c.responseCallbacks
	if callbacks == nil {
		callbacks = make([]ResponseCallback, 0, 4)
	}
	c.responseCallbacks = append(callbacks, f)
}
// OnHTML registers f to be run on every HTML element matched by
// goquerySelector. The selector syntax is the one used by
// https://github.com/PuerkitoBio/goquery
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) {
	entry := &htmlCallbackContainer{Selector: goquerySelector, Function: f}

	c.lock.Lock()
	defer c.lock.Unlock()

	callbacks := c.htmlCallbacks
	if callbacks == nil {
		callbacks = make([]*htmlCallbackContainer, 0, 4)
	}
	c.htmlCallbacks = append(callbacks, entry)
}
// OnXML registers f to be executed for every XML element matched by
// xpathQuery. The query syntax is the one understood by
// https://github.com/antchfx/xmlquery
func (c *Collector) OnXML(xpathQuery string, f XMLCallback) {
	entry := &xmlCallbackContainer{
		Query:    xpathQuery,
		Function: f,
	}

	c.lock.Lock()
	defer c.lock.Unlock()

	if c.xmlCallbacks == nil {
		c.xmlCallbacks = make([]*xmlCallbackContainer, 0, 4)
	}
	c.xmlCallbacks = append(c.xmlCallbacks, entry)
}
// OnHTMLDetach deregisters the first HTML callback that was registered with
// goquerySelector. The callback is no longer executed once detached. Nothing
// happens when no callback uses that selector.
func (c *Collector) OnHTMLDetach(goquerySelector string) {
	c.lock.Lock()
	defer c.lock.Unlock()

	for idx, entry := range c.htmlCallbacks {
		if entry.Selector != goquerySelector {
			continue
		}
		// Only the first match is removed.
		c.htmlCallbacks = append(c.htmlCallbacks[:idx], c.htmlCallbacks[idx+1:]...)
		return
	}
}
// OnXMLDetach deregisters the first XML callback that was registered with
// xpathQuery. The callback is no longer executed once detached. Nothing
// happens when no callback uses that query.
func (c *Collector) OnXMLDetach(xpathQuery string) {
	c.lock.Lock()
	defer c.lock.Unlock()

	for idx, entry := range c.xmlCallbacks {
		if entry.Query != xpathQuery {
			continue
		}
		// Only the first match is removed.
		c.xmlCallbacks = append(c.xmlCallbacks[:idx], c.xmlCallbacks[idx+1:]...)
		return
	}
}
// OnError registers f as an error callback. Registered callbacks are
// executed when an error occurs during the HTTP request.
func (c *Collector) OnError(f ErrorCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()

	callbacks := c.errorCallbacks
	if callbacks == nil {
		callbacks = make([]ErrorCallback, 0, 4)
	}
	c.errorCallbacks = append(callbacks, f)
}
// OnScraped registers f as a scraped callback. Registered callbacks are
// executed after OnHTML, as the final part of the scraping.
func (c *Collector) OnScraped(f ScrapedCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()

	callbacks := c.scrapedCallbacks
	if callbacks == nil {
		callbacks = make([]ScrapedCallback, 0, 4)
	}
	c.scrapedCallbacks = append(callbacks, f)
}
// SetClient replaces the http.Client used by the collector's backend with
// client.
func (c *Collector) SetClient(client *http.Client) {
	backend := c.backend
	backend.Client = client
}
// WithTransport installs transport as the http.RoundTripper of the
// collector's HTTP client.
func (c *Collector) WithTransport(transport http.RoundTripper) {
	client := c.backend.Client
	client.Transport = transport
}
// DisableCookies turns off cookie handling by removing the cookie jar from
// the collector's HTTP client.
func (c *Collector) DisableCookies() {
	client := c.backend.Client
	client.Jar = nil
}
// SetCookieJar replaces the cookie jar of the collector's HTTP client with j.
func (c *Collector) SetCookieJar(j http.CookieJar) {
	client := c.backend.Client
	client.Jar = j
}
// SetRequestTimeout sets the timeout of the collector's HTTP client,
// overriding the default (documented as 10 seconds) for this collector.
func (c *Collector) SetRequestTimeout(timeout time.Duration) {
	client := c.backend.Client
	client.Timeout = timeout
}
// SetStorage overrides the default in-memory storage with s.
// Storage holds scraping related data such as cookies and visited URLs.
//
// s is initialised first; if Init fails its error is returned and the
// collector is left unchanged. On success the HTTP client's cookie jar is
// replaced by one backed by s.
func (c *Collector) SetStorage(s storage.Storage) error {
	err := s.Init()
	if err != nil {
		return err
	}

	jar := createJar(s)
	c.store = s
	c.backend.Client.Jar = jar
	return nil
}
// SetProxy makes the collector send its requests through the proxy at
// proxyURL. It parses the URL and delegates to SetProxyFunc, so the same
// transport handling applies.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
//
// An error is returned, and nothing is changed, when proxyURL cannot be
// parsed.
func (c *Collector) SetProxy(proxyURL string) error {
	parsed, err := url.Parse(proxyURL)
	if err == nil {
		c.SetProxyFunc(http.ProxyURL(parsed))
	}
	return err
}
// SetProxyFunc sets a custom proxy setter/switcher function.
// See built-in ProxyFuncs for more details.
//
// When the HTTP client's current transport is a *http.Transport it is
// updated in place; otherwise the transport is replaced by a new
// http.Transport. In both cases keep-alives are disabled.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func (c *Collector) SetProxyFunc(p ProxyFunc) {
	client := c.backend.Client

	// The assertion fails for a nil transport, so no separate nil check is
	// required.
	if transport, ok := client.Transport.(*http.Transport); ok {
		transport.Proxy = p
		transport.DisableKeepAlives = true
		return
	}

	client.Transport = &http.Transport{
		Proxy:             p,
		DisableKeepAlives: true,
	}
}
// createEvent builds a debug.Event of the given type for the request and
// collector IDs, carrying kvargs as its values.
func createEvent(eventType string, requestID, collectorID uint32, kvargs map[string]string) *debug.Event {
	event := new(debug.Event)
	event.Type = eventType
	event.RequestID = requestID
	event.CollectorID = collectorID
	event.Values = kvargs
	return event
}
// handleOnRequest emits a "request" debug event when a debugger is attached
// and then runs every registered request callback on r, in registration
// order.
func (c *Collector) handleOnRequest(r *Request) {
	if dbg := c.debugger; dbg != nil {
		values := map[string]string{
			"url": r.URL.String(),
		}
		dbg.Event(createEvent("request", r.ID, c.ID, values))
	}

	for _, callback := range c.requestCallbacks {
		callback(r)
	}
}
// handleOnResponse emits a "response" debug event when a debugger is
// attached and then runs every registered response callback on r, in
// registration order.
func (c *Collector) handleOnResponse(r *Response) {
	if dbg := c.debugger; dbg != nil {
		values := map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}
		dbg.Event(createEvent("response", r.Request.ID, c.ID, values))
	}

	for _, callback := range c.responseCallbacks {
		callback(r)
	}
}
// handleOnResponseHeaders emits a "responseHeaders" debug event when a
// debugger is attached and then runs every registered response-headers
// callback on r, in registration order.
func (c *Collector) handleOnResponseHeaders(r *Response) {
	if dbg := c.debugger; dbg != nil {
		values := map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}
		dbg.Event(createEvent("responseHeaders", r.Request.ID, c.ID, values))
	}

	for _, callback := range c.responseHeadersCallbacks {
		callback(r)
	}
}
// handleOnHTML parses the response body as HTML and runs the registered HTML
// callbacks on every element their selectors match.
//
// Nothing is done when no HTML callback is registered or when the media type
// is neither "text/html" nor "application/xhtml+xml". The media type comes
// from the Content-Type header, or from sniffing the body when the header is
// absent. If the document carries a <base href>, the request's base URL is
// updated before the callbacks run. A parse failure is returned as the error.
func (c *Collector) handleOnHTML(resp *Response) error {
	if len(c.htmlCallbacks) == 0 {
		return nil
	}

	contentType := resp.Headers.Get("Content-Type")
	if contentType == "" {
		contentType = http.DetectContentType(resp.Body)
	}

	// implementation of mime.ParseMediaType without parsing the params
	// part
	mediatype, _, _ := strings.Cut(contentType, ";")
	mediatype = strings.ToLower(mediatype)
	mediatype = strings.TrimSpace(mediatype)

	// TODO we also want to parse application/xml as XHTML if it has
	// appropriate doctype
	if mediatype != "text/html" && mediatype != "application/xhtml+xml" {
		return nil
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(resp.Body))
	if err != nil {
		return err
	}

	// Honour <base href>; failures to resolve it are ignored on purpose and
	// leave the base URL untouched.
	if href, ok := doc.Find("base[href]").Attr("href"); ok {
		if ref, err := urlParser.ParseRef(resp.Request.URL.String(), href); err == nil {
			if baseURL, err := url.Parse(ref.Href(false)); err == nil {
				resp.Request.baseURL = baseURL
			}
		}
	}

	for _, cc := range c.htmlCallbacks {
		// index counts the matched nodes per callback across selections.
		index := 0
		doc.Find(cc.Selector).Each(func(_ int, sel *goquery.Selection) {
			for _, node := range sel.Nodes {
				element := NewHTMLElementFromSelectionNode(resp, sel, node, index)
				index++
				if c.debugger != nil {
					values := map[string]string{
						"selector": cc.Selector,
						"url":      resp.Request.URL.String(),
					}
					c.debugger.Event(createEvent("html", resp.Request.ID, c.ID, values))
				}
				cc.Function(element)
			}
		})
	}
	return nil
}
// handleOnXML runs the registered XML callbacks on every node matched by
// their XPath queries.
//
// A response whose Content-Type mentions "html" is parsed with htmlquery
// (and a <base href>, if present, updates the request's base URL). Otherwise
// the body is parsed with xmlquery when the Content-Type mentions "xml" or
// the URL path ends in ".xml" or ".xml.gz". Any other response, or a
// collector without XML callbacks, is skipped. A parse failure is returned
// as the error.
func (c *Collector) handleOnXML(resp *Response) error {
	if len(c.xmlCallbacks) == 0 {
		return nil
	}

	contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
	urlPath := strings.ToLower(resp.Request.URL.Path)
	isXMLFile := strings.HasSuffix(urlPath, ".xml") || strings.HasSuffix(urlPath, ".xml.gz")
	isHTML := strings.Contains(contentType, "html")
	isXML := strings.Contains(contentType, "xml") || isXMLFile
	if !isHTML && !isXML {
		return nil
	}

	// reportMatch emits the "xml" debug event for one matched node.
	reportMatch := func(query string) {
		if c.debugger == nil {
			return
		}
		c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
			"selector": query,
			"url":      resp.Request.URL.String(),
		}))
	}

	// HTML takes precedence, so "application/xhtml+xml" lands here as well.
	if isHTML {
		doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body))
		if err != nil {
			return err
		}
		if base := htmlquery.FindOne(doc, "//base"); base != nil {
			for _, attr := range base.Attr {
				if attr.Key != "href" {
					continue
				}
				if baseURL, err := resp.Request.URL.Parse(attr.Val); err == nil {
					resp.Request.baseURL = baseURL
				}
				break
			}
		}
		for _, cc := range c.xmlCallbacks {
			for _, node := range htmlquery.Find(doc, cc.Query) {
				element := NewXMLElementFromHTMLNode(resp, node)
				reportMatch(cc.Query)
				cc.Function(element)
			}
		}
		return nil
	}

	doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
	if err != nil {
		return err
	}
	for _, cc := range c.xmlCallbacks {
		xmlquery.FindEach(doc, cc.Query, func(_ int, node *xmlquery.Node) {
			element := NewXMLElementFromXMLNode(resp, node)
			reportMatch(cc.Query)
			cc.Function(element)
		})
	}
	return nil
}
// handleOnError decides whether a request outcome counts as an error and, if
// so, reports it to the debugger and to every registered error callback.
//
// With a nil err the outcome is accepted (nil is returned) when
// ParseHTTPErrorResponse is set or the status code is below 203; otherwise
// an error is built from the status text. A nil response is replaced by an
// empty one tied to request and ctx, and a response lacking Request or Ctx
// has them filled in from request. The resulting error is returned.
func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error {
	if err == nil {
		if c.ParseHTTPErrorResponse || response.StatusCode < 203 {
			return nil
		}
		// Reaching this point implies a status code of 203 or above.
		err = errors.New(http.StatusText(response.StatusCode))
	}

	if response == nil {
		response = &Response{
			Request: request,
			Ctx:     ctx,
		}
	}

	if dbg := c.debugger; dbg != nil {
		values := map[string]string{
			"url":    request.URL.String(),
			"status": http.StatusText(response.StatusCode),
		}
		dbg.Event(createEvent("error", request.ID, c.ID, values))
	}

	if response.Request == nil {
		response.Request = request
	}
	if response.Ctx == nil {
		response.Ctx = request.Ctx
	}

	for _, callback := range c.errorCallbacks {
		callback(response, err)
	}
	return err
}
// handleOnScraped emits a "scraped" debug event when a debugger is attached
// and then runs every registered scraped callback on r, in registration
// order.
func (c *Collector) handleOnScraped(r *Response) {
	if dbg := c.debugger; dbg != nil {
		values := map[string]string{
			"url": r.Request.URL.String(),
		}
		dbg.Event(createEvent("scraped", r.Request.ID, c.ID, values))
	}

	for _, callback := range c.scrapedCallbacks {
		callback(r)
	}
}
// Limit adds a new LimitRule to the collector by forwarding rule to the
// backend and returning the backend's result.
func (c *Collector) Limit(rule *LimitRule) error {
	err := c.backend.Limit(rule)
	return err
}
// Limits adds new LimitRules to the collector by forwarding rules to the
// backend and returning the backend's result.
func (c *Collector) Limits(rules []*LimitRule) error {
	err := c.backend.Limits(rules)
	return err
}
// SetRedirectHandler installs f as the collector's custom redirect handler
// and re-registers the collector's redirect check on the HTTP client.
// f receives the upcoming request and the requests made so far; it is
// consulted after the collector's own filter and revisit checks have passed.
func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error) {
	c.redirectHandler = f

	client := c.backend.Client
	client.CheckRedirect = c.checkRedirectFunc()
}
// SetCookies handles the receipt of the cookies in a reply for the given URL.
// It returns ErrNoCookieJar when the HTTP client has no cookie jar, or the
// parse error when URL is not a valid URL.
func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error {
	jar := c.backend.Client.Jar
	if jar == nil {
		return ErrNoCookieJar
	}

	parsed, err := url.Parse(URL)
	if err != nil {
		return err
	}

	jar.SetCookies(parsed, cookies)
	return nil
}
// Cookies returns the cookies to send in a request for the given URL.
// The result is nil when the HTTP client has no cookie jar or when URL
// cannot be parsed.
func (c *Collector) Cookies(URL string) []*http.Cookie {
	jar := c.backend.Client.Jar
	if jar == nil {
		return nil
	}

	parsed, err := url.Parse(URL)
	if err != nil {
		return nil
	}
	return jar.Cookies(parsed)
}
// Clone creates a copy of a Collector without its callbacks.
//
// The clone receives a new ID and its own wait group. Settings are copied
// from the receiver, while the HTTP backend, storage, debugger, lock and
// robots.txt map are taken over as-is, so both collectors refer to the same
// values. The error, HTML, XML, scraped, request and response callback lists
// start out empty; response-headers callbacks are not carried over either.
func (c *Collector) Clone() *Collector {
	clone := &Collector{
		// Owned by the clone alone.
		ID: atomic.AddUint32(&collectorCounter, 1),
		wg: &sync.WaitGroup{},

		// Settings copied from the receiver.
		AllowedDomains:         c.AllowedDomains,
		DisallowedDomains:      c.DisallowedDomains,
		URLFilters:             c.URLFilters,
		DisallowedURLFilters:   c.DisallowedURLFilters,
		AllowURLRevisit:        c.AllowURLRevisit,
		Async:                  c.Async,
		CacheDir:               c.CacheDir,
		CheckHead:              c.CheckHead,
		Context:                c.Context,
		DetectCharset:          c.DetectCharset,
		Headers:                c.Headers,
		IgnoreRobotsTxt:        c.IgnoreRobotsTxt,
		MaxBodySize:            c.MaxBodySize,
		MaxDepth:               c.MaxDepth,
		MaxRequests:            c.MaxRequests,
		ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
		TraceHTTP:              c.TraceHTTP,
		UserAgent:              c.UserAgent,
		redirectHandler:        c.redirectHandler,

		// Taken over from the receiver unchanged.
		backend:   c.backend,
		debugger:  c.debugger,
		lock:      c.lock,
		robotsMap: c.robotsMap,
		store:     c.store,

		// Callback lists start out empty.
		errorCallbacks:    make([]ErrorCallback, 0, 8),
		htmlCallbacks:     make([]*htmlCallbackContainer, 0, 8),
		requestCallbacks:  make([]RequestCallback, 0, 8),
		responseCallbacks: make([]ResponseCallback, 0, 8),
		scrapedCallbacks:  make([]ScrapedCallback, 0, 8),
		xmlCallbacks:      make([]*xmlCallbackContainer, 0, 8),
	}
	return clone
}
// checkRedirectFunc returns the function installed as the HTTP client's
// CheckRedirect hook. For every redirect it:
//
//  1. rejects targets that fail the collector's URL and domain filters;
//  2. unless AllowURLRevisit is set or the redirect points back at the
//     original URL, refuses targets that were already visited and marks new
//     ones as visited;
//  3. hands over to the custom redirect handler when one is set;
//  4. otherwise stops after 10 redirects and drops the Authorization header
//     when the host changes.
func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error {
	return func(req *http.Request, via []*http.Request) error {
		target := req.URL.String()
		if err := c.checkFilters(target, req.URL.Hostname()); err != nil {
			return fmt.Errorf("Not following redirect to %q: %w", req.URL, err)
		}

		// allow redirects to the original destination
		// to support websites redirecting to the same page while setting
		// session cookies
		samePageRedirect := normalizeURL(target) == normalizeURL(via[0].URL.String())

		if !(c.AllowURLRevisit || samePageRedirect) {
			var body io.ReadCloser
			if req.GetBody != nil {
				var err error
				if body, err = req.GetBody(); err != nil {
					return err
				}
				defer body.Close()
			}

			hash := requestHash(target, body)
			visited, err := c.store.IsVisited(hash)
			if err != nil {
				return err
			}
			if visited {
				return &AlreadyVisitedError{req.URL}
			}
			if err := c.store.Visited(hash); err != nil {
				return err
			}
		}

		if c.redirectHandler != nil {
			return c.redirectHandler(req, via)
		}

		// Honor golangs default of maximum of 10 redirects
		if len(via) >= 10 {
			return http.ErrUseLastResponse
		}

		// If domain has changed, remove the Authorization-header if it exists
		if previous := via[len(via)-1]; req.URL.Host != previous.URL.Host {
			req.Header.Del("Authorization")
		}
		return nil
	}
}
// parseSettingsFromEnv applies collector settings found in the process
// environment. Every variable whose name starts with "COLLY_" is looked up,
// without that prefix, in envMap and the matching setter is called with the
// variable's value. Unknown names are logged and otherwise ignored.
//
// An entry that lacks a "=" separator is treated as having an empty value
// rather than causing an out-of-range panic.
func (c *Collector) parseSettingsFromEnv() {
	const envPrefix = "COLLY_"

	for _, e := range os.Environ() {
		if !strings.HasPrefix(e, envPrefix) {
			continue
		}
		// Cut yields an empty value when the separator is missing, so a
		// malformed entry can never index past the end of a slice.
		name, value, _ := strings.Cut(e[len(envPrefix):], "=")
		if f, ok := envMap[name]; ok {
			f(c, value)
		} else {
			log.Println("Unknown environment variable:", name)
		}
	}
}
// checkHasVisited reports whether the storage already holds the hash of a
// request to URL whose body is the form encoding of requestData.
func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) {
	body := createFormReader(requestData)
	return c.store.IsVisited(requestHash(URL, body))
}
// SanitizeFileName replaces dangerous characters in a string
// so the return value can be used as a safe file name.
//
// The extension and the remainder of the name are sanitized separately and
// rejoined with a dot; a missing extension becomes "unknown". Every "-" in
// the result is replaced by "_".
func SanitizeFileName(fileName string) string {
	ext := filepath.Ext(fileName)

	cleanExt := sanitize.BaseName(ext)
	if cleanExt == "" {
		cleanExt = ".unknown"
	}

	// ext is always a suffix of fileName, so trimming it leaves the stem.
	stem := sanitize.BaseName(strings.TrimSuffix(fileName, ext))

	// The first character of cleanExt stands in for the leading dot.
	joined := fmt.Sprintf("%s.%s", stem, cleanExt[1:])
	return strings.ReplaceAll(joined, "-", "_")
}
// createFormReader returns a reader over the URL-encoded form
// ("application/x-www-form-urlencoded" style) representation of data.
// A nil or empty map yields an empty reader.
func createFormReader(data map[string]string) io.Reader {
	form := make(url.Values, len(data))
	for key, value := range data {
		form.Add(key, value)
	}
	encoded := form.Encode()
	return strings.NewReader(encoded)
}
// createMultipartReader returns a reader over a multipart-style body built
// from data, using boundary as the part delimiter. The body starts with a
// Content-type line, followed by one part per map entry (its name, its
// content length and the raw content), and ends with the closing delimiter.
// Parts appear in map iteration order.
func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
	delimiter := "--" + boundary

	var buf bytes.Buffer
	buf.WriteString("Content-type: multipart/form-data; boundary=" + boundary + "\n\n")
	for name, content := range data {
		buf.WriteString(delimiter + "\n")
		buf.WriteString("Content-Disposition: form-data; name=" + name + "\n")
		fmt.Fprintf(&buf, "Content-Length: %d \n\n", len(content))
		buf.Write(content)
		buf.WriteString("\n")
	}
	buf.WriteString(delimiter + "--\n\n")

	return bytes.NewReader(buf.Bytes())
}
// randomBoundary returns 30 random bytes rendered as a 60 character
// lowercase hexadecimal string. It panics when the random source cannot be
// read.
//
// Borrowed from
// github.com/golang/go/mime/multipart/writer.go#randomBoundary
func randomBoundary() string {
	raw := make([]byte, 30)
	if _, err := io.ReadFull(rand.Reader, raw); err != nil {
		panic(err)
	}
	return fmt.Sprintf("%x", raw)
}
// isYesString reports whether s, compared case-insensitively, is one of the
// affirmative values "1", "yes", "true" or "y".
func isYesString(s string) bool {
	lowered := strings.ToLower(s)
	return lowered == "1" || lowered == "yes" || lowered == "true" || lowered == "y"
}
// createJar wraps s in a cookieJarSerializer, guarded by its own read/write
// mutex, and returns it as an http.CookieJar.
func createJar(s storage.Storage) http.CookieJar {
	jar := &cookieJarSerializer{
		store: s,
		lock:  &sync.RWMutex{},
	}
	return jar
}
// SetCookies stores cookies for u in the underlying storage, merged with the
// cookies already stored for u. When a new and a stored cookie share a name,
// the new one wins. The whole read-merge-write sequence runs under the
// jar's write lock.
func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) {
	j.lock.Lock()
	defer j.lock.Unlock()

	stored := storage.UnstringifyCookies(j.store.Cookies(u))

	// Work on a copy so the caller's slice is never modified.
	merged := make([]*http.Cookie, len(cookies))
	copy(merged, cookies)

	for _, old := range stored {
		if storage.ContainsCookie(merged, old.Name) {
			continue
		}
		merged = append(merged, old)
	}

	j.store.SetCookies(u, storage.StringifyCookies(merged))
}
// Cookies returns the cookies stored for u, leaving out those that have
// expired and those marked Secure when u's scheme is not "https".
func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie {
	stored := storage.UnstringifyCookies(j.store.Cookies(u))
	now := time.Now()

	kept := make([]*http.Cookie, 0, len(stored))
	for _, cookie := range stored {
		// A cookie without an expiry string never counts as expired.
		expired := cookie.RawExpires != "" && cookie.Expires.Before(now)
		// Secure cookies are only sent over https.
		insecure := cookie.Secure && u.Scheme != "https"
		if expired || insecure {
			continue
		}
		kept = append(kept, cookie)
	}
	return kept
}
// isMatchingFilter reports whether at least one of the regular expressions
// in fs matches d. It returns false for an empty filter list.
func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
	for i := 0; i < len(fs); i++ {
		if fs[i].Match(d) {
			return true
		}
	}
	return false
}
// normalizeURL returns the string form of u after a round trip through the
// URL parser. When u cannot be parsed it is returned unchanged.
func normalizeURL(u string) string {
	if parsed, err := urlParser.Parse(u); err == nil {
		return parsed.String()
	}
	return u
}
// requestHash returns the 64-bit FNV-1a hash of the normalized url followed
// by the contents of body. A nil body contributes nothing to the hash.
func requestHash(url string, body io.Reader) uint64 {
	hasher := fnv.New64a()

	// reparse the url to fix ambiguities such as
	// "http://example.com" vs "http://example.com/"
	io.WriteString(hasher, normalizeURL(url))

	if body == nil {
		return hasher.Sum64()
	}
	io.Copy(hasher, body)
	return hasher.Sum64()
}
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/wiseai/colly.git
git@gitee.com:wiseai/colly.git
wiseai
colly
goalng爬虫框架colly
master

搜索帮助