6 Star 12 Fork 3

Gitee 极速下载/Colly

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
此仓库是为了提升国内下载速度的镜像仓库,每日同步一次。 原始仓库: https://github.com/gocolly/colly
克隆/下载
colly_test.go 40.96 KB
一键复制 编辑 原始数据 按行查看 历史
WGH 提交于 2024-03-26 00:30 . Implement content sniffing for HTML parsing
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"os"
"reflect"
"regexp"
"strings"
"testing"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2/debug"
)
// Fixtures served by the test HTTP server.
var (
	// serverIndexResponse is the body written by the "/" handler.
	serverIndexResponse = []byte("hello world\n")

	// robotsFile is the document written by the "/robots.txt" handler.
	robotsFile = `
User-agent: *
Allow: /allowed
Disallow: /disallowed
Disallow: /allowed*q=
`
)
// newUnstartedTestServer builds an httptest.Server backed by a ServeMux that
// serves every fixture endpoint used by the tests in this file. The server is
// returned without being started, so a caller can still modify its
// configuration (for example wrap Config.Handler) before calling Start.
func newUnstartedTestServer() *httptest.Server {
mux := http.NewServeMux()
// "/": plain 200 response carrying serverIndexResponse.
mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write(serverIndexResponse)
})
// "/html": small HTML document with a title, a heading and two
// <p class="description"> elements. When the "no-content-type" query
// parameter is non-empty the Content-Type header entry is set to nil
// instead of "text/html"; TestCollectorContentSniffing expects the
// response to arrive without a Content-Type in that case.
mux.HandleFunc("/html", func(w http.ResponseWriter, r *http.Request) {
if r.URL.Query().Get("no-content-type") != "" {
w.Header()["Content-Type"] = nil
} else {
w.Header().Set("Content-Type", "text/html")
}
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<h1>Hello World</h1>
<p class="description">This is a test page</p>
<p class="description">This is a test paragraph</p>
</body>
</html>
`))
})
// "/xml": XML counterpart of the "/html" document, served as
// application/xml.
mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/xml")
w.Write([]byte(`<?xml version="1.0" encoding="UTF-8"?>
<page>
<title>Test Page</title>
<paragraph type="description">This is a test page</paragraph>
<paragraph type="description">This is a test paragraph</paragraph>
</page>
`))
})
// "/login": for POST requests, echoes the "name" form value back as the
// body. Other methods get an empty response.
mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
if r.Method == "POST" {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(r.FormValue("name")))
}
})
// "/robots.txt": serves robotsFile.
mux.HandleFunc("/robots.txt", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte(robotsFile))
})
// "/allowed" and "/disallowed": fixed bodies; the paths correspond to the
// Allow/Disallow rules in robotsFile.
mux.HandleFunc("/allowed", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte("allowed"))
})
mux.HandleFunc("/disallowed", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte("disallowed"))
})
// "/redirect": answers 303 See Other. The target is the "d" query
// parameter when it is non-empty, otherwise "/redirected/".
mux.Handle("/redirect", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
destination := "/redirected/"
if d := r.URL.Query().Get("d"); d != "" {
destination = d
}
http.Redirect(w, r, destination, http.StatusSeeOther)
}))
// "/redirected/": body is a single relative link, used to check URL
// resolution after a redirect.
mux.Handle("/redirected/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprintf(w, `<a href="test">test</a>`)
}))
// "/set_cookie": sets the cookie test=testv and replies "ok".
mux.HandleFunc("/set_cookie", func(w http.ResponseWriter, r *http.Request) {
c := &http.Cookie{Name: "test", Value: "testv", HttpOnly: false}
http.SetCookie(w, c)
w.WriteHeader(200)
w.Write([]byte("ok"))
})
// "/check_cookie": replies 500 "nok" unless the request carries exactly
// one cookie whose value is "testv"; otherwise replies 200 "ok".
mux.HandleFunc("/check_cookie", func(w http.ResponseWriter, r *http.Request) {
cs := r.Cookies()
if len(cs) != 1 || r.Cookies()[0].Value != "testv" {
w.WriteHeader(500)
w.Write([]byte("nok"))
return
}
w.WriteHeader(200)
w.Write([]byte("ok"))
})
// "/500": HTML body delivered with status 500.
mux.HandleFunc("/500", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.WriteHeader(500)
w.Write([]byte("<p>error</p>"))
})
// The next four endpoints echo a property of the incoming request as the
// response body: the User-Agent header, the Host, the Accept header and
// the custom "Test" header respectively.
mux.HandleFunc("/user_agent", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte(r.Header.Get("User-Agent")))
})
mux.HandleFunc("/host_header", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte(r.Host))
})
mux.HandleFunc("/accept_header", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte(r.Header.Get("Accept")))
})
mux.HandleFunc("/custom_header", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
w.Write([]byte(r.Header.Get("Test")))
})
// "/base": HTML page whose <base> element points at an absolute URL on a
// different host.
mux.HandleFunc("/base", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<base href="http://xy.com/" />
</head>
<body>
<a href="z">link</a>
</body>
</html>
`))
})
// "/base_relative": same page, but the <base> href is a relative path.
mux.HandleFunc("/base_relative", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<base href="/foobar/" />
</head>
<body>
<a href="z">link</a>
</body>
</html>
`))
})
// "/tabs_and_newlines": page whose <base> and <a> href attributes contain
// whitespace/newline characters; TestTabsAndNewlines expects following the
// link to end up at "/foobar/xy".
mux.HandleFunc("/tabs_and_newlines", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<base href="/foo bar/" />
</head>
<body>
<a href="x
y">link</a>
</body>
</html>
`))
})
// "/foobar/xy": landing page for the link served by "/tabs_and_newlines".
mux.HandleFunc("/foobar/xy", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html")
w.Write([]byte(`<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<p>hello</p>
</body>
</html>
`))
})
// Registered under the literal pattern "/100%25"; replies "100 percent".
mux.HandleFunc("/100%25", func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte("100 percent"))
})
// "/large_binary": endless application/octet-stream body made of 'A'
// bytes. The loop only ends when a write fails.
mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/octet-stream")
ww := bufio.NewWriter(w)
defer ww.Flush()
for {
// have to check error to detect client aborting download
if _, err := ww.Write([]byte{0x41}); err != nil {
return
}
}
})
// "/slow": writes one line containing the tick time every 100ms, flushing
// after each line when the writer supports it. The handler returns after
// ten lines or as soon as the request context is done.
mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
ticker := time.NewTicker(100 * time.Millisecond)
defer ticker.Stop()
i := 0
for {
select {
case <-r.Context().Done():
return
case t := <-ticker.C:
fmt.Fprintf(w, "%s\n", t)
if flusher, ok := w.(http.Flusher); ok {
flusher.Flush()
}
i++
if i == 10 {
return
}
}
}
})
return httptest.NewUnstartedServer(mux)
}
// newTestServer returns the fixture server from newUnstartedTestServer after
// starting it. The caller is responsible for closing it.
func newTestServer() *httptest.Server {
	ts := newUnstartedTestServer()
	ts.Start()
	return ts
}
// newCollectorTests maps the name of a collector option to a subtest that
// passes the option to NewCollector and checks the corresponding field on the
// returned collector. TestNewCollector runs every entry.
var newCollectorTests = map[string]func(*testing.T){
	"UserAgent": func(t *testing.T) {
		agents := []string{"foo", "bar"}
		for i := range agents {
			want := agents[i]
			col := NewCollector(UserAgent(want))
			if col.UserAgent != want {
				t.Fatalf("c.UserAgent = %q, want %q", col.UserAgent, want)
			}
		}
	},
	"MaxDepth": func(t *testing.T) {
		depths := []int{12, 34, 0}
		for i := range depths {
			want := depths[i]
			col := NewCollector(MaxDepth(want))
			if col.MaxDepth != want {
				t.Fatalf("c.MaxDepth = %d, want %d", col.MaxDepth, want)
			}
		}
	},
	"AllowedDomains": func(t *testing.T) {
		// Covers a populated list, a single entry, an empty slice and nil.
		domainSets := [][]string{
			{"example.com", "example.net"},
			{"example.net"},
			{},
			nil,
		}
		for _, want := range domainSets {
			col := NewCollector(AllowedDomains(want...))
			if !reflect.DeepEqual(col.AllowedDomains, want) {
				t.Fatalf("c.AllowedDomains = %q, want %q", col.AllowedDomains, want)
			}
		}
	},
	"DisallowedDomains": func(t *testing.T) {
		domainSets := [][]string{
			{"example.com", "example.net"},
			{"example.net"},
			{},
			nil,
		}
		for _, want := range domainSets {
			col := NewCollector(DisallowedDomains(want...))
			if !reflect.DeepEqual(col.DisallowedDomains, want) {
				t.Fatalf("c.DisallowedDomains = %q, want %q", col.DisallowedDomains, want)
			}
		}
	},
	"DisallowedURLFilters": func(t *testing.T) {
		filterSets := [][]*regexp.Regexp{
			{regexp.MustCompile(`.*not_allowed.*`)},
		}
		for _, want := range filterSets {
			col := NewCollector(DisallowedURLFilters(want...))
			if !reflect.DeepEqual(col.DisallowedURLFilters, want) {
				t.Fatalf("c.DisallowedURLFilters = %v, want %v", col.DisallowedURLFilters, want)
			}
		}
	},
	"URLFilters": func(t *testing.T) {
		filterSets := [][]*regexp.Regexp{
			{regexp.MustCompile(`\w+`)},
			{regexp.MustCompile(`\d+`)},
			{},
			nil,
		}
		for _, want := range filterSets {
			col := NewCollector(URLFilters(want...))
			if !reflect.DeepEqual(col.URLFilters, want) {
				t.Fatalf("c.URLFilters = %v, want %v", col.URLFilters, want)
			}
		}
	},
	"AllowURLRevisit": func(t *testing.T) {
		if col := NewCollector(AllowURLRevisit()); !col.AllowURLRevisit {
			t.Fatal("c.AllowURLRevisit = false, want true")
		}
	},
	"MaxBodySize": func(t *testing.T) {
		sizes := []int{1024 * 1024, 1024, 0}
		for i := range sizes {
			want := sizes[i]
			col := NewCollector(MaxBodySize(want))
			if col.MaxBodySize != want {
				t.Fatalf("c.MaxBodySize = %d, want %d", col.MaxBodySize, want)
			}
		}
	},
	"CacheDir": func(t *testing.T) {
		dirs := []string{"/tmp/", "/var/cache/"}
		for i := range dirs {
			want := dirs[i]
			col := NewCollector(CacheDir(want))
			if col.CacheDir != want {
				t.Fatalf("c.CacheDir = %q, want %q", col.CacheDir, want)
			}
		}
	},
	"IgnoreRobotsTxt": func(t *testing.T) {
		if col := NewCollector(IgnoreRobotsTxt()); !col.IgnoreRobotsTxt {
			t.Fatal("c.IgnoreRobotsTxt = false, want true")
		}
	},
	"ID": func(t *testing.T) {
		ids := []uint32{0, 1, 2}
		for i := range ids {
			want := ids[i]
			col := NewCollector(ID(want))
			if col.ID != want {
				t.Fatalf("c.ID = %d, want %d", col.ID, want)
			}
		}
	},
	"DetectCharset": func(t *testing.T) {
		if col := NewCollector(DetectCharset()); !col.DetectCharset {
			t.Fatal("c.DetectCharset = false, want true")
		}
	},
	"Debugger": func(t *testing.T) {
		dbg := &debug.LogDebugger{}
		col := NewCollector(Debugger(dbg))
		if col.debugger != dbg {
			t.Fatalf("c.debugger = %v, want %v", col.debugger, dbg)
		}
	},
	"CheckHead": func(t *testing.T) {
		if col := NewCollector(CheckHead()); !col.CheckHead {
			t.Fatal("c.CheckHead = false, want true")
		}
	},
	"Async": func(t *testing.T) {
		if col := NewCollector(Async()); !col.Async {
			t.Fatal("c.Async = false, want true")
		}
	},
}
// TestNoAcceptHeader checks the Accept request header in two situations: a
// fresh collector must send "*/*", and a collector whose OnRequest callback
// deletes the header must send none. The "/accept_header" endpoint echoes the
// header value it received in the response body.
func TestNoAcceptHeader(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	target := srv.URL + "/accept_header"
	var echoed string
	record := func(resp *Response) {
		echoed = string(resp.Body)
	}

	// Accept is sent by default.
	{
		col := NewCollector()
		col.OnResponse(record)
		col.Visit(target)
		if echoed != "*/*" {
			t.Errorf("default Accept header isn't */*. got: %v", echoed)
		}
	}

	// Accept can be removed before the request goes out.
	{
		col := NewCollector()
		col.OnRequest(func(req *Request) {
			req.Headers.Del("Accept")
		})
		col.OnResponse(record)
		col.Visit(target)
		if echoed != "" {
			t.Errorf("failed to pass request with no Accept header. got: %v", echoed)
		}
	}
}
// TestNewCollector runs every entry of newCollectorTests as a subtest nested
// under the "Functional Options" group.
func TestNewCollector(t *testing.T) {
	runOptionTests := func(t *testing.T) {
		for optionName, optionTest := range newCollectorTests {
			t.Run(optionName, optionTest)
		}
	}
	t.Run("Functional Options", runOptionTests)
}
// TestCollectorVisit visits the server root and verifies that the OnRequest,
// OnResponse and OnScraped callbacks all fire, that OnScraped runs after the
// other two, that a value stored in the request context is visible in the
// response, and that the body equals serverIndexResponse.
func TestCollectorVisit(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	var sawRequest, sawResponse, sawScraped bool
	col := NewCollector()

	col.OnRequest(func(req *Request) {
		sawRequest = true
		req.Ctx.Put("x", "y")
	})

	col.OnResponse(func(resp *Response) {
		sawResponse = true
		if stored := resp.Ctx.Get("x"); stored != "y" {
			t.Error("Failed to retrieve context value for key 'x'")
		}
		if !bytes.Equal(resp.Body, serverIndexResponse) {
			t.Error("Response body does not match with the original content")
		}
	})

	col.OnScraped(func(_ *Response) {
		if !sawResponse {
			t.Error("OnScraped called before OnResponse")
		}
		if !sawRequest {
			t.Error("OnScraped called before OnRequest")
		}
		sawScraped = true
	})

	col.Visit(srv.URL)

	outcomes := []struct {
		fired   bool
		failure string
	}{
		{sawRequest, "Failed to call OnRequest callback"},
		{sawResponse, "Failed to call OnResponse callback"},
		{sawScraped, "Failed to call OnScraped callback"},
	}
	for _, o := range outcomes {
		if !o.fired {
			t.Error(o.failure)
		}
	}
}
// TestCollectorVisitWithAllowedDomains restricts a collector to the loopback
// host names and checks that the local test server can be visited while a
// request to any other domain is rejected with ErrForbiddenDomain.
func TestCollectorVisitWithAllowedDomains(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector(AllowedDomains("localhost", "127.0.0.1", "::1"))

	if err := c.Visit(ts.URL); err != nil {
		// Report the underlying error so a failure is diagnosable.
		t.Errorf("Failed to visit url %s: %v", ts.URL, err)
	}

	// errors.Is keeps the check valid even if the sentinel is ever wrapped,
	// matching how other sentinel errors are tested in this file.
	if err := c.Visit("http://example.com"); !errors.Is(err, ErrForbiddenDomain) {
		t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err)
	}
}
// TestCollectorVisitWithDisallowedDomains checks the DisallowedDomains option
// from both sides: a collector that disallows the loopback host names must
// refuse the local test server, and a collector that disallows example.com
// must refuse that host (even with an explicit port) while still being able
// to visit the test server.
func TestCollectorVisitWithDisallowedDomains(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector(DisallowedDomains("localhost", "127.0.0.1", "::1"))
	// errors.Is keeps the checks valid even if the sentinel is ever wrapped,
	// matching how other sentinel errors are tested in this file.
	if err := c.Visit(ts.URL); !errors.Is(err, ErrForbiddenDomain) {
		t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err)
	}

	c2 := NewCollector(DisallowedDomains("example.com"))
	if err := c2.Visit("http://example.com:8080"); !errors.Is(err, ErrForbiddenDomain) {
		t.Errorf("c.Visit should return ErrForbiddenDomain, but got %v", err)
	}
	if err := c2.Visit(ts.URL); err != nil {
		// Report the underlying error so a failure is diagnosable.
		t.Errorf("Failed to visit url %s: %v", ts.URL, err)
	}
}
// TestCollectorVisitResponseHeaders requests the endless "/large_binary"
// endpoint and aborts the request from the OnResponseHeaders callback once it
// sees the application/octet-stream content type. OnResponseHeaders must have
// run and OnResponse must never run.
func TestCollectorVisitResponseHeaders(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	headersSeen := false
	col := NewCollector()

	col.OnResponseHeaders(func(resp *Response) {
		headersSeen = true
		contentType := resp.Headers.Get("Content-Type")
		if contentType == "application/octet-stream" {
			resp.Request.Abort()
		}
	})
	col.OnResponse(func(_ *Response) {
		t.Error("OnResponse was called")
	})

	col.Visit(srv.URL + "/large_binary")

	if !headersSeen {
		t.Error("OnResponseHeaders was not called")
	}
}
// TestCollectorOnHTML registers OnHTML callbacks for the <title>, <p> and
// <body> selectors and visits "/html". It checks the title text, the class
// attribute of each paragraph, the ChildAttr/ChildAttrs helpers on <body>,
// and that the callbacks fired the expected number of times.
func TestCollectorOnHTML(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()

	var (
		sawTitle   bool
		paragraphs int
	)

	col.OnHTML("title", func(el *HTMLElement) {
		sawTitle = true
		if el.Text != "Test Page" {
			t.Error("Title element text does not match, got", el.Text)
		}
	})

	col.OnHTML("p", func(el *HTMLElement) {
		paragraphs++
		if class := el.Attr("class"); class != "description" {
			t.Error("Failed to get paragraph's class attribute")
		}
	})

	col.OnHTML("body", func(el *HTMLElement) {
		if first := el.ChildAttr("p", "class"); first != "description" {
			t.Error("Invalid class value")
		}
		if all := el.ChildAttrs("p", "class"); len(all) != 2 {
			t.Error("Invalid class values")
		}
	})

	col.Visit(srv.URL + "/html")

	if !sawTitle {
		t.Error("Failed to call OnHTML callback for <title> tag")
	}
	if paragraphs != 2 {
		t.Error("Failed to find all <p> tags")
	}
}
// TestCollectorContentSniffing fetches "/html" with the no-content-type query
// parameter, asserts that the response has no Content-Type header entry, and
// verifies that the OnHTML callback still runs for the document.
func TestCollectorContentSniffing(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()
	parsedAsHTML := false

	col.OnResponse(func(resp *Response) {
		contentType := (*resp.Headers)["Content-Type"]
		if contentType != nil {
			t.Error("Content-Type unexpectedly not nil")
		}
	})
	col.OnHTML("html", func(_ *HTMLElement) {
		parsedAsHTML = true
	})

	if err := col.Visit(srv.URL + "/html?no-content-type=yes"); err != nil {
		t.Fatal(err)
	}
	if !parsedAsHTML {
		t.Error("OnHTML was not called")
	}
}
// TestCollectorURLRevisit counts OnRequest invocations to check that a URL is
// requested only once by default, and on every call once AllowURLRevisit has
// been switched on.
func TestCollectorURLRevisit(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	requests := 0
	col := NewCollector()
	col.OnRequest(func(_ *Request) {
		requests++
	})

	// Two visits with revisiting disabled: only the first one goes out.
	for i := 0; i < 2; i++ {
		col.Visit(srv.URL)
	}
	if requests != 1 {
		t.Error("URL revisited")
	}

	// Two more visits with revisiting enabled: both go out.
	col.AllowURLRevisit = true
	for i := 0; i < 2; i++ {
		col.Visit(srv.URL)
	}
	if requests != 3 {
		t.Error("URL not revisited")
	}
}
// TestCollectorPostRevisit posts form data to "/login" and counts responses.
// With revisiting disabled, repeating an identical POST is suppressed while a
// POST with a different payload still goes through. With AllowURLRevisit
// enabled, identical POSTs are sent every time.
func TestCollectorPostRevisit(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	loginURL := srv.URL + "/login"
	name := "hello"
	form := map[string]string{"name": name}

	responses := 0
	col := NewCollector()
	col.OnResponse(func(resp *Response) {
		if string(resp.Body) != name {
			t.Error("Failed to send data with POST")
		}
		responses++
	})

	col.Post(loginURL, form)
	col.Post(loginURL, form)
	extended := map[string]string{
		"name":     name,
		"lastname": "world",
	}
	col.Post(loginURL, extended)
	if responses != 2 {
		t.Error("URL POST revisited")
	}

	col.AllowURLRevisit = true
	col.Post(loginURL, form)
	col.Post(loginURL, form)
	if responses != 4 {
		t.Error("URL POST not revisited")
	}
}
// TestCollectorURLRevisitCheck exercises HasVisited before and after a visit
// and then revisits a series of paths, several of them reached through
// "/redirect", checking that each attempt fails with an AlreadyVisitedError
// whose Destination is the expected URL.
func TestCollectorURLRevisitCheck(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()

	seen, err := col.HasVisited(srv.URL)
	if err != nil {
		t.Error(err.Error())
	}
	if seen {
		t.Error("Expected URL to NOT have been visited")
	}

	col.Visit(srv.URL)

	seen, err = col.HasVisited(srv.URL)
	if err != nil {
		t.Error(err.Error())
	}
	if !seen {
		t.Error("Expected URL to have been visited")
	}

	cases := []struct {
		Path             string
		DestinationError string
	}{
		{"/", "/"},
		{"/redirect?d=/", "/"},
		// now that /redirect?d=/ itself is recorded as visited,
		// it's now returned in error
		{"/redirect?d=/", "/redirect?d=/"},
		{"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/"},
		{"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/redirect%3Fd%3D/"},
		{"/redirect?d=/redirect%3Fd%3D/&foo=bar", "/redirect?d=/"},
	}

	for idx, tc := range cases {
		visitErr := col.Visit(srv.URL + tc.Path)

		// An empty DestinationError means the visit is expected to succeed.
		if tc.DestinationError == "" {
			if visitErr != nil {
				t.Errorf("got unexpected error in test %d: %q", idx, visitErr)
			}
			continue
		}

		var visited *AlreadyVisitedError
		if !errors.As(visitErr, &visited) {
			t.Errorf("err=%q returned when trying to revisit, expected AlreadyVisitedError", visitErr)
			continue
		}

		got := visited.Destination.String()
		want := srv.URL + tc.DestinationError
		if got != want {
			t.Errorf("wrong destination in AlreadyVisitedError in test %d, got=%q want=%q", idx, got, want)
		}
	}
}
// TestSetCookieRedirect wraps the fixture server's handler with each of the
// session-cookie middlewares in turn and checks that visiting the root still
// ends with a 200 response whose body is serverIndexResponse.
func TestSetCookieRedirect(t *testing.T) {
	wrappers := []func(http.Handler) http.Handler{
		requireSessionCookieSimple,
		requireSessionCookieAuthPage,
	}

	for i := range wrappers {
		wrap := wrappers[i]
		t.Run("", func(t *testing.T) {
			// The handler has to be wrapped before the server starts.
			srv := newUnstartedTestServer()
			srv.Config.Handler = wrap(srv.Config.Handler)
			srv.Start()
			defer srv.Close()

			col := NewCollector()
			col.OnResponse(func(resp *Response) {
				if !bytes.Equal(resp.Body, serverIndexResponse) {
					t.Errorf("bad response body got=%q want=%q", resp.Body, serverIndexResponse)
				}
				if resp.StatusCode != http.StatusOK {
					t.Errorf("bad response code got=%d want=%d", resp.StatusCode, http.StatusOK)
				}
			})

			err := col.Visit(srv.URL)
			if err != nil {
				t.Fatal(err)
			}
		})
	}
}
// TestCollectorPostURLRevisitCheck verifies HasPosted: it must report false
// for a payload that has not been posted yet and true once that exact payload
// has been posted, and adding a field to the payload must make it count as
// not posted again.
func TestCollectorPostURLRevisitCheck(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()
	loginURL := srv.URL + "/login"
	form := map[string]string{"name": "hello"}

	// expectPosted queries HasPosted for the current contents of form and
	// reports failureMsg when the answer differs from want.
	expectPosted := func(want bool, failureMsg string) {
		got, err := col.HasPosted(loginURL, form)
		if err != nil {
			t.Error(err.Error())
		}
		if got != want {
			t.Error(failureMsg)
		}
	}

	expectPosted(false, "Expected URL to NOT have been visited")

	col.Post(loginURL, form)
	expectPosted(true, "Expected URL to have been visited")

	// A changed payload is a different POST.
	form["lastname"] = "world"
	expectPosted(false, "Expected URL to NOT have been visited")

	col.Post(loginURL, form)
	expectPosted(true, "Expected URL to have been visited")
}
// TestCollectorURLRevisitDomainDisallowed ensures that a URL rejected because
// its domain is disallowed is not recorded as visited: a second attempt must
// fail with ErrForbiddenDomain again rather than with a revisit error.
func TestCollectorURLRevisitDomainDisallowed(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	serverURL, parseErr := url.Parse(srv.URL)
	if parseErr != nil {
		t.Fatal(parseErr)
	}

	col := NewCollector(DisallowedDomains(serverURL.Hostname()))

	if err := col.Visit(srv.URL); err != ErrForbiddenDomain {
		t.Fatalf("wrong error on first visit: got=%v want=%v", err, ErrForbiddenDomain)
	}
	if err := col.Visit(srv.URL); err != ErrForbiddenDomain {
		t.Fatalf("wrong error on second visit: got=%v want=%v", err, ErrForbiddenDomain)
	}
}
// TestCollectorPost posts a form to "/login", which echoes the "name" field,
// and checks that the response body matches the value that was sent. The
// request error and the callback invocation are both checked so the test
// cannot pass without a response having been received.
func TestCollectorPost(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	postValue := "hello"
	responded := false

	c := NewCollector()
	c.OnResponse(func(r *Response) {
		responded = true
		if postValue != string(r.Body) {
			t.Error("Failed to send data with POST")
		}
	})

	err := c.Post(ts.URL+"/login", map[string]string{
		"name": postValue,
	})
	if err != nil {
		t.Fatalf("POST request failed: %v", err)
	}
	if !responded {
		t.Error("OnResponse was not called for POST request")
	}
}
// TestCollectorPostRaw sends a raw, pre-encoded form body to "/login", which
// echoes the "name" field, and checks that the response body matches the
// value that was sent. The request error and the callback invocation are both
// checked so the test cannot pass without a response having been received.
func TestCollectorPostRaw(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	postValue := "hello"
	responded := false

	c := NewCollector()
	c.OnResponse(func(r *Response) {
		responded = true
		if postValue != string(r.Body) {
			t.Error("Failed to send data with POST")
		}
	})

	if err := c.PostRaw(ts.URL+"/login", []byte("name="+postValue)); err != nil {
		t.Fatalf("raw POST request failed: %v", err)
	}
	if !responded {
		t.Error("OnResponse was not called for raw POST request")
	}
}
// TestCollectorPostRawRevisit is the raw-body counterpart of the POST revisit
// test: identical raw payloads are sent only once unless AllowURLRevisit is
// enabled, while a different payload to the same URL is always sent.
func TestCollectorPostRawRevisit(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	loginURL := srv.URL + "/login"
	name := "hello"
	payload := "name=" + name

	responses := 0
	col := NewCollector()
	col.OnResponse(func(resp *Response) {
		if string(resp.Body) != name {
			t.Error("Failed to send data with POST RAW")
		}
		responses++
	})

	col.PostRaw(loginURL, []byte(payload))
	col.PostRaw(loginURL, []byte(payload))
	col.PostRaw(loginURL, []byte(payload+"&lastname=world"))
	if responses != 2 {
		t.Error("URL POST RAW revisited")
	}

	col.AllowURLRevisit = true
	col.PostRaw(loginURL, []byte(payload))
	col.PostRaw(loginURL, []byte(payload))
	if responses != 4 {
		t.Error("URL POST RAW not revisited")
	}
}
// TestRedirect visits "/redirect" and checks that, after the redirect has
// been followed, the request URL seen in OnResponseHeaders and OnResponse is
// the redirect target and that relative links in the page are resolved
// against it.
func TestRedirect(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()

	col.OnHTML("a[href]", func(el *HTMLElement) {
		resolved := el.Request.AbsoluteURL(el.Attr("href"))
		if strings.HasSuffix(resolved, "/redirected/test") {
			return
		}
		t.Error("Invalid URL after redirect: " + resolved)
	})

	col.OnResponseHeaders(func(resp *Response) {
		current := resp.Request.URL.String()
		if strings.HasSuffix(current, "/redirected/") {
			return
		}
		t.Error("Invalid URL in Request after redirect (OnResponseHeaders): " + resp.Request.URL.String())
	})

	col.OnResponse(func(resp *Response) {
		current := resp.Request.URL.String()
		if strings.HasSuffix(current, "/redirected/") {
			return
		}
		t.Error("Invalid URL in Request after redirect (OnResponse): " + resp.Request.URL.String())
	})

	col.Visit(srv.URL + "/redirect")
}
// TestIssue594 is a regression test for a data race. It makes no assertions
// of its own; it is only meaningful when run under the race detector.
func TestIssue594(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()

	// The race is only reachable when the client has no timeout configured,
	// so install a client whose Timeout is explicitly zero.
	client := &http.Client{Timeout: 0}
	col.SetClient(client)

	col.Visit(srv.URL)
}
// TestRedirectWithDisallowedURLs follows "/redirect" and then tries to visit
// the link found on the target page. That link's absolute URL is listed in
// DisallowedURLFilters, so the nested Visit must fail with ErrForbiddenURL.
func TestRedirectWithDisallowedURLs(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	// The server URL contains regexp metacharacters (the dots in the host),
	// so quote it to make the filter match that exact URL text only.
	forbidden := regexp.QuoteMeta(ts.URL + "/redirected/test")
	c.DisallowedURLFilters = []*regexp.Regexp{regexp.MustCompile(forbidden)}

	c.OnHTML("a[href]", func(e *HTMLElement) {
		u := e.Request.AbsoluteURL(e.Attr("href"))
		err := c.Visit(u)
		if !errors.Is(err, ErrForbiddenURL) {
			t.Error("URL should have been forbidden: " + u)
		}
	})

	c.Visit(ts.URL + "/redirect")
}
// TestBaseTag checks that an absolute <base href> in the page is used when
// resolving relative links, for both the OnHTML and the OnXML callbacks.
func TestBaseTag(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	// The "/base" page declares <base href="http://xy.com/"> and links to "z".
	// The failure messages are built from this constant so that they always
	// report the URL the assertion actually compares against.
	const want = "http://xy.com/z"

	c := NewCollector()
	c.OnHTML("a[href]", func(e *HTMLElement) {
		u := e.Request.AbsoluteURL(e.Attr("href"))
		if u != want {
			t.Error("Invalid <base /> tag handling in OnHTML: expected " + want + ", got " + u)
		}
	})
	c.Visit(ts.URL + "/base")

	c2 := NewCollector()
	c2.OnXML("//a", func(e *XMLElement) {
		u := e.Request.AbsoluteURL(e.Attr("href"))
		if u != want {
			t.Error("Invalid <base /> tag handling in OnXML: expected " + want + ", got " + u)
		}
	})
	c2.Visit(ts.URL + "/base")
}
// TestBaseTagRelative checks that a relative <base href> is itself resolved
// against the page URL before being applied to links, for both the OnHTML and
// the OnXML callbacks.
func TestBaseTagRelative(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	// "/base_relative" declares <base href="/foobar/"> and links to "z".
	want := srv.URL + "/foobar/z"
	page := srv.URL + "/base_relative"

	htmlCollector := NewCollector()
	htmlCollector.OnHTML("a[href]", func(el *HTMLElement) {
		if got := el.Request.AbsoluteURL(el.Attr("href")); got != want {
			t.Errorf("Invalid <base /> tag handling in OnHTML: expected %q, got %q", want, got)
		}
	})
	htmlCollector.Visit(page)

	xmlCollector := NewCollector()
	xmlCollector.OnXML("//a", func(el *XMLElement) {
		if got := el.Request.AbsoluteURL(el.Attr("href")); got != want {
			t.Errorf("Invalid <base /> tag handling in OnXML: expected %q, got %q", want, got)
		}
	})
	xmlCollector.Visit(page)
}
// TestTabsAndNewlines visits "/tabs_and_newlines", follows the link found on
// that page, and checks that exactly the paths "/tabs_and_newlines" and
// "/foobar/xy" were fetched.
//
// The expectation may look odd; see step 3 of
// https://url.spec.whatwg.org/#concept-basic-url-parser
func TestTabsAndNewlines(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	want := map[string]struct{}{
		"/tabs_and_newlines": {},
		"/foobar/xy":         {},
	}
	got := make(map[string]struct{})

	col := NewCollector()
	col.OnResponse(func(resp *Response) {
		path := resp.Request.URL.EscapedPath()
		got[path] = struct{}{}
	})
	col.OnHTML("a[href]", func(el *HTMLElement) {
		err := el.Request.Visit(el.Attr("href"))
		if err != nil {
			t.Errorf("visit failed: %v", err)
		}
	})

	err := col.Visit(srv.URL + "/tabs_and_newlines")
	if err != nil {
		t.Errorf("visit failed: %v", err)
	}

	if !reflect.DeepEqual(got, want) {
		t.Errorf("visited=%v expected=%v", got, want)
	}
}
// TestLonePercent checks how URLs containing an invalid percent escape are
// requested: a bare "%" in the path is expected to be sent as "%25", while an
// invalid escape in the query string is expected to be passed through as is.
func TestLonePercent(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	var requested string
	col := NewCollector()
	col.OnResponse(func(resp *Response) {
		requested = resp.Request.URL.RequestURI()
	})

	err := col.Visit(srv.URL + "/100%")
	if err != nil {
		t.Errorf("visit failed: %v", err)
	}
	// Automatic encoding is not really correct: browsers
	// would send bare percent here. However, Go net/http
	// cannot send such requests due to
	// https://github.com/golang/go/issues/29808. So we have two
	// alternatives really: return an error when attempting
	// to fetch such URLs, or at least try the encoded variant.
	// This test checks that the latter is attempted.
	if requested != "/100%25" {
		t.Errorf("got=%q want=%q", requested, "/100%25")
	}

	// invalid URL escape in query component is not a problem,
	// but check it anyway
	err = col.Visit(srv.URL + "/?a=100%zz")
	if err != nil {
		t.Errorf("visit failed: %v", err)
	}
	if requested != "/?a=100%zz" {
		t.Errorf("got=%q want=%q", requested, "/?a=100%zz")
	}
}
// TestCollectorCookies visits "/set_cookie" and then "/check_cookie" with the
// same collector. The second endpoint answers 500 unless it receives the
// cookie set by the first, so both visits must succeed.
func TestCollectorCookies(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()

	err := col.Visit(srv.URL + "/set_cookie")
	if err != nil {
		t.Fatal(err)
	}

	err = col.Visit(srv.URL + "/check_cookie")
	if err != nil {
		t.Fatalf("Failed to use previously set cookies: %s", err)
	}
}
// TestRobotsWhenAllowed turns robots.txt handling on and visits "/allowed",
// which robotsFile permits. The visit must succeed with status 200.
func TestRobotsWhenAllowed(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	col := NewCollector()
	col.IgnoreRobotsTxt = false

	col.OnResponse(func(r *Response) {
		if code := r.StatusCode; code != 200 {
			t.Fatalf("Wrong response code: %d", code)
		}
	})

	if err := col.Visit(srv.URL + "/allowed"); err != nil {
		t.Fatal(err)
	}
}
// TestRobotsWhenDisallowed turns robots.txt handling on and visits
// "/disallowed", which robotsFile forbids. No response may be delivered and
// Visit must fail with the robots.txt error.
func TestRobotsWhenDisallowed(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	c.IgnoreRobotsTxt = false

	c.OnResponse(func(resp *Response) {
		t.Fatalf("Received response: %d", resp.StatusCode)
	})

	err := c.Visit(ts.URL + "/disallowed")
	// Guard against a nil error first: calling err.Error() on nil would
	// panic instead of producing a readable test failure.
	if err == nil {
		t.Fatal("expected robots.txt error, got nil")
	}
	if err.Error() != "URL blocked by robots.txt" {
		t.Fatalf("wrong error message: %v", err)
	}
}
// TestRobotsWhenDisallowedWithQueryParameter visits "/allowed?q=1" with
// robots.txt handling on. The path alone is allowed, but robotsFile forbids
// "/allowed*q=", so no response may be delivered and Visit must fail with the
// robots.txt error.
func TestRobotsWhenDisallowedWithQueryParameter(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector()
	c.IgnoreRobotsTxt = false

	c.OnResponse(func(resp *Response) {
		t.Fatalf("Received response: %d", resp.StatusCode)
	})

	err := c.Visit(ts.URL + "/allowed?q=1")
	// Guard against a nil error first: calling err.Error() on nil would
	// panic instead of producing a readable test failure.
	if err == nil {
		t.Fatal("expected robots.txt error, got nil")
	}
	if err.Error() != "URL blocked by robots.txt" {
		t.Fatalf("wrong error message: %v", err)
	}
}
// TestIgnoreRobotsWhenDisallowed visits "/disallowed" with IgnoreRobotsTxt
// enabled and expects the visit to succeed with a 200 response.
func TestIgnoreRobotsWhenDisallowed(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()
	collector.IgnoreRobotsTxt = true
	collector.OnResponse(func(resp *Response) {
		if code := resp.StatusCode; code != 200 {
			t.Fatalf("Wrong response code: %d", code)
		}
	})

	if err := collector.Visit(srv.URL + "/disallowed"); err != nil {
		t.Fatal(err)
	}
}
// TestConnectionErrorOnRobotsTxtResultsInError expects Visit to return an
// error when the target server is already closed and IgnoreRobotsTxt is
// disabled.
func TestConnectionErrorOnRobotsTxtResultsInError(t *testing.T) {
	srv := newTestServer()
	// Close the server right away to force a connection error.
	srv.Close()

	collector := NewCollector()
	collector.IgnoreRobotsTxt = false

	if collector.Visit(srv.URL) == nil {
		t.Fatal("Error expected")
	}
}
// TestEnvSettings sets COLLY_USER_AGENT to "test" before creating a collector
// and expects the "/user_agent" endpoint to answer with the body "test".
//
// NOTE(review): presumably NewCollector reads COLLY_* environment variables,
// which is why the variable is set before the collector is created — confirm
// against the collector implementation.
func TestEnvSettings(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	if err := os.Setenv("COLLY_USER_AGENT", "test"); err != nil {
		t.Fatalf("setting COLLY_USER_AGENT: %v", err)
	}
	defer os.Unsetenv("COLLY_USER_AGENT")

	c := NewCollector()
	valid := false
	c.OnResponse(func(resp *Response) {
		if string(resp.Body) == "test" {
			valid = true
		}
	})

	// Report a failed visit explicitly; otherwise a transport problem would be
	// misreported below as a wrong user agent.
	if err := c.Visit(ts.URL + "/user_agent"); err != nil {
		t.Fatalf("visit failed: %v", err)
	}
	if !valid {
		t.Fatalf("Wrong user-agent from environment")
	}
}
// TestUserAgent checks the body returned by the "/user_agent" endpoint for
// several configurations: a collector without options, a collector created
// with the UserAgent option, and explicit requests carrying a nil header, an
// empty header, a header with an empty User-Agent and a header with a
// different User-Agent.
func TestUserAgent(t *testing.T) {
	const (
		exampleUserAgent1 = "Example/1.0"
		exampleUserAgent2 = "Example/2.0"
		defaultUserAgent  = "colly - https://github.com/gocolly/colly/v2"
	)

	ts := newTestServer()
	defer ts.Close()

	// receivedUserAgent is shared by every case below and is only overwritten
	// when a response arrives.
	var receivedUserAgent string
	capture := func(resp *Response) {
		receivedUserAgent = string(resp.Body)
	}
	endpoint := ts.URL + "/user_agent"

	// Collector without any options.
	{
		c := NewCollector()
		c.OnResponse(capture)
		c.Visit(endpoint)
		if receivedUserAgent != defaultUserAgent {
			t.Errorf("mismatched User-Agent: got=%q want=%q", receivedUserAgent, defaultUserAgent)
		}
	}

	// Collector-level UserAgent option, plain Visit.
	{
		c := NewCollector(UserAgent(exampleUserAgent1))
		c.OnResponse(capture)
		c.Visit(endpoint)
		if receivedUserAgent != exampleUserAgent1 {
			t.Errorf("mismatched User-Agent: got=%q want=%q", receivedUserAgent, exampleUserAgent1)
		}
	}

	// Explicit request with a nil header.
	{
		c := NewCollector(UserAgent(exampleUserAgent1))
		c.OnResponse(capture)
		c.Request("GET", endpoint, nil, nil, nil)
		if receivedUserAgent != exampleUserAgent1 {
			t.Errorf("mismatched User-Agent (nil hdr): got=%q want=%q", receivedUserAgent, exampleUserAgent1)
		}
	}

	// Explicit request with an empty, non-nil header.
	{
		c := NewCollector(UserAgent(exampleUserAgent1))
		c.OnResponse(capture)
		c.Request("GET", endpoint, nil, nil, http.Header{})
		if receivedUserAgent != exampleUserAgent1 {
			t.Errorf("mismatched User-Agent (non-nil hdr): got=%q want=%q", receivedUserAgent, exampleUserAgent1)
		}
	}

	// Explicit request whose header sets User-Agent to the empty string.
	{
		c := NewCollector(UserAgent(exampleUserAgent1))
		c.OnResponse(capture)
		hdr := http.Header{}
		hdr.Set("User-Agent", "")
		c.Request("GET", endpoint, nil, nil, hdr)
		if receivedUserAgent != "" {
			t.Errorf("mismatched User-Agent (hdr with empty UA): got=%q want=%q", receivedUserAgent, "")
		}
	}

	// Explicit request whose header sets a different User-Agent.
	{
		c := NewCollector(UserAgent(exampleUserAgent1))
		c.OnResponse(capture)
		hdr := http.Header{}
		hdr.Set("User-Agent", exampleUserAgent2)
		c.Request("GET", endpoint, nil, nil, hdr)
		if receivedUserAgent != exampleUserAgent2 {
			t.Errorf("mismatched User-Agent (hdr with UA): got=%q want=%q", receivedUserAgent, exampleUserAgent2)
		}
	}
}
// TestHeaders creates collectors with the Headers option and checks the body
// returned by the "/host_header" and "/custom_header" endpoints against the
// configured "Host" and "Test" header values respectively.
func TestHeaders(t *testing.T) {
	const (
		exampleHostHeader = "example.com"
		exampleTestHeader = "Testing"
	)

	ts := newTestServer()
	defer ts.Close()

	// receivedHeader is shared by both cases and is only overwritten when a
	// response arrives.
	var receivedHeader string
	capture := func(resp *Response) {
		receivedHeader = string(resp.Body)
	}

	// Host header.
	{
		hostHeaders := map[string]string{"Host": exampleHostHeader}
		c := NewCollector(Headers(hostHeaders))
		c.OnResponse(capture)
		c.Visit(ts.URL + "/host_header")
		if receivedHeader != exampleHostHeader {
			t.Errorf("mismatched Host header: got=%q want=%q", receivedHeader, exampleHostHeader)
		}
	}

	// Custom "Test" header.
	{
		customHeaders := map[string]string{"Test": exampleTestHeader}
		c := NewCollector(Headers(customHeaders))
		c.OnResponse(capture)
		c.Visit(ts.URL + "/custom_header")
		if receivedHeader != exampleTestHeader {
			t.Errorf("mismatched custom header: got=%q want=%q", receivedHeader, exampleTestHeader)
		}
	}
}
// TestParseHTTPErrorResponse visits "/500" twice: first with the collector as
// created, then with ParseHTTPErrorResponse set to true. It counts <p>
// elements whose text is "error" and expects none after the first visit and
// exactly one after the second.
func TestParseHTTPErrorResponse(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector(AllowURLRevisit())

	errorParagraphs := 0
	collector.OnHTML("p", func(e *HTMLElement) {
		if e.Text != "error" {
			return
		}
		errorParagraphs++
	})

	target := srv.URL + "/500"

	// The return value of Visit is not inspected here, as in the original.
	collector.Visit(target)
	if errorParagraphs != 0 {
		t.Fatal("Content is parsed without ParseHTTPErrorResponse enabled")
	}

	collector.ParseHTTPErrorResponse = true
	collector.Visit(target)
	if errorParagraphs != 1 {
		t.Fatal("Content isn't parsed with ParseHTTPErrorResponse enabled")
	}
}
// TestHTMLElement parses a single anchor tag with goquery, wraps every node
// matched by "a[href]" via NewHTMLElementFromSelectionNode and checks the
// resulting element's Name, Text and "href" attribute.
func TestHTMLElement(t *testing.T) {
	ctx := &Context{}
	resp := &Response{
		Request: &Request{
			Ctx: ctx,
		},
		Ctx: ctx,
	}

	in := `<a href="http://go-colly.org">Colly</a>`
	sel := "a[href]"
	doc, err := goquery.NewDocumentFromReader(bytes.NewBufferString(in))
	if err != nil {
		t.Fatal(err)
	}

	var elements []*HTMLElement
	i := 0
	doc.Find(sel).Each(func(_ int, s *goquery.Selection) {
		for _, n := range s.Nodes {
			elements = append(elements, NewHTMLElementFromSelectionNode(resp, s, n, i))
			i++
		}
	})

	// Stop immediately on a length mismatch: elements[0] below would panic on
	// an empty slice and mask the actual failure.
	if elementsLen := len(elements); elementsLen != 1 {
		t.Fatalf("element length mismatch. got %d, expected %d.\n", elementsLen, 1)
	}

	v := elements[0]
	if v.Name != "a" {
		t.Errorf("element tag mismatch. got %s, expected %s.\n", v.Name, "a")
	}
	if v.Text != "Colly" {
		t.Errorf("element content mismatch. got %s, expected %s.\n", v.Text, "Colly")
	}
	if v.Attr("href") != "http://go-colly.org" {
		t.Errorf("element href mismatch. got %s, expected %s.\n", v.Attr("href"), "http://go-colly.org")
	}
}
// TestCollectorOnXMLWithHtml registers OnXML callbacks for the title, the
// paragraphs and the body of the "/html" page and checks the title text, the
// paragraphs' "class" attribute and the number of paragraphs seen.
func TestCollectorOnXMLWithHtml(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()

	var (
		sawTitle   bool
		paragraphs int
	)

	collector.OnXML("/html/head/title", func(e *XMLElement) {
		sawTitle = true
		if e.Text == "Test Page" {
			return
		}
		t.Error("Title element text does not match, got", e.Text)
	})

	collector.OnXML("/html/body/p", func(e *XMLElement) {
		paragraphs++
		if class := e.Attr("class"); class != "description" {
			t.Error("Failed to get paragraph's class attribute")
		}
	})

	collector.OnXML("/html/body", func(e *XMLElement) {
		if first := e.ChildAttr("p", "class"); first != "description" {
			t.Error("Invalid class value")
		}
		if all := e.ChildAttrs("p", "class"); len(all) != 2 {
			t.Error("Invalid class values")
		}
	})

	collector.Visit(srv.URL + "/html")

	if !sawTitle {
		t.Error("Failed to call OnXML callback for <title> tag")
	}
	if paragraphs != 2 {
		t.Error("Failed to find all <p> tags")
	}
}
// TestCollectorOnXMLWithXML registers OnXML callbacks for the title, the
// paragraphs and the root element of the "/xml" document and checks the title
// text, the paragraphs' "type" attribute and the number of paragraphs seen.
func TestCollectorOnXMLWithXML(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()

	var (
		sawTitle   bool
		paragraphs int
	)

	collector.OnXML("//page/title", func(e *XMLElement) {
		sawTitle = true
		if e.Text == "Test Page" {
			return
		}
		t.Error("Title element text does not match, got", e.Text)
	})

	collector.OnXML("//page/paragraph", func(e *XMLElement) {
		paragraphs++
		if kind := e.Attr("type"); kind != "description" {
			t.Error("Failed to get paragraph's type attribute")
		}
	})

	collector.OnXML("/page", func(e *XMLElement) {
		if first := e.ChildAttr("paragraph", "type"); first != "description" {
			t.Error("Invalid type value")
		}
		if all := e.ChildAttrs("paragraph", "type"); len(all) != 2 {
			t.Error("Invalid type values")
		}
	})

	collector.Visit(srv.URL + "/xml")

	if !sawTitle {
		t.Error("Failed to call OnXML callback for <title> tag")
	}
	if paragraphs != 2 {
		t.Error("Failed to find all <paragraph> tags")
	}
}
// TestCollectorVisitWithTrace creates a collector with the TraceHTTP option
// and expects every response delivered to OnResponse to carry a non-nil
// Trace.
func TestCollectorVisitWithTrace(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector(
		AllowedDomains("localhost", "127.0.0.1", "::1"),
		TraceHTTP(),
	)
	collector.OnResponse(func(resp *Response) {
		if resp.Trace != nil {
			return
		}
		t.Error("Failed to initialize trace")
	})

	if err := collector.Visit(srv.URL); err != nil {
		t.Errorf("Failed to visit url %s", srv.URL)
	}
}
// TestCollectorVisitWithCheckHead creates a collector with the CheckHead
// option, records the request method of every response delivered to
// OnResponse, and expects the first two recorded methods to be "HEAD"
// followed by "GET".
func TestCollectorVisitWithCheckHead(t *testing.T) {
	ts := newTestServer()
	defer ts.Close()

	c := NewCollector(CheckHead())
	var requestMethodChain []string
	c.OnResponse(func(resp *Response) {
		requestMethodChain = append(requestMethodChain, resp.Request.Method)
	})

	err := c.Visit(ts.URL)
	if err != nil {
		t.Errorf("Failed to visit url %s", ts.URL)
	}

	// Check the length before indexing so that a missing response is reported
	// as a test failure instead of an index-out-of-range panic. The method
	// checks are joined with || because either one being wrong is a failure;
	// && would only fail when both are wrong.
	if len(requestMethodChain) < 2 ||
		requestMethodChain[0] != "HEAD" ||
		requestMethodChain[1] != "GET" {
		t.Errorf("Failed to perform a HEAD request before GET")
	}
}
// TestCollectorDepth exercises two collectors configured with MaxDepth(2).
// The first re-visits the server through the collector itself from its
// OnResponse callback until ten responses were counted; the second is a clone
// that re-visits through resp.Request.Visit and is expected to stop after two
// responses. Both scenarios are run twice, sharing one response counter.
func TestCollectorDepth(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	const depthLimit = 2
	direct := NewCollector(
		MaxDepth(depthLimit),
		AllowURLRevisit(),
	)

	// hits is shared by the callbacks of both collectors.
	hits := 0
	direct.OnResponse(func(resp *Response) {
		hits++
		if hits < 10 {
			direct.Visit(srv.URL)
		}
	})

	direct.Visit(srv.URL)
	if hits < 10 {
		t.Errorf("Invalid number of requests: %d (expected 10) without using MaxDepth", hits)
	}

	chained := direct.Clone()
	hits = 0
	chained.OnResponse(func(resp *Response) {
		hits++
		resp.Request.Visit(srv.URL)
	})

	chained.Visit(srv.URL)
	if hits != 2 {
		t.Errorf("Invalid number of requests: %d (expected 2) with using MaxDepth 2", hits)
	}

	// The counter is deliberately not reset before this run, as in the
	// original: the first collector continues counting from the current value.
	direct.Visit(srv.URL)
	if hits < 10 {
		t.Errorf("Invalid number of requests: %d (expected 10) without using MaxDepth again", hits)
	}

	hits = 0
	chained.Visit(srv.URL)
	if hits != 2 {
		t.Errorf("Invalid number of requests: %d (expected 2) with using MaxDepth 2 again", hits)
	}
}
// TestCollectorRequests creates a collector with MaxRequests(5) whose
// OnResponse callback re-visits the server unconditionally, and expects
// exactly five responses to be counted.
func TestCollectorRequests(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	const limit uint32 = 5
	collector := NewCollector(
		MaxRequests(limit),
		AllowURLRevisit(),
	)

	responses := 0
	collector.OnResponse(func(resp *Response) {
		responses++
		collector.Visit(srv.URL)
	})

	collector.Visit(srv.URL)

	if responses != 5 {
		t.Errorf("Invalid number of requests: %d (expected 5) with MaxRequests", responses)
	}
}
// TestCollectorContext visits "/slow" with a collector bound to a context
// that times out after 500ms. Per the original note, "/slow" takes 1 second
// to return the response, so the context should abort the transfer: OnError
// must be called with context.DeadlineExceeded, Visit must return the same
// error, and OnResponse must not be called.
func TestCollectorContext(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	const timeout = 500 * time.Millisecond
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	collector := NewCollector(StdlibContext(ctx))

	var sawError bool
	collector.OnResponse(func(resp *Response) {
		t.Error("OnResponse was called, expected OnError")
	})
	collector.OnError(func(resp *Response, err error) {
		sawError = true
		if err == context.DeadlineExceeded {
			return
		}
		t.Errorf("OnError got err=%#v, expected context.DeadlineExceeded", err)
	})

	visitErr := collector.Visit(srv.URL + "/slow")
	if visitErr != context.DeadlineExceeded {
		t.Errorf("Visit return err=%#v, expected context.DeadlineExceeded", visitErr)
	}
	if !sawError {
		t.Error("OnError was not called")
	}
}
// BenchmarkOnHTML visits "/html" b.N times, each time with a different "q"
// query value, on a collector that has a no-op OnHTML callback for "p".
func BenchmarkOnHTML(b *testing.B) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()
	collector.OnHTML("p", func(*HTMLElement) {})

	for i := 0; i < b.N; i++ {
		target := fmt.Sprintf("%s/html?q=%d", srv.URL, i)
		collector.Visit(target)
	}
}
// BenchmarkOnXML visits "/html" b.N times, each time with a different "q"
// query value, on a collector that has a no-op OnXML callback for "//p".
func BenchmarkOnXML(b *testing.B) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()
	collector.OnXML("//p", func(*XMLElement) {})

	for i := 0; i < b.N; i++ {
		target := fmt.Sprintf("%s/html?q=%d", srv.URL, i)
		collector.Visit(target)
	}
}
// BenchmarkOnResponse visits the server root b.N times on a collector that
// has AllowURLRevisit enabled and a no-op OnResponse callback.
func BenchmarkOnResponse(b *testing.B) {
	srv := newTestServer()
	defer srv.Close()

	collector := NewCollector()
	collector.AllowURLRevisit = true
	collector.OnResponse(func(*Response) {})

	for i := 0; i < b.N; i++ {
		collector.Visit(srv.URL)
	}
}
// requireSessionCookieSimple wraps handler so that a request without a
// "session_id" cookie gets that cookie set and is redirected (302 Found) to
// its own request URI. Any other request is passed through to handler.
func requireSessionCookieSimple(handler http.Handler) http.Handler {
	const cookieName = "session_id"

	serve := func(w http.ResponseWriter, r *http.Request) {
		_, err := r.Cookie(cookieName)
		if err != http.ErrNoCookie {
			handler.ServeHTTP(w, r)
			return
		}

		// No session cookie yet: issue one and send the client back to the
		// same URI.
		http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "1"})
		http.Redirect(w, r, r.RequestURI, http.StatusFound)
	}

	return http.HandlerFunc(serve)
}
// requireSessionCookieAuthPage wraps handler with a two-step cookie flow:
//
//   - A request to "/auth" is redirected (302 Found) to the value of its
//     "return" query parameter.
//   - Any other request without a "session_id" cookie gets that cookie set
//     and is redirected to "/auth?return=<escaped request URI>".
//   - Everything else is passed through to handler.
func requireSessionCookieAuthPage(handler http.Handler) http.Handler {
	const (
		cookieName    = "session_id"
		setCookiePath = "/auth"
	)

	serve := func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == setCookiePath {
			http.Redirect(w, r, r.URL.Query().Get("return"), http.StatusFound)
			return
		}

		_, err := r.Cookie(cookieName)
		if err != http.ErrNoCookie {
			handler.ServeHTTP(w, r)
			return
		}

		// No session cookie yet: issue one and bounce through the auth page,
		// remembering where the client wanted to go.
		http.SetCookie(w, &http.Cookie{Name: cookieName, Value: "1"})
		authURL := setCookiePath + "?return=" + url.QueryEscape(r.RequestURI)
		http.Redirect(w, r, authURL, http.StatusFound)
	}

	return http.HandlerFunc(serve)
}
// TestCollectorPostRetry posts a form to "/login", retries the request once
// from OnResponse, and expects the response seen after the retry to have the
// posted value as its body.
func TestCollectorPostRetry(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	const payload = "hello"
	collector := NewCollector()

	retried := false
	collector.OnResponse(func(r *Response) {
		// The context key marks that the first response was already handled.
		firstResponse := r.Ctx.Get("notFirst") == ""
		if firstResponse {
			r.Ctx.Put("notFirst", "first")
			_ = r.Request.Retry()
			return
		}

		if string(r.Body) != payload {
			t.Error("Failed to send data with POST")
		}
		retried = true
	})

	form := map[string]string{
		"name": payload,
	}
	collector.Post(srv.URL+"/login", form)

	if !retried {
		t.Error("OnResponse Retry was not called")
	}
}
// TestCollectorGetRetry visits the server root, retries the request once from
// OnResponse, and expects the response seen after the retry to match
// serverIndexResponse.
func TestCollectorGetRetry(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	retried := false
	collector := NewCollector()
	collector.OnResponse(func(r *Response) {
		// The context key marks that the first response was already handled.
		firstResponse := r.Ctx.Get("notFirst") == ""
		if firstResponse {
			r.Ctx.Put("notFirst", "first")
			_ = r.Request.Retry()
			return
		}

		if !bytes.Equal(r.Body, serverIndexResponse) {
			t.Error("Response body does not match with the original content")
		}
		retried = true
	})

	collector.Visit(srv.URL)

	if !retried {
		t.Error("OnResponse Retry was not called")
	}
}
// TestCollectorPostRetryUnseekable sends a POST whose body is a *bytes.Buffer
// and expects Retry, called from OnResponse, to fail with
// ErrRetryBodyUnseekable, so that no second response is ever processed.
func TestCollectorPostRetryUnseekable(t *testing.T) {
	srv := newTestServer()
	defer srv.Close()

	const payload = "hello"
	retried := false

	collector := NewCollector()
	collector.OnResponse(func(r *Response) {
		if string(r.Body) != payload {
			t.Error("Failed to send data with POST")
		}

		if r.Ctx.Get("notFirst") != "" {
			// Only reached if a retry actually produced another response.
			retried = true
			return
		}

		r.Ctx.Put("notFirst", "first")
		if err := r.Request.Retry(); !errors.Is(err, ErrRetryBodyUnseekable) {
			t.Errorf("Unexpected error Type ErrRetryBodyUnseekable : %v", err)
		}
	})

	body := bytes.NewBufferString("name=" + payload)
	collector.Request("POST", srv.URL+"/login", body, nil, nil)

	if retried {
		t.Error("OnResponse Retry was called but BodyUnseekable")
	}
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/mirrors/Colly.git
git@gitee.com:mirrors/Colly.git
mirrors
Colly
Colly
master

搜索帮助