1 Star 0 Fork 0

caiqimin_kevin@163.com/lazada-spider

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
main.go 6.69 KB
一键复制 编辑 原始数据 按行查看 历史
caiqimin_kevin@163.com 提交于 2024-03-03 20:02 . dev
package main
import (
"context"
"flag"
"fmt"
"io/ioutil"
"log"
"regexp"
"strconv"
"strings"
"time"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"golang.org/x/exp/slices"
)
// Group is one batch of product URLs that is fetched and summarised together.
type Group struct {
	// urls are the page URLs assigned to this batch, in input order.
	urls []string
	// groupId is the zero-based position of the batch in the input file.
	groupId int
	// result is the tab-separated summary line produced by groupProc.
	result string
	// err is inspected by main when printing results.
	// NOTE(review): nothing visible in this file assigns it — confirm intent.
	err error
}
// skuNamesPattern matches the first `"skuNames":[ ... ],` fragment in a page.
// It is compiled once at package initialisation instead of on every call.
var skuNamesPattern = regexp.MustCompile(`"skuNames":\[(.*?)\],`)

// findTheTarget extracts the raw "skuNames" fragment from htmlSource.
//
// The returned slice always holds exactly one element: the full matched text
// (including the `"skuNames":[` prefix and the trailing `],`), or the
// placeholder "NA" when htmlSource contains no such fragment. The match is
// non-greedy, so it ends at the first `],` after the opening bracket.
//
// targetClass is not used by the current implementation; it is kept so that
// existing call sites keep compiling. The error result is always nil.
func findTheTarget(htmlSource, targetClass string) (error, []string) {
	found := skuNamesPattern.FindString(htmlSource)
	if found == "" {
		found = "NA"
	}
	return nil, []string{found}
}
// getHttpContent navigates a chromedp browser context to url and returns the
// response body of the network request whose URL is exactly url.
//
// The whole operation is bounded by a 60 second timeout. If no matching
// request finishes loading within that time (for example because the browser
// requested a different URL after a redirect), an error wrapping the context
// error is returned instead of blocking forever.
//
// The body is also dumped to standard output between separator lines.
// Results are returned as (error, body); body is nil when error is non-nil.
func getHttpContent(url string) (error, []byte) {
	// Create the browser context.
	ctx, cancel := chromedp.NewContext(
		context.Background(),
		chromedp.WithLogf(log.Printf),
	)
	defer cancel()

	// Timeout as a safety net against infinite waits.
	ctx, cancel = context.WithTimeout(ctx, 60*time.Second)
	defer cancel()

	// done receives a single token once the matching request has finished
	// loading. It is buffered and written with a non-blocking send so the
	// listener can neither block nor panic if the event is seen again.
	done := make(chan struct{}, 1)

	// requestID identifies the request for url; it is needed both to filter
	// out unrelated network events and to fetch the response body afterwards.
	var requestID network.RequestID

	chromedp.ListenTarget(ctx, func(v interface{}) {
		switch ev := v.(type) {
		case *network.EventRequestWillBeSent:
			if ev.Request.URL == url {
				requestID = ev.RequestID
			}
		case *network.EventLoadingFinished:
			if ev.RequestID == requestID {
				select {
				case done <- struct{}{}:
				default:
				}
			}
		}
	})

	// Navigating to the URL is what triggers the request we listen for.
	if err := chromedp.Run(ctx, chromedp.Navigate(url)); err != nil {
		return err, nil
	}

	// Wait for the listener's signal, but give up when the timeout expires.
	select {
	case <-done:
	case <-ctx.Done():
		return fmt.Errorf("waiting for response of %q: %w", url, ctx.Err()), nil
	}

	// Fetch the bytes the browser received for the captured request.
	var buf []byte
	if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
		var err error
		buf, err = network.GetResponseBody(requestID).Do(ctx)
		return err
	})); err != nil {
		// The caller reports the error; it is only annotated here.
		return fmt.Errorf("reading response body of %q: %w", url, err), nil
	}

	fmt.Println("=========================================================================")
	fmt.Println(string(buf))
	fmt.Println("=========================================================================")
	return nil, buf
}
// splitToGroup reads fileName, treats every non-blank line as one URL and
// partitions the URLs, in file order, into groups of at most lenPerGrp
// entries. Group ids are assigned sequentially starting at 0.
//
// Lines may be terminated by LF, CRLF or CR. Surrounding whitespace is
// stripped from each URL, a leading UTF-8 byte order mark is ignored, and
// blank lines are skipped without consuming a slot in a group.
//
// Results are returned as (error, groups). An error is returned when
// lenPerGrp is not positive or when the file cannot be read; a file without
// any URL yields a nil slice and a nil error.
func splitToGroup(fileName string, lenPerGrp int) (error, []*Group) {
	// A non-positive group size would make the modulo/division below panic.
	if lenPerGrp <= 0 {
		return fmt.Errorf("splitToGroup: lenPerGrp must be positive, got %d", lenPerGrp), nil
	}

	content, err := ioutil.ReadFile(fileName)
	if err != nil {
		return err, nil
	}

	text := strings.TrimPrefix(string(content), "\ufeff")
	// Splitting on both CR and LF handles every common line-ending style.
	rows := strings.FieldsFunc(text, func(r rune) bool {
		return r == '\n' || r == '\r'
	})

	var (
		ret []*Group
		g   *Group
	)
	k := 0 // number of URLs accepted so far
	for _, row := range rows {
		u := strings.TrimSpace(row)
		if u == "" {
			continue
		}
		if k%lenPerGrp == 0 {
			g = &Group{groupId: k / lenPerGrp}
			ret = append(ret, g)
		}
		g.urls = append(g.urls, u)
		k++
	}
	return nil, ret
}
// groupProc fetches every URL of g, extracts the target fragment from each
// page and assembles one tab-separated line: the group id followed by one
// bracketed field per URL, in input order.
//
// Each URL is attempted up to maxRetries times with a one second pause
// between attempts. A URL that still fails, or whose page yields no values,
// contributes the placeholder "[NA]" so that the fields stay aligned with
// g.urls.
//
// The finished line is stored in g.result and also sent on ch; the send
// blocks if ch has no free buffer slot and no receiver.
func groupProc(g *Group, ch chan string) {
	const maxRetries = 3

	fmt.Printf("group=%v start\n", g.groupId)

	result := make([]string, 0, len(g.urls)+1)
	result = append(result, strconv.Itoa(g.groupId))

	for _, url := range g.urls {
		var (
			content []byte
			err     error
		)
		for attempt := 1; attempt <= maxRetries; attempt++ {
			err, content = getHttpContent(url)
			if err == nil {
				break
			}
			fmt.Printf("error gethttpContent, err=%v, retry=%v\n", err, attempt)
			// No point in pausing after the final attempt.
			if attempt < maxRetries {
				time.Sleep(time.Second)
			}
		}
		if err != nil {
			result = append(result, "[NA]")
			continue
		}

		parseErr, vals := findTheTarget(string(content), "sku-name")
		if parseErr != nil || len(vals) == 0 {
			result = append(result, "[NA]")
			continue
		}
		result = append(result, "["+strings.Join(vals, ",")+"]")
	}

	ret := strings.Join(result, "\t")
	g.result = ret
	fmt.Printf("group=%v ended\n", g.groupId)
	ch <- ret
}
// pInputFileName points at the value of the -f command-line flag: the path of
// the input file to process. It defaults to the empty string; main prints the
// usage text and returns when it is left empty.
var pInputFileName = flag.String("f", "", "处理文件名路径")
// test is a manual debugging helper: it loads one hard-coded product page
// through chromedp, logs every request-sent and loading-finished network
// event, and prints the response body of the page request to standard output.
//
// The run is bounded by a 60 second timeout. Any failure, including the
// timeout expiring before the page request finishes, terminates the program
// through log.Fatal; note that log.Fatal exits without running the deferred
// cancel functions.
func test() {
	// Create the browser context.
	ctx, cancel := chromedp.NewContext(
		context.Background(),
		chromedp.WithLogf(log.Printf),
	)
	defer cancel()

	// Timeout as a safety net against infinite waits.
	ctx, cancel = context.WithTimeout(ctx, 60*time.Second)
	defer cancel()

	// done receives a single token once the page request has finished
	// loading. It is buffered and written with a non-blocking send so the
	// listener can neither block nor panic if the event is seen again.
	done := make(chan struct{}, 1)

	// urlstr := "https://www.lazada.com.my/products/-i2729779008-s12896741969.html"
	urlstr := "https://www.lazada.com.my/products/-i2729787539-s19023684171.html"

	// requestID identifies the request for urlstr; it is needed both to filter
	// out unrelated network events and to fetch the response body afterwards.
	var requestID network.RequestID

	chromedp.ListenTarget(ctx, func(v interface{}) {
		switch ev := v.(type) {
		case *network.EventRequestWillBeSent:
			log.Printf("EventRequestWillBeSent: %v: %v", ev.RequestID, ev.Request.URL)
			if ev.Request.URL == urlstr {
				requestID = ev.RequestID
			}
		case *network.EventLoadingFinished:
			log.Printf("EventLoadingFinished: %v", ev.RequestID)
			if ev.RequestID == requestID {
				select {
				case done <- struct{}{}:
				default:
				}
			}
		}
	})

	// Navigating to the URL is what triggers the request we listen for.
	if err := chromedp.Run(ctx, chromedp.Navigate(urlstr)); err != nil {
		log.Fatal(err)
	}

	// Wait for the listener's signal, but give up when the timeout expires.
	select {
	case <-done:
	case <-ctx.Done():
		log.Fatal(ctx.Err())
	}

	// Fetch the bytes the browser received for the captured request.
	var buf []byte
	if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
		var err error
		buf, err = network.GetResponseBody(requestID).Do(ctx)
		return err
	})); err != nil {
		log.Fatal(err)
	}

	fmt.Println("=========================================================================")
	fmt.Println(string(buf))
	fmt.Println("=========================================================================")
}
// main reads the URL list named by the -f flag, splits it into groups of two
// URLs, processes the groups one after another in ascending id order and then
// prints one result line (or one error line) per group.
//
// When -f is missing the usage text is printed and the program returns.
func main() {
	//TODO TEST START
	// ff, _ := ioutil.ReadFile("test.html")
	// findTheTarget(string(ff), "")
	// // test()
	// if true {
	// return
	// }
	//TODO TEST END
	flag.Parse()
	if *pInputFileName == "" {
		flag.Usage()
		return
	}

	err, grps := splitToGroup(*pInputFileName, 2)
	if err != nil {
		fmt.Println(err)
		return
	}

	slices.SortFunc(grps, func(a, b *Group) int {
		if a.groupId < b.groupId {
			return -1
		}
		if a.groupId > b.groupId {
			return 1
		}
		return 0
	})

	// groupProc sends one message per group on ch and nothing receives from
	// it while the groups run sequentially below, so the buffer must have
	// room for every group or a send would block forever.
	ch := make(chan string, len(grps))
	for _, grp := range grps {
		groupProc(grp, ch)
	}

	fmt.Println("===================RESULT=========================")
	for _, g := range grps {
		if g.err != nil {
			fmt.Printf("group=%v error=%v\n", g.groupId, g.err)
			continue
		}
		fmt.Println(g.result)
	}
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/caiqimin/lazada-spider.git
git@gitee.com:caiqimin/lazada-spider.git
caiqimin
lazada-spider
lazada-spider
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385