// Repository page residue (not part of the program): branch "master"
// (1 branch), file path lazada-spider/main.go.

package main

import (
	"context"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/chromedp"
	"golang.org/x/exp/slices"
)

// Group is one batch of input URLs together with the outcome of processing
// that batch.
type Group struct {
	// groupId is the sequence number of the batch; splitToGroup assigns it
	// starting from 0.
	groupId int
	// urls holds the non-blank input lines that belong to this batch.
	urls []string
	// result is the tab-separated output line produced by groupProc.
	result string
	// err, when non-nil, makes main report the group as failed instead of
	// printing its result.
	err error
}

// skuNamesPattern matches a `"skuNames":[ ... ],` fragment. The bracket
// contents are matched non-greedily, so the match ends at the first `],`
// that follows the opening bracket. It is compiled once at package scope
// rather than on every call.
var skuNamesPattern = regexp.MustCompile(`"skuNames":\[(.*?)\],`)

// findTheTarget looks for the first `"skuNames":[...],` fragment in
// htmlSource.
//
// It returns a one-element slice holding the whole matched fragment
// (including the `"skuNames":` key and the trailing comma), or "NA" when the
// fragment is absent. The returned error is always nil and targetClass is
// currently unused; both are kept so existing callers keep compiling.
func findTheTarget(htmlSource, targetClass string) (error, []string) {
	found := skuNamesPattern.FindString(htmlSource)
	if found == "" {
		found = "NA"
	}
	return nil, []string{found}
}

// getHttpContent navigates a fresh chromedp context to url and returns the
// response body of the network request whose URL is exactly url.
//
// The whole operation, including the wait for the response to finish
// loading, is bounded by a 60 second timeout. On failure the returned error
// is wrapped with the step that failed and the byte slice is nil.
//
// As a side effect the fetched body is dumped to stdout between two banner
// lines.
func getHttpContent(url string) (error, []byte) {
	ctx, cancel := chromedp.NewContext(
		context.Background(),
		chromedp.WithLogf(log.Printf),
	)
	defer cancel()

	// Safety net: nothing below may outlive this deadline.
	ctx, cancelTimeout := context.WithTimeout(ctx, 60*time.Second)
	defer cancelTimeout()

	// done receives at most one "finished" signal. It is buffered and
	// written with a non-blocking send so that a repeated event can neither
	// block the listener nor panic (as a second close() would).
	done := make(chan struct{}, 1)

	// requestID identifies the request for url; it is needed both to filter
	// unrelated network events and to fetch the body afterwards.
	var requestID network.RequestID

	chromedp.ListenTarget(ctx, func(v any) {
		switch ev := v.(type) {
		case *network.EventRequestWillBeSent:
			// Remember only the first matching request so that a later
			// request to the same URL cannot replace the ID being awaited.
			if requestID == "" && ev.Request.URL == url {
				requestID = ev.RequestID
			}
		case *network.EventLoadingFinished:
			if requestID != "" && ev.RequestID == requestID {
				select {
				case done <- struct{}{}:
				default:
				}
			}
		}
	})

	if err := chromedp.Run(ctx, chromedp.Navigate(url)); err != nil {
		return fmt.Errorf("navigating to %q: %w", url, err), nil
	}

	// Wait for the response to finish loading, but give up when the context
	// expires: if no request ever matched url the signal never arrives.
	select {
	case <-done:
	case <-ctx.Done():
		return fmt.Errorf("waiting for %q to finish loading: %w", url, ctx.Err()), nil
	}

	var buf []byte
	if err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
		var err error
		buf, err = network.GetResponseBody(requestID).Do(ctx)
		return err
	})); err != nil {
		return fmt.Errorf("reading response body of %q: %w", url, err), nil
	}

	fmt.Println("=========================================================================")
	fmt.Println(string(buf))
	fmt.Println("=========================================================================")
	return nil, buf
}

// splitToGroup reads fileName, treats every non-blank line as one URL and
// distributes the URLs, in file order, into groups of at most lenPerGrp
// entries. Group IDs are assigned sequentially starting at 0.
//
// Both LF and CRLF line endings are accepted, surrounding whitespace is
// stripped from every URL, and a leading UTF-8 byte order mark is ignored.
//
// It returns an error when lenPerGrp is not positive or when the file cannot
// be read. A file without any URL yields a nil slice and a nil error.
func splitToGroup(fileName string, lenPerGrp int) (error, []*Group) {
	if lenPerGrp <= 0 {
		// Guards the modulo/division below against a zero divisor.
		return fmt.Errorf("invalid group size %d: must be positive", lenPerGrp), nil
	}
	content, err := ioutil.ReadFile(fileName)
	if err != nil {
		return err, nil
	}
	text := strings.TrimPrefix(string(content), "\ufeff")

	var (
		groups  []*Group
		current *Group
		count   int // number of URLs accepted so far
	)
	// Splitting on "\n" and trimming each line handles "\r\n" files as well,
	// because TrimSpace removes the trailing "\r".
	for _, line := range strings.Split(text, "\n") {
		u := strings.TrimSpace(line)
		if u == "" {
			continue
		}
		if count%lenPerGrp == 0 {
			current = &Group{groupId: count / lenPerGrp}
			groups = append(groups, current)
		}
		current.urls = append(current.urls, u)
		count++
	}
	return nil, groups
}

// groupProc fetches every URL of g, extracts the target value from each page
// and stores the tab-separated result line in g.result. The line starts with
// the group ID followed by one "[...]" column per URL.
//
// A URL whose fetch fails is retried up to maxRetries times with a one second
// pause between attempts. If every attempt fails, the column is filled with
// "[NA]" so that columns stay aligned with the input URLs.
//
// The result line is also sent on ch. That send blocks until the channel has
// buffer space or a receiver, so the caller must provide one of the two.
func groupProc(g *Group, ch chan string) {
	const maxRetries = 3

	fmt.Printf("group=%v start\n", g.groupId)

	result := make([]string, 0, len(g.urls)+1)
	result = append(result, strconv.Itoa(g.groupId))

	for _, url := range g.urls {
		var (
			content []byte
			err     error
		)
		for attempt := 1; attempt <= maxRetries; attempt++ {
			err, content = getHttpContent(url)
			if err == nil {
				break
			}
			fmt.Printf("error gethttpContent, err=%v, retry=%v\n", err, attempt)
			if attempt < maxRetries {
				time.Sleep(time.Second)
			}
		}
		if err != nil {
			// All attempts failed: keep the column, mark it as unavailable.
			result = append(result, "[NA]")
			continue
		}

		err, vals := findTheTarget(string(content), "sku-name")
		if err != nil {
			result = append(result, "[NA]")
			continue
		}
		if vals == nil {
			continue
		}
		result = append(result, "["+strings.Join(vals, ",")+"]")
	}

	g.result = strings.Join(result, "\t")
	fmt.Printf("group=%v ended\n", g.groupId)
	ch <- g.result
}

// pInputFileName points at the value of the -f command-line flag, which
// defaults to the empty string. The usage text is Chinese for "path of the
// file to process".
var pInputFileName = flag.String("f", "", "处理文件名路径")

// test is a manual debugging helper: it loads one hard-coded product page
// through chromedp, logs every request-sent and loading-finished network
// event, and dumps the body of the matching response to stdout between two
// banner lines. Any failure terminates the program via log.Fatal.
func test() {
	// urlstr := "https://www.lazada.com.my/products/-i2729779008-s12896741969.html"
	const urlstr = "https://www.lazada.com.my/products/-i2729787539-s19023684171.html"

	// The browser work runs inside a closure so that its deferred cancel
	// functions execute before log.Fatal exits the process; log.Fatal itself
	// does not run deferred calls.
	fetch := func() ([]byte, error) {
		ctx, cancel := chromedp.NewContext(
			context.Background(),
			chromedp.WithLogf(log.Printf),
		)
		defer cancel()

		// Safety net: nothing below may outlive this deadline.
		ctx, cancelTimeout := context.WithTimeout(ctx, 60*time.Second)
		defer cancelTimeout()

		// done receives at most one "finished" signal; the non-blocking send
		// keeps a repeated event from blocking the listener or panicking.
		done := make(chan struct{}, 1)

		// requestID identifies the request for urlstr; it filters unrelated
		// network events and is used to fetch the body afterwards.
		var requestID network.RequestID

		chromedp.ListenTarget(ctx, func(v any) {
			switch ev := v.(type) {
			case *network.EventRequestWillBeSent:
				log.Printf("EventRequestWillBeSent: %v: %v", ev.RequestID, ev.Request.URL)
				if requestID == "" && ev.Request.URL == urlstr {
					requestID = ev.RequestID
				}
			case *network.EventLoadingFinished:
				log.Printf("EventLoadingFinished: %v", ev.RequestID)
				if requestID != "" && ev.RequestID == requestID {
					select {
					case done <- struct{}{}:
					default:
					}
				}
			}
		})

		if err := chromedp.Run(ctx, chromedp.Navigate(urlstr)); err != nil {
			return nil, err
		}

		// Wait for the response to finish loading, but give up when the
		// context expires instead of blocking forever.
		select {
		case <-done:
		case <-ctx.Done():
			return nil, ctx.Err()
		}

		var buf []byte
		err := chromedp.Run(ctx, chromedp.ActionFunc(func(ctx context.Context) error {
			var err error
			buf, err = network.GetResponseBody(requestID).Do(ctx)
			return err
		}))
		if err != nil {
			return nil, err
		}
		return buf, nil
	}

	buf, err := fetch()
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("=========================================================================")
	fmt.Println(string(buf))
	fmt.Println("=========================================================================")
}

// main reads the URL list named by the -f flag, splits it into groups,
// processes the groups one after another and finally prints one result line
// per group. Without -f it prints the flag usage and returns.
func main() {
	//TODO TEST START
	// ff, _ := ioutil.ReadFile("test.html")
	// findTheTarget(string(ff), "")
	// // test()
	// if true {
	// 	return
	// }
	//TODO TEST END

	// urlsPerGroup is the number of URLs placed in each group.
	const urlsPerGroup = 2

	flag.Parse()
	fileName := *pInputFileName
	if fileName == "" {
		flag.Usage()
		return
	}

	err, grps := splitToGroup(fileName, urlsPerGroup)
	if err != nil {
		fmt.Println(err)
		return
	}

	// Make sure groups are handled and reported in ascending ID order.
	slices.SortFunc(grps, func(a, b *Group) int {
		switch {
		case a.groupId < b.groupId:
			return -1
		case a.groupId > b.groupId:
			return 1
		default:
			return 0
		}
	})

	// groupProc sends one value per group on ch and nothing receives from it
	// here, so the buffer must hold one entry per group or the send would
	// block forever.
	ch := make(chan string, len(grps))
	for _, grp := range grps {
		groupProc(grp, ch)
	}

	fmt.Println("===================RESULT=========================")
	for _, g := range grps {
		if g.err != nil {
			fmt.Printf("group=%v error=%v\n", g.groupId, g.err)
			continue
		}
		fmt.Println(g.result)
	}
}