nmnl/爬虫-1688商品详情数据

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
This repository has not declared an open-source license file (LICENSE); before using it, check the project description and the upstream dependencies of its code.
main.py 6.81 KB
gebinda committed on 2021-07-08 18:27 · Refactor
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from time import sleep
import os
import requests
import random
from config import url, proxy, is_window, sleep_time
from compiler import get_img_url
from download import download
from load_file import get_url_list
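
# config.py is not shown on this page. A minimal sketch of what it might contain,
# based only on the names imported above; the values are placeholders, not the
# author's actual settings:
#
#   # config.py
#   url = 'https://shopXXXX.1688.com/page/offerlist.htm'  # offer-list page of the target shop
#   proxy = 'http://your-proxy-api.example.com/get'       # proxy API expected to return "ip:port" as plain text
#   is_window = True                                      # whether to show the browser window
#   sleep_time = 5                                        # seconds to wait after each detail page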
# Example of a crawl target URL
# url = 'https://shop460bu04581k99.1688.com/page/offerlist.htm?spm=a261y.7663282.0.0.7b86ff8clcxtim'
# Parsed product data (one object per product)
urls = []
# Directory where downloaded data is stored
path = 'images'
# Consecutive error counter
count = 0


# Simple container for product detail data
class Obj:
    pass
# Build a Chrome browser instance
def get_browser():
    option = ChromeOptions()
    # Hide the "controlled by automated software" hints
    option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
    option.add_experimental_option('useAutomationExtension', False)
    # Do not load images, to speed up page loads
    option.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Proxy IP: the proxy API is expected to return an "ip:port" string as plain text
    BG_proxy = requests.get(proxy).text
    option.add_argument("--proxy-server=" + BG_proxy)
    # Whether to show the browser window
    if not is_window:
        option.add_argument('--headless')
        option.add_argument('--disable-gpu')
    browser = webdriver.Chrome(options=option)
    # Mask navigator.webdriver so the page cannot easily detect automation
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    return browser
# Scrape data from each product detail page
def get_itemPage(lis):
    global count
    # A fresh browser is created for every page so the proxy IP is rotated; could be optimized
    web = get_browser()
    web.get(lis[0])

    # Simulate scrolling down the page for a few seconds
    def swipe_down(second):
        for i in range(int(second / 0.1)):
            # Alternate the scroll position based on i to mimic scrolling up and down
            if i % 2 == 0:
                js = "var q=document.documentElement.scrollTop=" + str(300 + 400 * i)
            else:
                js = "var q=document.documentElement.scrollTop=" + str(200 * i)
            web.execute_script(js)
            sleep(0.1)
        js = "var q=document.documentElement.scrollTop=100000"
        web.execute_script(js)
        sleep(0.1)

    swipe_time = random.randint(1, 3)
    swipe_down(swipe_time)
    # Close the login dialog if it pops up
    if web.find_elements_by_xpath('//*[@id="sufei-dialog-close"]') != []:
        web.find_elements_by_xpath('//*[@id="sufei-dialog-close"]')[0].click()
    web.execute_script("window.scrollTo(0,10000)")
    sleep(2)
    # Lazy-loaded description container that carries the template URL
    lazyload_el = web.find_elements_by_xpath('//*[@id="desc-lazyload-container"]')
    if lazyload_el != []:
        # Extract the product attributes
        actives = web.find_elements_by_xpath('//*[@id="mod-detail-attributes"]/div')
        if len(actives) > 1:
            web.execute_script("arguments[0].click();", actives[1])
        tabs = web.find_elements_by_xpath('//*[@id="mod-detail-attributes"]/div[1]/table/tbody/tr/td')
        article_num = "暂无货号"  # placeholder meaning "no article number"
        for index, item in enumerate(tabs):
            if item.text == "货号":  # "货号" is the on-page label for the article number
                article_num = tabs[index + 1].text
                break
        lazyload_url = lazyload_el[0].get_attribute('data-tfs-url')
        # Store the parsed data in the global list
        imgObj = Obj()
        imgObj.name = article_num
        print("Current product article number: " + article_num)
        imgObj.url = lazyload_url
        urls.append(imgObj)
        # Wait before moving on
        sleep(sleep_time)
        lis.pop(0)
        web.quit()
        count = 0
    else:
        count += 1
        web.quit()
        if count > 3:
            print("Hit a verification page, stopping and processing what has been collected so far")
            return
        print("Could not find the element, reloading the page (attempt " + str(count) + ")")
    # Process the next page, or retry the current one after a failure
    if lis:
        get_itemPage(lis)
# Load the list of page URLs and start scraping
def get_urls():
    # Local list of page URLs to visit (distinct from the global `urls` that holds parsed results)
    page_urls = get_url_list()
    if not is_window:
        print("The program is running, please do not close this window")
    else:
        print("Opening the browser...")
        print("Please do not operate the automatically opened browser manually, to avoid errors!")
    get_itemPage(page_urls)


# Start the program
get_urls()
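
# load_file.get_url_list() is defined in a separate module that is not shown on this
# page. A minimal sketch of one plausible implementation, assuming the page URLs are
# kept one per line in a local text file (the file name url_list.txt is an assumption):
#
#   # load_file.py
#   def get_url_list():
#       with open('url_list.txt', 'r', encoding='utf-8') as f:
#           return [line.strip() for line in f if line.strip()]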
# Walk through each collected template and resolve the image URLs
for i, item in enumerate(urls):
    get_img_url(item)
print("All image data parsed, starting the downloads......")
# Create the top-level output directory
if not os.path.isdir(path):
    os.mkdir(path)
    print("Created folder " + path + " successfully!")
# Download the images
for el in urls:
    for index, _url in enumerate(el.imgUrls):
        download(el.name, _url, index)
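
# download.download() is also defined in a module that is not shown on this page.
# A minimal sketch of what it plausibly does, given the call download(el.name, _url, index)
# and the images/ directory created above (the per-product sub-folder layout and the
# .jpg extension are assumptions):
#
#   # download.py
#   import os
#   import requests
#
#   def download(name, img_url, index):
#       folder = os.path.join('images', name)
#       if not os.path.isdir(folder):
#           os.makedirs(folder)
#       resp = requests.get(img_url)
#       with open(os.path.join(folder, str(index) + '.jpg'), 'wb') as f:
#           f.write(resp.content)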
# Example of template data addresses (image URLs)
# ['https://itemcdn.tmall.com/1688offer/icoss2216880252cb747993ec73eb32', 'https://itemcdn.tmall.com/1688offer/icoss28999643886075bc1c8fe72434', 'https://itemcdn.tmall.com/1688offer/icoss3381449399f66ead444e6e1ad1', 'https://itemcdn.tmall.com/1688offer/icoss2638187564e65eb75c23721b7b', 'https://itemcdn.tmall.com/1688offer/icoss1457411858d46bb1245e5d66c2', 'https://itemcdn.tmall.com/1688offer/icoss3220729198ca8b3a2436961ca0', 'https://itemcdn.tmall.com/1688offer/icoss14399905038be7c4a900aee6f8', 'https://itemcdn.tmall.com/1688offer/icoss67232252651f006a9247ea4b5', 'https://itemcdn.tmall.com/1688offer/icoss1270387875ab9c7da121638cdf', 'https://itemcdn.tmall.com/1688offer/icoss1558338297bc88656551d8443e', 'https://itemcdn.tmall.com/1688offer/icoss26276137513d44a2e9218457d9', 'https://itemcdn.tmall.com/1688offer/icoss10620209258b1e64e1b8ead6f8', 'https://itemcdn.tmall.com/1688offer/icoss1740101800c91fa4cdb8cdb847', 'https://itemcdn.tmall.com/1688offer/icoss37651667727b3f9b5a3892a5da', 'https://itemcdn.tmall.com/1688offer/icoss28270765dbf4c4a2011110ce', 'https://itemcdn.tmall.com/1688offer/icoss29783720005245faf526e3e32e', 'https://itemcdn.tmall.com/1688offer/icoss390997959601111c5417918ff5', 'https://itemcdn.tmall.com/1688offer/icoss15075652157be5ff87fabaf5e1', 'https://itemcdn.tmall.com/1688offer/icoss67574882333b8eda369f291d2', 'https://itemcdn.tmall.com/1688offer/icoss12517867162802246a1f45b4b6', 'https://itemcdn.tmall.com/1688offer/icoss359044238ea80ca6100c19507', 'https://itemcdn.tmall.com/1688offer/icoss202385300738750953d60cee31', 'https://itemcdn.tmall.com/1688offer/icoss8294834377299bd186c3aeb1b', 'https://itemcdn.tmall.com/1688offer/icoss3417803030598e641b2529d29e', 'https://itemcdn.tmall.com/1688offer/icoss666852499152d395858704e90', 'https://itemcdn.tmall.com/1688offer/icoss2696436235871ac44c779c425c', 'https://itemcdn.tmall.com/1688offer/icoss1884443357340e9d27495703b3', 'https://itemcdn.tmall.com/1688offer/icoss379745107886c4a0fb918c04c3', 'https://itemcdn.tmall.com/1688offer/icoss21175014038965ac5ec72697cd', 'https://itemcdn.tmall.com/1688offer/icoss449250950cf4c8c6f5c9f6515', 'https://itemcdn.tmall.com/1688offer/icoss323599495522b4bd157573fcdf', 'https://itemcdn.tmall.com/1688offer/icoss846029247d0c1afb54288e5a']
# Alternative way to start the Chrome driver through an explicit Service (unused example):
# driver_service = Service('chromedriver.exe')  # fill in the driver path inside the parentheses
# driver_service.command_line_args()
# driver_service.start()
# driver = webdriver.Chrome()
# driver.get("http://www.baidu.com")
# driver.quit()
# driver_service.stop()
https://gitee.com/nmnl/crawler-1688-product-details.git
git@gitee.com:nmnl/crawler-1688-product-details.git