1 Star 1 Fork 0

茗趣yhj/多线程爬取王者荣耀高清壁纸

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
main.py 6.69 KB
一键复制 编辑 原始数据 按行查看 历史
茗趣yhj 提交于 2021-07-18 05:36 . 重命名 demo.py 为 main.py
# -*- coding: utf-8 -*-
"""
@File Name: main.py
@Author: name
@Mail: name@qq.com
@Created Time: 2021/7/18 10:43
@Description: 王者荣耀高清壁纸下载
"""
import os
import yaml
import time
import logging
import requests
import threading
from queue import Queue
from urllib import parse, request
from logging.handlers import RotatingFileHandler
# HTTP headers attached to the wallpaper-list requests issued in this file:
# a desktop Chrome user-agent and the wallpaper page as referer.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    "referer": "https://pvp.qq.com/web201605/wallpaper.shtml",
}
def exact_url(data):
    """Extract the image URLs of one wallpaper record.

    Reads the eight percent-encoded fields ``sProdImgNo_1`` ..
    ``sProdImgNo_8`` of *data*, decodes each one and rewrites a trailing
    ``/200`` path segment to ``/0`` (presumably the size selector: ``200``
    for the thumbnail, ``0`` for the original image -- confirm against the
    API).

    Only the trailing segment is rewritten. A blanket
    ``replace('200', '0')`` would also rewrite any other "200" inside the
    URL, e.g. a file name containing "2004", and produce a broken link.

    Args:
        data: mapping that provides the ``sProdImgNo_<n>`` keys for n in 1..8.

    Returns:
        A list with the eight decoded URLs, in field order.

    Raises:
        KeyError: if one of the eight fields is missing from *data*.
    """
    thumb_suffix = '/200'
    image_url_list = []
    for i in range(1, 9):
        image_url = parse.unquote(data['sProdImgNo_{}'.format(i)])
        if image_url.endswith(thumb_suffix):
            image_url = image_url[:-len(thumb_suffix)] + '/0'
        image_url_list.append(image_url)
    return image_url_list
# 单线程下载
class MyDaemon(object):
    """Single-threaded downloader for one page of the wallpaper list.

    ``run`` loads ``config.yaml``, configures logging, requests the list
    page stored in ``self.url`` and saves every image of every listed
    wallpaper below ``../data/imgs/<name>/``.
    """

    def __init__(self):
        self.headers = headers
        # The request captured from the browser also carried a
        # ``jsoncallback=jQuery...`` parameter; per the original author's
        # note, dropping it makes the endpoint answer with plain JSON.
        self.url = 'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page=1&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1626576287149'

    def send_request(self):
        """Request the wallpaper list page and return its decoded JSON body."""
        resp = requests.get(self.url, headers=self.headers)
        return resp.json()

    def parse_json(self, json_data):
        """Map each wallpaper name to its image URLs, then download them.

        Args:
            json_data: decoded API response. Its ``List`` entry is iterated;
                every item must provide ``sProdName`` and the fields read
                by ``exact_url``.
        """
        data_dict = {}
        for data in json_data['List']:
            image_url_list = exact_url(data)
            data_dict[parse.unquote(data['sProdName'])] = image_url_list
        # Save the images.
        self.save_pic(data_dict)

    def save_pic(self, data):
        """Download every image in *data* into one directory per wallpaper.

        The directory is ``../data/imgs/<name>/`` (relative to the current
        working directory), where ``<name>`` is the key with surrounding
        spaces stripped and ``1:1`` removed. Files are named ``1.jpg``,
        ``2.jpg``, ... in list order.

        Args:
            data: mapping of wallpaper name to a list of image URLs.
        """
        for key in data:
            dirpath = '../data/imgs/{}/'.format(key.strip(' ').replace('1:1', ''))
            # makedirs(exist_ok=True) creates missing parent directories and
            # tolerates re-runs; a plain os.mkdir raised in both situations.
            os.makedirs(dirpath, exist_ok=True)
            for index, image_url in enumerate(data[key]):
                request.urlretrieve(image_url, dirpath + '{}.jpg'.format(index + 1))
                print('{}下载完毕'.format(image_url))

    def run(self):
        """Load the configuration, set up logging, then fetch and download."""
        base_dir = os.path.dirname(os.path.abspath(__file__))

        # If config.yaml does not exist, assign self.config by hand (or drop
        # this step), e.g.
        # {'mysql': {'host': '', 'port': 3306, 'user': '', 'password': '', 'database': ''}, ...}
        # The with-statement closes the file even if parsing fails.
        with open(base_dir + '/config.yaml', encoding='utf-8') as config_file:
            self.config = yaml.safe_load(config_file)

        # ---- logging setup (comment this section out if it raises) ----
        name = 'lianjie_house'
        logging.basicConfig(level=logging.INFO)
        # The handler opens its file on construction, so the directory must
        # exist beforehand.
        os.makedirs(base_dir + '/../logs', exist_ok=True)
        handler = RotatingFileHandler(base_dir + '/../logs/%s.log' % name,
                                      maxBytes=134217728, backupCount=7)  # 128 MiB per file
        formatter = logging.Formatter('%(asctime)s - %(lineno)d- %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logging.getLogger().addHandler(handler)
        requests_log = logging.getLogger("requests.packages.urllib3")
        requests_log.setLevel(logging.ERROR)
        # ---- end of logging setup ----

        logging.warning('启动 [%s]', name)
        logging.warning('主线程 PID [%s]', os.getpid())
        result = self.send_request()
        self.parse_json(result)
# 生产者线程
class Producer(threading.Thread):
    """Producer thread: turns list-page URLs into image download jobs.

    Takes page URLs from ``page_queue``, requests each page, and for every
    wallpaper on it puts one ``{"image_path": ..., "image_url": ...}`` dict
    per image onto ``image_url_queue``. The target directory
    ``../data/imgs/<name>/`` is created before its jobs are queued.
    """

    def __init__(self, page_queue, image_url_queue):
        super().__init__()  # run the Thread initialiser
        self.page_queue = page_queue
        self.image_url_queue = image_url_queue

    def run(self):
        """Process page URLs until ``page_queue`` has none left."""
        from queue import Empty  # the file-level import only brings in Queue

        while True:
            # A non-blocking get avoids the empty()/get() race: with several
            # producers, another thread can take the last URL between the
            # emptiness check and a blocking get(), which then never returns.
            try:
                page_url = self.page_queue.get_nowait()
            except Empty:
                break

            resp = requests.get(page_url, headers=headers)
            json_data = resp.json()

            data_dict = {}
            for data in json_data['List']:
                image_url_list = exact_url(data)
                data_dict[parse.unquote(data['sProdName'])] = image_url_list

            for key, image_url_list in data_dict.items():
                dirpath = '../data/imgs/{}/'.format(key.strip(' ').replace('1:1', ''))
                # exist_ok=True is safe when two threads create the same
                # directory, and missing parent directories are created too.
                os.makedirs(dirpath, exist_ok=True)
                for index, image_url in enumerate(image_url_list):
                    # Queue one download job per image.
                    self.image_url_queue.put({
                        "image_path": dirpath + '{}.jpg'.format(index + 1),
                        'image_url': image_url,
                    })
# 消费者线程
class Customer(threading.Thread):
    """Consumer thread: downloads the images described by queued jobs.

    Each job is a dict with the keys ``image_url`` and ``image_path``.

    Args:
        image_url_queue: queue the jobs are taken from.
        timeout: seconds to wait for a job before the thread gives up and
            exits (default 10, the previously hard-coded value).
        delay: seconds to pause after each download attempt (default 1,
            the previously hard-coded value).
    """

    def __init__(self, image_url_queue, timeout=10, delay=1):
        super().__init__()  # run the Thread initialiser
        self.image_url_queue = image_url_queue
        self.timeout = timeout
        self.delay = delay

    def run(self):
        """Download queued images until no job arrives within ``timeout``."""
        from queue import Empty  # the file-level import only brings in Queue

        while True:
            try:
                image_data = self.image_url_queue.get(timeout=self.timeout)
            except Empty as e:
                # No job arrived in time: report it and stop this consumer.
                print('消费者_{}取数据报错:{}'.format(self.name, str(e)))
                break

            try:
                request.urlretrieve(image_data['image_url'], image_data['image_path'])
            except Exception as e:
                # Worker boundary: a single failed download is reported and
                # skipped instead of ending the thread, so the remaining
                # jobs are still processed.
                print('消费者_{}下载失败 {}:{}'.format(self.name, image_data, e))
            finally:
                time.sleep(self.delay)
# 启动线程
def start():
    """Fill the page queue and launch the producer and consumer threads.

    Queues the list-page URLs for pages 0..25, then creates and starts
    five ``Producer`` threads followed by five ``Customer`` threads. The
    threads are not joined here.
    """
    page_total = 26
    worker_total = 5

    page_queue = Queue(page_total)
    image_url_queue = Queue(1000)

    page_no = 0
    while page_no < page_total:
        page_queue.put(
            f'https://apps.game.qq.com/cgi-bin/ams/module/ishow/V1.0/query/workList_inc.cgi?activityId=2735&sVerifyCode=ABCD&sDataType=JSON&iListNum=20&totalpage=0&page={page_no}&iOrder=0&iSortNumClose=1&iAMSActivityId=51991&_everyRead=true&iTypeId=2&iFlowId=267733&iActId=2735&iModuleId=2735&_=1626576287149'
        )
        page_no += 1

    # Producer threads: each one is started right after it is created.
    for _ in range(worker_total):
        Producer(page_queue, image_url_queue).start()

    # Consumer threads.
    for _ in range(worker_total):
        Customer(image_url_queue).start()
if __name__ == '__main__':
    # Script entry point: run the multi-threaded downloader.
    start()
    # Single-threaded alternative (disabled):
    # my_daemon = MyDaemon()
    # my_daemon.run()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/yhjkcjm/mult_thread_pic.git
git@gitee.com:yhjkcjm/mult_thread_pic.git
yhjkcjm
mult_thread_pic
多线程爬取王者荣耀高清壁纸
master

搜索帮助