代码拉取完成,页面将自动刷新
同步操作将从 bxqtee/learn_python3_spider 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
# -*- coding: utf-8 -*-
import concurrent
import os
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
def header(referer):
    """Build the HTTP request headers sent when fetching an image.

    Args:
        referer: Value for the ``Referer`` header. It is rendered through
            ``format()``, so non-string values are converted to text.

    Returns:
        A freshly created dict mapping header names to their values.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    )
    # Fixed browser-like fields first; the caller-supplied Referer goes last
    # so the key order matches the literal layout of the header set.
    request_headers = dict((
        ('Host', 'i.meizitu.net'),
        ('Pragma', 'no-cache'),
        ('Accept-Encoding', 'gzip, deflate'),
        ('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6'),
        ('Cache-Control', 'no-cache'),
        ('Connection', 'keep-alive'),
        ('User-Agent', user_agent),
        ('Accept', 'image/webp,image/apng,image/*,*/*;q=0.8'),
    ))
    request_headers['Referer'] = format(referer)
    return request_headers
def request_page(url, timeout=10):
    """Fetch *url* with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response text when the status code is 200. ``None`` when the
        request raises ``requests.RequestException`` or the server answers
        with any other status code.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text
def get_page_urls(start=1, stop=2):
    """Collect the album links found on the site's listing pages.

    Args:
        start: First listing page number to visit (inclusive).
        stop: Listing page number to stop at (exclusive), as in ``range``.
            The defaults visit page 1 only.

    Returns:
        A list with the ``href`` of every album link found, accumulated
        across all visited pages. Pages that cannot be fetched or that lack
        the expected ``postlist`` element contribute nothing.
    """
    # The accumulator lives outside the page loop so that links from every
    # page are kept, not just those from the last page visited.
    urls = []
    for page in range(start, stop):
        baseurl = 'https://www.mzitu.com/page/{}'.format(page)
        html = request_page(baseurl)
        if html is None:
            # request_page signals a failed or non-200 request with None.
            continue
        soup = BeautifulSoup(html, 'lxml')
        postlist = soup.find(class_='postlist')
        if postlist is None:
            continue
        for item in postlist.find_all('li'):
            span = item.find('span')
            link = span.find('a') if span is not None else None
            url = link.get('href') if link is not None else None
            if not url:
                # Entry without the span > a[href] structure; nothing to queue.
                continue
            print('页面链接:%s' % url)
            urls.append(url)
    return urls
def download_Pic(title, image_list, timeout=30):
    """Download every image in *image_list* into a directory named *title*.

    Images are saved as ``1.jpg``, ``2.jpg``, ... according to their position
    in *image_list*. An image that cannot be fetched is reported and skipped,
    so its number is simply missing from the directory.

    Args:
        title: Directory to store the images in. It is created when missing
            and reused when it already exists.
        image_list: Iterable of image URLs.
        timeout: Seconds to wait for each image request.
    """
    # exist_ok lets a re-run reuse the folder instead of raising
    # FileExistsError.
    os.makedirs(title, exist_ok=True)
    for index, image_url in enumerate(image_list, start=1):
        filename = os.path.join(title, '%s.jpg' % index)
        print('downloading....%s : NO.%s' % (title, index))
        # Fetch before opening the file so a failed request does not leave
        # an empty .jpg behind.
        try:
            response = requests.get(
                image_url, headers=header(image_url), timeout=timeout)
        except requests.RequestException as exc:
            print('download failed....%s : %s' % (image_url, exc))
            continue
        if response.status_code != 200:
            print('download failed....%s : HTTP %s'
                  % (image_url, response.status_code))
            continue
        with open(filename, 'wb') as f:
            f.write(response.content)
def download(url):
    """Download all images of the album located at *url*.

    The album's first page supplies the page count (read from the
    ``pagenavi`` element) and the title (the first ``h2``). Each numbered
    sub-page ``url/1`` .. ``url/<total>`` is then fetched, the ``src`` of its
    first ``img`` tag is collected, and the list is handed to
    ``download_Pic``.

    Args:
        url: Address of the album's first page.

    Returns:
        None. The album is skipped with a printed message when its first
        page cannot be fetched or does not have the expected layout.
    """
    html = request_page(url)
    if html is None:
        # request_page signals a failed or non-200 request with None.
        print('request failed, skipped: %s' % url)
        return
    soup = BeautifulSoup(html, 'lxml')
    try:
        # The second-to-last pagination link carries the total page count.
        total = int(
            soup.find(class_='pagenavi').find_all('a')[-2].find('span').string)
        title = soup.find('h2').string
    except (AttributeError, IndexError, TypeError, ValueError):
        print('unexpected page layout, skipped: %s' % url)
        return
    if not title:
        # Without a title there is no directory name to save into.
        print('unexpected page layout, skipped: %s' % url)
        return

    image_list = []
    for page_number in range(1, total + 1):
        page_html = request_page(url + '/%s' % page_number)
        if page_html is None:
            continue
        img = BeautifulSoup(page_html, 'lxml').find('img')
        img_url = img.get('src') if img is not None else None
        if img_url:
            image_list.append(img_url)
    download_Pic(title, image_list)
def download_all_images(list_page_urls, max_workers=5):
    """Download every album in *list_page_urls* using a pool of processes.

    Args:
        list_page_urls: Iterable of album page URLs; each one is passed to
            ``download`` in a worker process.
        max_workers: Maximum number of worker processes.

    Returns:
        None. An album whose download raises is reported on stdout and does
        not stop the remaining albums.
    """
    # NOTE(review): the work is network requests, so a thread pool would
    # probably serve equally well; the process pool is kept as-is.
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=max_workers) as executor:
        pending = {
            executor.submit(download, url): url for url in list_page_urls
        }
        # Inspect every future: an exception raised inside a worker is stored
        # on the future and would otherwise vanish silently.
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print('download failed....%s : %s' % (pending[future], error))
if __name__ == '__main__':
    # Gather the album links from the listing page(s), then download the
    # images of every album found.
    download_all_images(get_page_urls())
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。