import asyncio
import os
import time
from functools import wraps

import requests  # blocking HTTP client; aiohttp would be the async alternative
from bs4 import BeautifulSoup
import pymongo

url = "http://47.103.13.124:8001/base"
urls = [url] * 10
cookies = {
    'session': '.eJyrViotTi1SsqpWyiyOT0zJzcxTsjLQUcrJTwexSopKU3WUcvOTMnNSlayUDM3gQEkHrDE-M0XJyhjCzkvMBSmKKTVNMjMDkiamFkq1tQDfeR3n.YKXKWQ.QcA_zWfTFZFGlGik_5milrY3gRA'
}

# MongoDB connection: database `crawler`, collection `demo1`
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.crawler
collection = db.demo1
def run_time(func):
    """Decorator that prints how long the wrapped function took to run."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()  # start time
        r = func(*args, **kwargs)
        end = time.perf_counter()    # end time
        print(f'PID: {os.getpid()} {func.__module__}.{func.__name__}: {end - start}')
        return r
    return wrapper
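# Usage sketch for the decorator (illustrative only; `slow` is a made-up name):
#
# @run_time
# def slow():
#     time.sleep(0.5)
#
# slow()  # prints something like: PID: 12345 __main__.slow: 0.50...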
async def crawler(_url):
    # requests.get is a blocking call: it never yields to the event loop,
    # so the per-request times add up serially, e.g. 0.1 + 0.1 + 0.1 = 0.3
    r = requests.get(_url, cookies=cookies)
    soup = BeautifulSoup(r.text, 'lxml')
    movie_list = soup.find('div', class_='movie-list').find_all('a')
    datas = []
    for movie in movie_list:
        img_url = movie.find('img').attrs.get('src')
        title = movie.find('h5').get_text()
        desc = movie.find('p').get_text()
        score = movie.find('small').get_text()
        desc2 = movie.find('small').find_next_sibling('small').get_text()
        datas.append({
            'img_url': img_url,
            'title': title,
            'desc': desc,
            'score': score,
            'desc2': desc2
        })
    # `await asyncio.sleep(1)` would hand control back to the event loop and
    # suspend this task until the sleep finishes; time.sleep(1) blocks instead
    time.sleep(1)
    # await asyncio.sleep(1)
    # insert the scraped records into MongoDB
    collection.insert_many(datas)
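# The hints above (`# aiohttp`, `# await asyncio.sleep(1)`) point at a truly
# non-blocking variant. A minimal sketch, assuming `aiohttp` is installed
# (`pip install aiohttp`); `crawler_async` is a hypothetical name, and the
# parsing logic is unchanged from `crawler`:
#
# import aiohttp
#
# async def crawler_async(_url):
#     async with aiohttp.ClientSession(cookies=cookies) as session:
#         async with session.get(_url) as resp:
#             html = await resp.text()
#     soup = BeautifulSoup(html, 'lxml')
#     movie_list = soup.find('div', class_='movie-list').find_all('a')
#     datas = []
#     for movie in movie_list:
#         datas.append({
#             'img_url': movie.find('img').attrs.get('src'),
#             'title': movie.find('h5').get_text(),
#             'desc': movie.find('p').get_text(),
#             'score': movie.find('small').get_text(),
#             'desc2': movie.find('small').find_next_sibling('small').get_text(),
#         })
#     await asyncio.sleep(1)  # suspends this task and yields to the event loop
#     # pymongo is still blocking; hand the insert to a worker thread (Python 3.9+)
#     await asyncio.to_thread(collection.insert_many, datas)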
# async def main():
#     for _url in urls:
#         # with a single task awaited in a loop, execution is serial
#         await crawler(_url)
async def main():
    tasks = []
    for _url in urls:
        tasks.append(asyncio.create_task(crawler(_url)))
    await asyncio.wait(tasks)
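# `asyncio.gather` is an equivalent way to wait for all tasks; it also returns
# their results in order and propagates exceptions directly:
#
# await asyncio.gather(*tasks)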
@run_time
def run():
    asyncio.run(main())

run()
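# Note: because requests.get and time.sleep block, the ten tasks above still
# run one after another (each call monopolizes the event loop), so the total
# is roughly 10 x (request time + 1 s). With the aiohttp/asyncio.sleep sketch,
# the waits overlap and the whole run takes about one iteration's latency.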