
ayuliao/CrawlerCode

This repository has not declared an open-source license file (LICENSE); check the project description and the code's upstream dependencies before using it.
3.提速爬虫-协程.py 2.03 KB
二两的分身 committed on 2021-05-26 12:35 · code
import asyncio
import time
from functools import wraps
import os
import requests  # blocking; aiohttp is the non-blocking alternative (see the sketch at the end)
from bs4 import BeautifulSoup
import pymongo
url = "http://47.103.13.124:8001/base"
urls = [url] * 10
cookies = {
'session': '.eJyrViotTi1SsqpWyiyOT0zJzcxTsjLQUcrJTwexSopKU3WUcvOTMnNSlayUDM3gQEkHrDE-M0XJyhjCzkvMBSmKKTVNMjMDkiamFkq1tQDfeR3n.YKXKWQ.QcA_zWfTFZFGlGik_5milrY3gRA'
}
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.crawler
collection = db.demo1
def run_time(func):
    """Decorator that reports how long the wrapped function takes."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()  # start time
        r = func(*args, **kwargs)
        end = time.perf_counter()  # end time
        print(f'PID: {os.getpid()} {func.__module__}.{func.__name__}: {end - start:.2f}s')
        return r
    return wrapper
async def crawler(_url):
    # requests.get is a blocking call, so each task's network time still
    # adds up serially (0.1 + 0.1 + 0.1 = 0.3); see the aiohttp sketch
    # at the end for a version where the request itself is awaitable
    r = requests.get(_url, cookies=cookies)
    soup = BeautifulSoup(r.text, 'lxml')
    movie_list = soup.find('div', class_='movie-list').find_all('a')
    datas = []
    for movie in movie_list:
        img_url = movie.find('img').attrs.get('src')
        title = movie.find('h5').get_text()
        desc = movie.find('p').get_text()
        score = movie.find('small').get_text()
        desc2 = movie.find('small').find_next_sibling('small').get_text()
        datas.append({
            'img_url': img_url,
            'title': title,
            'desc': desc,
            'score': score,
            'desc2': desc2
        })
    # Yield control to the event loop: this task is suspended here and
    # resumes once the sleep completes. A blocking time.sleep(1) would
    # instead stall the whole loop and serialize every task.
    await asyncio.sleep(1)
    # Insert the scraped data into MongoDB
    collection.insert_many(datas)
# async def main():
#     for _url in urls:
#         # Awaiting a single task inside the loop still runs everything serially
#         await crawler(_url)
async def main():
    tasks = []
    for _url in urls:
        tasks.append(asyncio.create_task(crawler(_url)))
    await asyncio.wait(tasks)
@run_time
def run():
    asyncio.run(main())

run()
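
The fix above is easy to verify with a timer: ten tasks that block with time.sleep(1) take about ten seconds in total, while ten tasks that await asyncio.sleep(1) overlap and finish in about one. A minimal, standard-library-only sketch (the function names here are illustrative, not part of the original script):

# --- Timing demo: blocking vs. cooperative sleep (standard library only) ---
import asyncio
import time

async def blocking_job():
    time.sleep(1)  # blocks the event loop; no other task can run meanwhile

async def cooperative_job():
    await asyncio.sleep(1)  # suspends this task; the loop runs the others

async def measure(job):
    start = time.perf_counter()
    await asyncio.gather(*(job() for _ in range(10)))
    print(f'{job.__name__}: {time.perf_counter() - start:.2f}s')

async def demo():
    await measure(blocking_job)     # ~10s: the sleeps run one after another
    await measure(cooperative_job)  # ~1s: the sleeps overlap

asyncio.run(demo())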
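
The import comment already points at aiohttp as the non-blocking replacement for requests. The sketch below shows what a fully asynchronous version of the same crawler could look like; it assumes the same page structure and session cookie as above, that aiohttp is installed (pip install aiohttp), and Python 3.9+ for asyncio.to_thread, which keeps the blocking pymongo insert off the event loop:

# --- Sketch: fully non-blocking crawler with aiohttp ---
import asyncio

import aiohttp
import pymongo
from bs4 import BeautifulSoup

url = "http://47.103.13.124:8001/base"
urls = [url] * 10
cookies = {'session': '...'}  # same session cookie as in the script above

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client.crawler.demo1

async def crawler(session, _url):
    # Non-blocking request: the task is suspended while waiting on the network,
    # so all ten downloads genuinely overlap
    async with session.get(_url) as resp:
        html = await resp.text()
    soup = BeautifulSoup(html, 'lxml')
    datas = []
    for movie in soup.find('div', class_='movie-list').find_all('a'):
        datas.append({
            'img_url': movie.find('img').attrs.get('src'),
            'title': movie.find('h5').get_text(),
            'desc': movie.find('p').get_text(),
            'score': movie.find('small').get_text(),
            'desc2': movie.find('small').find_next_sibling('small').get_text(),
        })
    # pymongo is blocking, so hand the insert to a worker thread (Python 3.9+)
    await asyncio.to_thread(collection.insert_many, datas)

async def main():
    # One shared session reuses connections across all tasks
    async with aiohttp.ClientSession(cookies=cookies) as session:
        await asyncio.gather(*(crawler(session, _url) for _url in urls))

asyncio.run(main())

With the request itself awaitable, the speedup no longer depends on the 1-second sleeps alone: the network waits overlap too, which is the point the file's title promises.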