spider.py (yrd/python博客小站)
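This module scrapes a Sohu MP article list page: for each article it collects the cover image, title, keyword, and summary from the list page, fetches the body and publish date from the detail page, fills in randomized view and comment counts, and stores the result through db_insert_article.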
import random

import requests
from bs4 import BeautifulSoup

from app.db.db import Article, db_insert_article

s = requests.Session()  # Session object, so cookies persist across requests

ARTICLE_FILE_PATH = 'D:\\studyProject\\python\\blog\\article.txt'  # file holding the article list
TEMP_FILE_PATH = 'D:\\studyProject\\python\\blog\\temp.txt'  # file holding a single article

HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/87.0.4280.141 Safari/537.36"
}
def get_article_pic(pic_elements, index):
    """
    Return the cover-image URL of the article at the given index.
    :param pic_elements: <img> tags scraped from the list page
    :param index: position of the article in the list
    :return: absolute URL of the cover image, or None if the index is out of range
    """
    for i, pic in enumerate(pic_elements):
        if i == index:
            return 'https:' + pic.get('original')
    return None
def get_article_title(title_elements, index):
    """
    Return the title text of the article at the given index.
    :param title_elements: <a> tags scraped from the list page
    :param index: position of the article in the list
    :return: stripped title string, or None if the index is out of range
    """
    for i, title in enumerate(title_elements):
        if i == index:
            return title.string.strip()
    return None
def get_article_keyword(keyword_elements, index):
    """
    Return the keyword (tag) of the article at the given index. The tag text
    sits two siblings after the icon element.
    :param keyword_elements: <i> icon tags scraped from the list page
    :param index: position of the article in the list
    :return: stripped keyword string, or None if the index is out of range
    """
    for i, keyword in enumerate(keyword_elements):
        if i == index:
            return keyword.next_sibling.next_sibling.string.strip()
    return None
def get_comment_view(vc):
    """
    Generate a fake engagement count: a view count when vc == 0,
    otherwise a comment count.
    :param vc: 0 to generate views, any other value to generate comments
    :return: random view count (200-1000) or comment count (0-5)
    """
    if vc == 0:
        return random.randint(200, 1000)
    return random.randint(0, 5)
def get_article_content(link):
    """
    Fetch an article page and extract its publish date and body.
    :param link: absolute URL of the article page
    :return: an Article object with content and addtime filled in
    """
    r = s.get(link, headers=HEADERS)
    r.raise_for_status()
    r.encoding = 'utf-8'
    a_soup = BeautifulSoup(r.text, 'lxml')
    article_content = a_soup.find('article')
    article_time = a_soup.find('span', class_='time').string.split(' ')[0]
    article = Article()
    article.content = str(article_content)
    article.addtime = article_time
    return article
def get_article_desc(desc_elements, index):
    """
    Return the summary text of the article at the given index.
    :param desc_elements: <p> tags scraped from the list page
    :param index: position of the article in the list
    :return: summary with newlines removed, or None if the index is out of range
    """
    for i, desc in enumerate(desc_elements):
        if i == index:
            return desc.string.replace("\n", "").strip()
    return None
def storage_article_text(url):
    """
    Parse an article list page and store the articles it links to.
    :param url: URL of the list page, e.g. a Sohu MP profile page such as
        https://mp.sohu.com/profile?xpt=emhhbmhsZXBAc29odS5jb20=&_f=index_pagemp_1&spm=smpc.content.author.1.1610841891253tbFNvG2
    :return:
    """
    r = s.get(url, headers=HEADERS)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')
    pic_list = soup.find_all('img', class_='cover-pic')  # article cover images
    title_list = soup.find_all('a', attrs={'data-spm-type': 'content'})  # article title links
    keyword_list = soup.find_all('i', class_='mp-iconqietu-biaoqian')  # article keyword icons
    desc_list = soup.find_all('p', class_='feed-brief')  # article summaries
    for index, link in enumerate(title_list):
        link_content = 'https:' + link['href']
        article = get_article_content(link_content)
        article.pic = get_article_pic(pic_list, index)
        article.title = get_article_title(title_list, index)
        article.keyword = get_article_keyword(keyword_list, index)
        article.desc = get_article_desc(desc_list, index)
        article.views = get_comment_view(0)  # fake view count
        article.comments = get_comment_view(1)  # fake comment count
        article.source = 'https://www.sohu.com'
        article.ischeck = 1  # mark as approved
        article.istop = 0  # not pinned
        db_insert_article(article)
        if index >= 9:  # insert at most ten records per run
            break
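For reference, a minimal sketch of how the crawler could be run. The profile URL is the commented-out example preserved in the docstring above; whether it still resolves is an assumption, since the spm parameter looks session-specific.

if __name__ == '__main__':
    # Example list page (the commented-out URL from the original source);
    # assumed to still be reachable.
    list_url = ('https://mp.sohu.com/profile?xpt=emhhbmhsZXBAc29odS5jb20='
                '&_f=index_pagemp_1'
                '&spm=smpc.content.author.1.1610841891253tbFNvG2')
    storage_article_text(list_url)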