1 Star 0 Fork 0

codesman/discuss

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
dicuss_search.py 3.66 KB
一键复制 编辑 原始数据 按行查看 历史
zzz 提交于 2020-06-01 11:39 . init project...
# -*- coding:utf-8 -*-
import json
import re
import time
import traceback
from urllib.parse import urljoin

import requests
from lxml import etree
# Default request headers installed on every session created by get_session().
# The values imitate a desktop Chrome 80 browser navigating to a page.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}
def get_session():
    """Return a new ``requests.Session`` preloaded with ``HEADERS``.

    Returns:
        A ``requests.Session`` whose default headers have been updated with
        every entry of the module-level ``HEADERS`` dict.
    """
    browser_session = requests.Session()
    # Merge the browser-like headers into the session defaults so that every
    # request sent through this session carries them.
    browser_session.headers.update(HEADERS)
    return browser_session
def get_response(session, url, timeout=60):
    """Fetch ``url`` through ``session``, retrying once if the first try fails.

    Args:
        session: Object exposing ``get(url, timeout=...)``; in this file a
            ``requests.Session`` created by ``get_session``.
        url: Address to request.
        timeout: Timeout in seconds handed to ``session.get`` for each
            attempt. Defaults to 60.

    Returns:
        Whatever ``session.get`` returns for the first successful attempt.

    Raises:
        Exception: Whatever the second attempt raises, if it fails as well.
        KeyboardInterrupt, SystemExit: Propagated immediately without a retry.
    """
    try:
        return session.get(url, timeout=timeout)
    except Exception:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # Ctrl-C / interpreter shutdown abort the crawl instead of silently
        # triggering another request.
        print(traceback.format_exc())
        # Single retry; a second failure is left to the caller to handle.
        return session.get(url, timeout=timeout)
def get_post_list(session, basic_url):
    """Walk the search-result pages from ``basic_url`` and save every post.

    For each result page, every table row's post link and title are
    extracted, the post body is downloaded with ``get_post_info`` and one
    record ``{'post_href', 'post_title', 'content_list'}`` is appended to the
    output file via ``save_data``. The "next" pagination link is then
    followed. The crawl stops when a page has no result rows, when there is
    no next link, or when any step raises an exception.

    Args:
        session: HTTP session forwarded to ``get_response`` and
            ``get_post_info``.
        basic_url: URL of the first search-result page.

    Returns:
        None.
    """
    base_url = 'https://www.discuss.com.hk/'
    row_xpath = '//div[@class="search-result__results"]/table/tbody/tr'
    link_xpath = ('./td[@class="search-result-subject-box"]'
                  '/span[@class="search-result-subject"]/a')

    url = basic_url
    while True:
        try:
            print('帖子列表链接', url)
            # Pause before every list-page request (presumably to rate-limit
            # the crawl).
            time.sleep(5)
            res = get_response(session, url)
            html_res = etree.HTML(res.text)
            # get_post_info guards against etree.HTML returning None; treat
            # an unparsable list page the same way as a page without rows.
            tr_list = html_res.xpath(row_xpath) if html_res is not None else []
            if not tr_list:
                print('获取帖子列表异常')
                break

            for one_tr in tr_list:
                hrefs = one_tr.xpath(link_xpath + '/@href')
                if not hrefs:
                    # A row without a link has nothing to fetch; requesting
                    # an empty href would only raise and abort the crawl.
                    continue
                # urljoin leaves absolute hrefs untouched and resolves
                # relative ones against the site root.
                post_href = urljoin(base_url, hrefs[0])
                # Always a string: a missing title becomes '' instead of
                # leaking an empty list into the saved record.
                post_title = ''.join(one_tr.xpath(link_xpath + '//text()'))
                print('帖子链接为:', post_href)
                content_list = get_post_info(session, post_href)
                one_post = {
                    'post_href': post_href,
                    'post_title': post_title,
                    'content_list': content_list,
                }
                if content_list:
                    print('帖子内容为:', content_list[0])
                save_data(one_post)

            next_url = html_res.xpath(
                '//div[@class="pagination-buttons"]/a[@class="next"]/@href')
            if not next_url:
                print('获取下一页连接失败')
                break
            # Plain concatenation would produce '//' for root-relative hrefs
            # and a broken URL for absolute ones.
            url = urljoin(base_url, next_url[0])
        except Exception:
            # Every record is written as soon as it is built, so nothing is
            # pending here; saving again would only add an empty or duplicate
            # line to the output file.
            print(traceback.format_exc())
            break
def get_post_info(session, post_url):
    """Download one post page and return the text of its messages.

    Sleeps 10 seconds, fetches ``post_url`` through ``get_response`` and
    parses the response text with ``etree.HTML``.

    Args:
        session: HTTP session forwarded to ``get_response``.
        post_url: URL of the post page.

    Returns:
        A list of strings, one per ``postmessage-content t_msgfont`` div that
        has text directly inside a child ``span``; newline, carriage-return
        and tab characters are removed. An empty list is returned when the
        response could not be parsed (``etree.HTML`` returned ``None``).
    """
    time.sleep(10)
    post_res = get_response(session, post_url)
    tree = etree.HTML(post_res.text)
    if tree is None:
        # Nothing parsable came back: dump the raw body for diagnosis.
        print(post_res.text)
        print('链接请求错误')
        return []

    messages = []
    for message_div in tree.xpath('//div[@class="postmessage-content t_msgfont"]'):
        fragments = message_div.xpath('./span/text()')
        if not fragments:
            continue
        text = ''.join(fragments)
        for control_char in ('\n', '\r', '\t'):
            text = text.replace(control_char, '')
        messages.append(text)
    return messages
def save_data(data, path='14.3.政治.tsv'):
    """Append ``data`` to ``path`` as one JSON document followed by a newline.

    Args:
        data: Any JSON-serialisable value (the crawler passes a dict per
            post).
        path: File to append to; it is created if it does not exist.
            Defaults to the file name the crawler has always written to.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable. Serialisation
            happens before the file is opened, so the file is left untouched
            in that case.
        OSError: If the file cannot be opened or written.
    """
    # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
    line = json.dumps(data, ensure_ascii=False) + '\n'
    # Plain append mode is enough (nothing is read back); a single write
    # keeps each record and its newline together.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)
if __name__ == '__main__':
    # Script entry point: crawl the forum search results starting at page 1,
    # ordered by date. ``srchtxt`` is the percent-encoded UTF-8 search term.
    url = 'https://www.discuss.com.hk/search.php?searchsubmit=true&srchtxt=%E5%8F%8D%E9%80%81%E4%B8%AD&orderby=dateline&page=1'
    session = get_session()
    get_post_list(session, url)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/zzserver/discuss.git
git@gitee.com:zzserver/discuss.git
zzserver
discuss
discuss
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385