1 Star 0 Fork 0

codesman/discuss

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
discuss_spider_1.py 5.56 KB
一键复制 编辑 原始数据 按行查看 历史
zzz 提交于 2020-06-01 11:39 . init project...
# -*- coding:utf-8 -*-
import re
import json
import time
import random
import traceback
import requests
from lxml import etree
# Browser-like request headers installed on every session created by
# get_session(); they mimic a desktop Chrome 80 navigation request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.100 Safari/537.36"
    ),
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
}
def get_session():
    """Create a ``requests.Session`` preloaded with the module's HEADERS.

    Returns:
        The new session; every request made through it sends HEADERS.
    """
    http = requests.Session()
    http.headers.update(HEADERS)
    return http
def get_response(session, url):
    """GET ``url`` through ``session``, retrying once if the first try fails.

    Args:
        session: Object exposing ``get(url, timeout=...)`` (the script passes
            the ``requests.Session`` built by ``get_session``).
        url: Address to fetch.

    Returns:
        Whatever ``session.get`` returns.

    Raises:
        Exception: Whatever the second attempt raises; it is not caught here.
    """
    try:
        response = session.get(url, timeout=60)
    except Exception:
        # Only ordinary errors are retried.  A bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit and answer Ctrl-C with another
        # request that can block for up to 60 seconds.
        print(traceback.format_exc())
        response = session.get(url, timeout=60)
    return response
def get_post_list(session, basic_url):
    """Crawl a forum board page by page and save every thread that has content.

    Starting at ``basic_url``, each listing page is fetched, every
    ``normalthread_*`` row is resolved to a thread URL, the thread is read
    with ``get_post_info`` and, when any content came back, the record is
    appended through ``save_data``.  Crawling stops when a page has no thread
    rows, when no "next" link is present, or when an error occurs.

    Args:
        session: Passed through to ``get_response`` / ``get_post_info``.
        basic_url: URL of the first listing page.
    """
    # Function-scope import so this block does not depend on an edit to the
    # file's import section.
    from urllib.parse import urljoin

    url = basic_url
    while True:
        try:
            print('帖子列表链接', url)
            time.sleep(3)  # pause between listing-page requests
            res = get_response(session, url)
            html_res = etree.HTML(res.text)
            # etree.HTML can yield None (get_post_info guards against the same
            # case); treat that like a page without thread rows.
            if html_res is None:
                tbody_list = []
            else:
                tbody_list = html_res.xpath('//tbody[contains(@id,"normalthread_")]')
            if not tbody_list:
                print('获取帖子列表异常')
                print(res.text)
                break

            for num, one_tbody in enumerate(tbody_list):
                hrefs = one_tbody.xpath('./tr/th/span[@class="tsubject"]/a/@href')
                if not hrefs:
                    continue
                # Resolve against the page being crawled instead of a fixed
                # www.discuss.com.hk prefix: boards also live on other hosts
                # (e.g. news.discuss.com.hk) and hrefs may be absolute.
                post_href = urljoin(url, hrefs[0])
                # Always a string; a missing title becomes '' rather than [].
                post_title = ''.join(
                    one_tbody.xpath('./tr/th/span[@class="tsubject"]/a/text()')
                )
                print('帖子链接为:', post_href)
                content_list = get_post_info(session, post_href)
                if content_list:
                    print(num)
                    save_data({
                        'post_href': post_href,
                        'post_title': post_title,
                        'content_list': content_list,
                    })

            next_url = html_res.xpath('//div[@class="pagination-buttons"]/a[@class="next"]/@href')
            if not next_url:
                print('获取下一页连接失败')
                break
            url = urljoin(url, next_url[0])
        except Exception:
            # Log and stop this board.  Nothing is saved here: every complete
            # record was already written inside the loop, so saving again
            # would only add an empty or duplicate line to the output.
            print(traceback.format_exc())
            break
def get_post_info(session, post_url):
    """Fetch one thread page and return the cleaned text of its posts.

    Waits 8 seconds, downloads ``post_url`` via ``get_response`` and collects
    the ``span`` text of every ``postmessage-content t_msgfont`` div.  Newline,
    carriage-return and tab characters and the literal ``[]`` are removed and
    the result is stripped.  Divs without span text are skipped.

    Args:
        session: Passed through to ``get_response``.
        post_url: URL of the thread page.

    Returns:
        List of cleaned strings, one per div that had text; an empty list
        when the page could not be parsed.
    """
    time.sleep(8)
    page = get_response(session, post_url)
    tree = etree.HTML(page.text)
    if tree is None:
        print(page.text)
        print('链接请求错误')
        return []

    drop_whitespace = str.maketrans('', '', '\n\r\t')
    texts = []
    for body in tree.xpath('//div[@class="postmessage-content t_msgfont"]'):
        pieces = body.xpath('./span/text()')
        if not pieces:
            continue
        joined = ''.join(pieces)
        texts.append(joined.translate(drop_whitespace).replace('[]', '').strip())
    return texts
def save_data(data, path='14.4.政治.tsv'):
    """Append ``data`` to ``path`` as a single line of JSON.

    Args:
        data: JSON-serialisable object to store.
        path: Output file.  Defaults to the file name that used to be
            hard-coded, so existing one-argument calls behave as before.
    """
    # Non-ASCII text is written as-is (ensure_ascii=False) in UTF-8.
    line = json.dumps(data, ensure_ascii=False)
    # Plain append mode: the file is never read back here.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
if __name__ == '__main__':
    # Script entry point: crawl each board in url_list in turn, printing a
    # timestamp after every board and once more when everything is done.
    session = get_session()

    # Boards used in earlier runs, kept for reference:
    #   https://www.discuss.com.hk/forumdisplay.php?fid=204&page=180   (feng shui)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=212            (chess and card games)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=417&page=3     (climbing)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=1162           (prehistoric civilisations)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=801            (Hong Kong anecdotes and nostalgia)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=724            (religion reference room)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=721            (Catholic faith exchange)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=723            (other faiths)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=312            (Chinese history)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=722            (Christian faith exchange)
    #   https://www.discuss.com.hk/forumdisplay.php?fid=720            (Buddhist faith exchange)
    #   https://digital.discuss.com.hk/forumdisplay.php?fid=1149
    #   https://digital.discuss.com.hk/forumdisplay.php?fid=707
    #   https://www.discuss.com.hk/forumdisplay.php?fid=233&page=429
    #   https://digital.discuss.com.hk/forumdisplay.php?fid=700&page=131
    #   https://www.discuss.com.hk/forumdisplay.php?fid=994
    url_list = [
        'https://www.discuss.com.hk/forumdisplay.php?fid=62&filter=0&orderby=dateline&ascdesc=DESC&page=1',
        'https://news.discuss.com.hk/forumdisplay.php?fid=106&orderby=dateline&ascdesc=DESC&filter=0',
    ]

    for board_url in url_list:
        get_post_list(session, board_url)
        # strftime without a time tuple formats the current local time.
        print('当前连接结束时间为', time.strftime("%Y-%m-%d %H:%M:%S"), board_url)

    print(time.strftime("%Y-%m-%d %H:%M:%S"))
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/zzserver/discuss.git
git@gitee.com:zzserver/discuss.git
zzserver
discuss
discuss
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385