# 代码拉取完成,页面将自动刷新
# -*- coding:utf-8 -*-
import re
import json
import time
import traceback
import requests
from lxml import etree
# Request headers installed on every session created by get_session().
# The values imitate a desktop Chrome 80 browser on Windows.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.100 Safari/537.36'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
}
def get_session():
    """Create a ``requests.Session`` whose default headers include HEADERS.

    Returns:
        The new session, with every entry of the module-level ``HEADERS``
        dict merged into its default headers.
    """
    browser_session = requests.Session()
    browser_session.headers.update(HEADERS)
    return browser_session
def get_response(session, url):
    """Fetch ``url`` through ``session``, retrying once if the first try fails.

    Args:
        session: Object exposing ``get(url, timeout=...)``; the script passes
            the ``requests.Session`` built by ``get_session``.
        url: Address to request.

    Returns:
        Whatever ``session.get`` returns for the attempt that succeeded.

    Raises:
        Exception: Anything raised by the second attempt propagates to the
            caller; only the first failure is absorbed.
    """
    try:
        return session.get(url, timeout=60)
    except Exception:
        # Catch ordinary errors only. A bare ``except`` would also intercept
        # KeyboardInterrupt/SystemExit and answer a Ctrl-C by firing another
        # 60-second request instead of stopping.
        print(traceback.format_exc())
        return session.get(url, timeout=60)
def get_post_list(session, basic_url):
    """Walk the paginated search results and save every post that is found.

    Each result page is requested after a 5-second pause. For every table row
    that carries a post link, the post page is fetched with ``get_post_info``
    and a record ``{'post_href', 'post_title', 'content_list'}`` is written
    with ``save_data``. The loop follows the "next" pagination link and stops
    when a page has no result rows, when there is no next link, or when an
    error occurs.

    Args:
        session: Session object handed to ``get_response`` / ``get_post_info``.
        basic_url: URL of the first search-result page.

    Returns:
        None. Results are persisted through ``save_data``.
    """
    # Imported here so the block does not rely on a module-level import.
    from urllib.parse import urljoin

    url = basic_url
    while True:
        # Record that has been built but not yet written; None once saved.
        pending_post = None
        try:
            print('帖子列表链接', url)
            time.sleep(5)
            res = get_response(session, url)
            html_res = etree.HTML(res.text)
            # get_post_info treats a None tree as a failed request; apply the
            # same guard here instead of failing on ``None.xpath``.
            if html_res is None:
                print('获取帖子列表异常')
                break

            tr_list = html_res.xpath('//div[@class="search-result__results"]/table/tbody/tr')
            if not tr_list:
                print('获取帖子列表异常')
                break

            for one_tr in tr_list:
                post_href = one_tr.xpath('./td[@class="search-result-subject-box"]/span[@class="search-result-subject"]/a/@href')
                if not post_href:
                    # No link in this row: there is nothing to fetch, and
                    # passing the empty xpath result on would abort the crawl.
                    continue
                post_href = post_href[0]
                # Joining unconditionally keeps the title a string even when
                # the link has no text.
                post_title = ''.join(
                    one_tr.xpath('./td[@class="search-result-subject-box"]/span[@class="search-result-subject"]/a//text()')
                )
                print('帖子链接为:', post_href)
                content_list = get_post_info(session, post_href)
                pending_post = {'post_href': post_href, 'post_title': post_title, 'content_list': content_list}
                if content_list:
                    print('帖子内容为:', content_list[0])
                save_data(pending_post)
                pending_post = None

            next_url = html_res.xpath('//div[@class="pagination-buttons"]/a[@class="next"]/@href')
            if not next_url:
                print('获取下一页连接失败')
                break
            # urljoin gives the same result as plain concatenation for a
            # relative href and also copes with root-relative or absolute ones.
            url = urljoin('https://www.discuss.com.hk/', next_url[0])
        except Exception:
            print(traceback.format_exc())
            # Best effort: persist a record only if one was built and not yet
            # written, so no empty placeholder or duplicate line is appended.
            if pending_post is not None:
                save_data(pending_post)
            break
def get_post_info(session, post_url):
    """Download one post page and collect the text of its message blocks.

    The request is made through ``get_response`` after a 10-second pause.

    Args:
        session: Session object forwarded to ``get_response``.
        post_url: URL of the post page.

    Returns:
        A list with one string per message block that has span text, with
        newline, carriage-return and tab characters removed. An empty list is
        returned when the page could not be parsed into a tree.
    """
    time.sleep(10)
    page = get_response(session, post_url)
    tree = etree.HTML(page.text)

    # Guard clause: nothing parseable came back, so dump the body for debugging.
    if tree is None:
        print(page.text)
        print('链接请求错误')
        return []

    messages = []
    for message_div in tree.xpath('//div[@class="postmessage-content t_msgfont"]'):
        fragments = message_div.xpath('./span/text()')
        if fragments:
            joined = ''.join(fragments)
            messages.append(joined.replace('\n', '').replace('\r', '').replace('\t', ''))
    return messages
def save_data(data):
    """Append ``data`` to the output file as a single JSON line.

    Args:
        data: Any JSON-serialisable value. Non-ASCII characters are written
            as-is (``ensure_ascii=False``) in UTF-8.
    """
    with open('14.3.政治.tsv', 'a+', encoding='utf-8') as out_file:
        # One record per line: the serialised value followed by a newline.
        out_file.write(json.dumps(data, ensure_ascii=False) + '\n')
if __name__ == '__main__':
    # Script entry point: start crawling at page 1 of the search results,
    # ordered by dateline. ``srchtxt`` holds the percent-encoded search keyword.
    start_url = 'https://www.discuss.com.hk/search.php?searchsubmit=true&srchtxt=%E5%8F%8D%E9%80%81%E4%B8%AD&orderby=dateline&page=1'
    get_post_list(get_session(), start_url)
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。