1 Star 3 Fork 0

萧石/public_data

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
中华典藏_下载四大名著.py 3.54 KB
一键复制 编辑 原始数据 按行查看 历史
萧石 提交于 2024-11-20 15:20 . 中华典藏
# -*- coding: utf-8 -*-
"""
# @FileName :下载四大名著.py
# @Time :2024/7/8
# @Author :CL
# @email :1037654919@qq.com
"""
import re
import time
import requests
from bs4 import BeautifulSoup
from retrying import retry
# Local HTTP/HTTPS proxy (e.g. a Clash/V2Ray client listening on port 15732).
# NOTE(review): requests will fail if no proxy is running here — confirm the
# port matches your local setup before running.
proxies = {
'http': 'http://127.0.0.1:15732',
'https': 'http://127.0.0.1:15732'
}
# Browser-like request headers copied from a real Chrome session so the site
# serves the normal HTML page instead of blocking the scraper.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Pragma": "no-cache",
"Referer": "https://www.zhonghuadiancang.com/wenxueyishu/shuihuzhuan/",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Linux\""
}
# Session cookies captured from the same browser session; the Hm_* values are
# Baidu-analytics cookies. These are stale timestamps and may need refreshing
# if the site starts rejecting requests.
cookies = {
"Hm_lvt_19055beeefdc53f8c9d46de44d9a851e": "1720403251",
"HMACCOUNT": "E2603C916CED7269",
"__vtins__JilA8kkNgJyF30yn": "%7B%22sid%22%3A%20%22026d12f7-3a1b-5a78-8955-063e9588f754%22%2C%20%22vd%22%3A%201%2C%20%22stt%22%3A%200%2C%20%22dr%22%3A%200%2C%20%22expires%22%3A%201720405340616%2C%20%22ct%22%3A%201720403540616%7D",
"__51uvsct__JilA8kkNgJyF30yn": "1",
"__51vcke__JilA8kkNgJyF30yn": "b15d3ae8-0e24-54ec-bbb0-5d18f855b0c4",
"__51vuft__JilA8kkNgJyF30yn": "1720403540623",
"Hm_lpvt_19055beeefdc53f8c9d46de44d9a851e": "1720403854"
}
@retry(stop_max_attempt_number=5, wait_fixed=5000)
def get_txt(url="https://www.zhonghuadiancang.com/wenxueyishu/xiyouji/55058.html"):
    """Fetch one chapter page and return its title plus paragraph text.

    The page title (<h1>) comes first, followed by every <p> inside the
    div#content element, one paragraph per line.

    :param url: chapter page on zhonghuadiancang.com
    :return: "title\\n" + newline-joined, stripped paragraph texts
    :raises requests.HTTPError: on a non-2xx response (retried by @retry)
    :raises ValueError: if the expected h1/div#content elements are missing
    """
    # timeout prevents a stalled connection from hanging forever — without it
    # @retry never gets a chance to re-attempt the request.
    response = requests.get(url, headers=headers, cookies=cookies,
                            proxies=proxies, timeout=30)
    print(response.url, response.status_code)
    # Turn HTTP errors (403/404/5xx) into exceptions so @retry re-attempts
    # instead of parsing an error page.
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")
    title_tag = soup.find("h1")
    content_div = soup.find("div", id="content")
    if title_tag is None or content_div is None:
        # Layout changed or a block page was served; raise a retryable error
        # rather than crashing with AttributeError on None.
        raise ValueError(f"unexpected page structure at {url}")
    paragraphs = content_div.find_all("p")
    return title_tag.text + "\n" + "\n".join(p.text.strip() for p in paragraphs)
def main():
    """Download whole books chapter-by-chapter into <book-name>.txt files.

    Each seed URL points at a book's FIRST chapter page; the dict value is
    the number of chapters. Chapter pages are numbered consecutively, so the
    pages fetched are number, number+1, ..., number+num-1.
    """
    mingzhu_urls = {
        # 'https://www.zhonghuadiancang.com/wenxueyishu/xiyouji/55058.html': 100,
        # 'https://www.zhonghuadiancang.com/wenxueyishu/hongloumeng/216610.html': 120,
        # 'https://www.zhonghuadiancang.com/wenxueyishu/sanguoyanyi/285691.html': 120,
        'https://www.zhonghuadiancang.com/wenxueyishu/shuihuzhuan/55158.html': 120
    }
    for url, num in mingzhu_urls.items():
        # Base URL without the trailing "<id>.html" segment.
        base_url = url.rsplit('/', 1)[0]
        # Extract the numeric id of the first chapter from the seed URL.
        match = re.search(r'/(\d+)\.html$', url)
        if match is None:
            # Malformed seed URL — skip rather than crash on match.group().
            print(f"skipping malformed url: {url}")
            continue
        number = int(match.group(1))
        # Book slug (e.g. "shuihuzhuan") becomes the output filename; append
        # mode lets an interrupted run be resumed.
        book = url.split('https://www.zhonghuadiancang.com/wenxueyishu/')[1].split('/')[0]
        with open(f"{book}.txt", "a", encoding="utf-8") as f:
            # BUGFIX: the original range(num + number, num + number + 1)
            # fetched exactly one page — the one AFTER the last chapter.
            # Fetch all `num` chapters starting at the seed chapter id.
            for page in range(number, number + num):
                constructed_url = f"{base_url}/{page}.html"
                content = get_txt(constructed_url)
                if content:
                    f.write(content + "\n\n\n")
                else:
                    # Empty result: back off once, then retry a single time.
                    time.sleep(30)
                    content = get_txt(constructed_url)
                    f.write(content + "\n\n\n")
                # Polite per-page delay to avoid hammering the server.
                time.sleep(5)
if __name__ == '__main__':
    # Removed leftover debug code: a bare print() and an unconditional
    # get_txt() call that downloaded an unrelated page (the default 西游记
    # chapter) and dumped it to stdout on every run.
    main()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/beihai_xiaoshi/public_data.git
git@gitee.com:beihai_xiaoshi/public_data.git
beihai_xiaoshi
public_data
public_data
master

搜索帮助