# 代码拉取完成,页面将自动刷新  (Gitee page boilerplate accidentally captured; commented out so the file parses)
import requests
# Route all HTTP/HTTPS traffic through a local proxy (e.g. for captures or VPN).
_PROXY_ADDRESS = 'http://127.0.0.1:15732'
proxies = {scheme: _PROXY_ADDRESS for scheme in ('http', 'https')}
def get_text(url="https://www.dbxsd.com/book/p7836/3031015.html", timeout=30):
    """Fetch one chapter page and return its paragraph text.

    Args:
        url: Page URL to fetch (defaults to the first chapter page).
        timeout: Seconds before the HTTP request is aborted. New, backward-
            compatible parameter — previously the request could hang forever.

    Returns:
        str: The page's paragraph text, one stripped paragraph per line
        (each followed by a newline); empty string if no paragraphs found.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    # Browser-like headers so the site serves the normal reader HTML.
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "priority": "u=0, i",
        "referer": "https://www.dbxsd.com/book/p7836/",
        "sec-ch-ua": "\"Chromium\";v=\"124\", \"Google Chrome\";v=\"124\", \"Not-A.Brand\";v=\"99\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Linux\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    cookies = {
        "playcount": "15"
    }
    response = requests.get(url, headers=headers, cookies=cookies,
                            proxies=proxies, timeout=timeout)
    # Fail fast on HTTP errors instead of crashing later while parsing an
    # error page (soup.find(...) would return None and raise AttributeError).
    response.raise_for_status()
    response.encoding = "utf-8"  # site serves UTF-8; don't rely on detection
    print(response.url, response)
    # Imported lazily, as in the original script; only this function needs bs4.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find('div', id="cont-body").find_all('p')
    # Single join instead of repeated string concatenation in a loop.
    text = ''.join(p.text.strip() + '\n' for p in paragraphs)
    print(text)
    return text
def main():
    """Download every page of each chapter and append the text to a local file.

    ``url_dict`` maps each chapter's first-page URL to its page count; pages
    after the first use the site's ``<id>_<page>.html`` URL pattern.
    """
    # Chapter first-page URL -> number of pages in that chapter.
    # Bug fix: the first key used host "dbxsc.com" — a typo, inconsistent with
    # every other URL, the get_text() default, and the referer header.
    url_dict = {
        'https://www.dbxsd.com/book/p7836/3031015.html': 16,
        'https://www.dbxsd.com/book/p7836/3031016.html': 27,
        'https://www.dbxsd.com/book/p7836/3031017.html': 15,
        'https://www.dbxsd.com/book/p7836/3031018.html': 26,
    }
    for url, page_count in url_dict.items():
        for page in range(1, page_count + 1):
            # Page 1 keeps the base URL; later pages insert "_<page>" before ".html".
            page_url = url if page == 1 else url.replace('.html', f'_{page}.html')
            text = get_text(page_url)
            # Open in append mode so chapters accumulate across iterations;
            # three newlines separate consecutive pages.
            with open('白银时代1.txt', 'a', encoding='utf-8') as f:
                f.write(text + '\n\n\n')
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Gitee content-moderation boilerplate accidentally captured with the file; commented
# out so it does not break the Python syntax:
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。