1 Star 0 Fork 11

WSS/北网-2分院人工智能-1804C-资管

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
x23us_spider.py 3.79 KB
一键复制 编辑 原始数据 按行查看 历史
zhangpengju 提交于 2018-10-12 08:32 . '小说下载框架及x23us实例'
from spider import Spider
'''
url = 'https://www.x23us.com/html/66/66656/'
book_author_regex = '<meta name="og:novel:author" content="(.*?)"/> '
book_name_regex = '<meta name="og:novel:book_name" content="(.*?)"/>'
book_chapter_regex = '<td class="L"><a href="(.*?).html">(.*?)</a></td>'
x = Spider(url).get_info(book_author = book_author_regex,
book_name = book_name_regex,
book_chapter=book_chapter_regex,
)
'''
class BookInfoApi(Spider):
    """Extract book-level metadata from an x23us book index page.

    All fetching and pattern matching is delegated to the ``Spider`` base
    class via ``get_info``; this class only supplies the patterns.
    """

    def book_info(self):
        """Return the author, title and chapter list found on the page.

        The three patterns are passed to ``get_info`` as keyword arguments
        named ``book_author``, ``book_name`` and ``book_chapter``.  The
        chapter pattern has two groups: the link target without its
        ``.html`` suffix, and the link text.

        Returns:
            Whatever ``get_info`` returns for those patterns (presumably a
            dict of match lists keyed by the keyword names -- confirm
            against ``Spider.get_info``).
        """
        # NOTE: the trailing space in the author pattern is kept exactly as
        # in the original; it is part of what must match on the page.
        self.book_author_regex = '<meta name="og:novel:author" content="(.*?)"/> '
        self.book_name_regex = '<meta name="og:novel:book_name" content="(.*?)"/>'
        self.book_chapter_regex = '<td class="L"><a href="(.*?).html">(.*?)</a></td>'
        # Store the result under a private name.  Assigning it to
        # ``self.book_info`` would shadow this method on the instance and
        # make a second ``book_info()`` call fail with
        # "TypeError: 'dict' object is not callable".
        self._book_info = self.get_info(
            book_author=self.book_author_regex,
            book_name=self.book_name_regex,
            book_chapter=self.book_chapter_regex,
        )
        return self._book_info
class ChapterInfo(Spider):
    """Extract the title, body and next-page link from a chapter page.

    All fetching and pattern matching is delegated to the ``Spider`` base
    class via ``get_info``; this class only supplies the patterns.
    """

    def content_info(self):
        """Return the chapter heading, chapter body and next-page link.

        The patterns are passed to ``get_info`` as keyword arguments named
        ``chapter_name``, ``chapter_content`` and ``chapter_next``.

        Returns:
            Whatever ``get_info`` returns for those patterns (presumably a
            dict of match lists keyed by the keyword names -- confirm
            against ``Spider.get_info``).
        """
        self.chapter_name_regex = '<h1>(.*?)</h1>'
        # NOTE(review): the body usually spans several lines, so this relies
        # on ``get_info`` matching "." across newlines -- confirm.
        self.chapter_content_regex = '<dd id="contents">(.*?)</dd>'
        # Capture the href of the anchor labelled "下一页" (next page).  The
        # previous value was a literal copy of one chapter's navigation HTML
        # for a single book, with no capture group, so it could never match
        # on any other page.  ``[^"]*`` keeps the match inside one href
        # attribute even if "." is allowed to match newlines.
        self.chapter_next_regex = '<a href="([^"]*)">下一页</a>'
        self.chapter_info = self.get_info(
            chapter_name=self.chapter_name_regex,
            chapter_content=self.chapter_content_regex,
            chapter_next=self.chapter_next_regex,
        )
        return self.chapter_info
def save(book_url='https://www.x23us.com/html/4/4779/', max_chapters=10):
    """Download the leading chapters of a book into a single text file.

    The file is created in the current working directory and named after
    the first matched book title and author.  Each chapter is written as
    its heading, a blank line, its body, and a row of 20 asterisks.

    Args:
        book_url: URL of the book index page.  Chapter URLs are built by
            plain concatenation, so it must end with ``/``.
        max_chapters: Number of chapters to fetch, counted from the start
            of the chapter list.  Defaults to 10 (the previously
            hard-coded limit); pass ``None`` to fetch every chapter.

    Raises:
        IndexError: If the title, the author, or a chapter's heading or
            body was not matched on the page (the match list is empty).
    """
    book_info = BookInfoApi(book_url).book_info()
    file_name = '书名:{}-作者:{}.txt'.format(
        book_info['book_name'][0],
        book_info['book_author'][0],
    )
    # Write UTF-8 explicitly: the platform default encoding (e.g. a legacy
    # code page on Windows) can fail on, or mis-encode, the Chinese text.
    with open(file_name, 'w', encoding='utf-8') as f:
        # Each entry is (chapter id, chapter title); only the id is needed
        # to build the chapter URL.
        for chapter_id, _chapter_title in book_info['book_chapter'][:max_chapters]:
            chapter_url = book_url + chapter_id + '.html'
            chapter_info = ChapterInfo(chapter_url).content_info()
            chapter_name = chapter_info['chapter_name'][0]
            print(chapter_name)  # progress output
            # Turn the HTML spacing/line-break markup into plain text.
            chapter_text = (
                chapter_info['chapter_content'][0]
                .replace('&nbsp;', ' ')
                .replace('<br />', '\n')
            )
            f.write(chapter_name)
            f.write('\n\n')
            f.write(chapter_text)
            f.write('\n\n')
            f.write('*' * 20)
            f.write('\n\n')
if __name__ == '__main__':
    # Sample of the data produced during a run, kept for reference.
    #
    # book_info:
    # {'book_author': ['塞林格'],
    # 'book_name': ['麦田里的守望者'],
    # 'book_chapter': [('1524372', '·内容提要·'), ('1524375', '·作品赏析·'), ('1524378', '第01节'), ('1524381', '第02节'), ('1524384', '第03节'), ('1524387', '第04节'), ('1524390', '第05节'), ('1524394', '第06节'), ('1524397', '第07节'), ('1524400', '第08节'), ('1524403', '第09节'), ('1524406', '第10节'), ('1524410', '第11节'), ('1524413', '第12节'), ('1524416', '第13节'), ('1524419', '第14节'), ('1524422', '第15节'), ('1524425', '第16节'), ('1524428', '第17节'), ('1524431', '第18节'), ('1524434', '第19节'), ('1524437', '第20节'), ('1524441', '第21节'), ('1524444', '第22节'), ('1524447', '第23节'), ('1524450', '第24节'), ('1524453', '第25节'), ('1524456', '第26节')]}
    # chapter_info:
    # {'chapter_name': ['正文 ·内容提要·'],
    # 'chapter_content': ['&n...],
    # 'chapter_next': []
    # }

    # Script entry point: download the default book.
    save()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/WANGSUHAIS/BeiWang-2FenYuanRenGongZhiNen-1804C-u.git
git@gitee.com:WANGSUHAIS/BeiWang-2FenYuanRenGongZhiNen-1804C-u.git
WANGSUHAIS
BeiWang-2FenYuanRenGongZhiNen-1804C-u
北网-2分院人工智能-1804C-资管
master

搜索帮助