# NOTE(review): the three lines that were here were Gitee web-UI dialog text
# ("code pull complete, the page will refresh" / forced-sync warning) that was
# accidentally saved into the source file. They are not Python and broke the
# module at import time, so they have been reduced to this comment.
from spider import Spider
import threading,_thread
from collections import deque
import glob
'''
url = 'https://www.x23us.com/html/66/66656/'
book_author_regex = '<meta name="og:novel:author" content="(.*?)"/> '
book_name_regex = '<meta name="og:novel:book_name" content="(.*?)"/>'
book_chapter_regex = '<td class="L"><a href="(.*?).html">(.*?)</a></td>'
x = Spider(url).get_info(book_author = book_author_regex,
book_name = book_name_regex,
book_chapter=book_chapter_regex,
)
'''
class Duoxian(Spider):
    """Novel downloader built on Spider.

    Scrapes a chapter-index page, then downloads each chapter in turn and
    appends it to a text file named after the book and author.  Progress is
    persisted to 'save_index.txt' (number of chapters fully saved) so an
    interrupted run can resume where it left off.
    """

    def __init__(self,
                 url='https://www.x23us.com/html/66/66656/',
                 charset='gbk',
                 ):
        # BUG FIX: the original called super().__init__() with no arguments,
        # so the url/charset parameters were silently dropped; the commented
        # line `Spiedr.__init__(self,url,charset)` showed the intent.
        # NOTE(review): assumes Spider.__init__ accepts (url, charset) --
        # confirm against spider.py.
        super().__init__(url, charset)
        # Absolute index of the first chapter in the queue (non-zero after a
        # resume); book_info() updates it when save_index.txt exists.
        self._start_index = 0
        self.book_info_dict = self.book_info()
        self.book_name = self.book_info_dict['book_name'][0]
        self.book_author = self.book_info_dict['book_author'][0]
        # BUG FIX: the original stored this deque in self.book_info, which
        # shadowed the book_info() method as soon as __init__ returned.
        self.chapter_queue = deque(enumerate(self.book_info_dict['book_info']))

    def book_info(self):
        """Scrape book metadata and the (possibly resumed) chapter list.

        Returns the dict produced by Spider.get_info with keys 'book_info'
        (list of (href, title) regex captures), 'book_name' and
        'book_author'.  If 'save_index.txt' exists, its integer content is
        taken as the number of chapters already saved and that prefix of
        'book_info' is skipped.
        """
        book_info_regex = '<td class="L"><a href="(.*?)">(.*?)</a></td>'
        book_name_regex = '<meta property="og:title" content="(.*?)"/>'
        book_author_regex = '<meta name="og:novel:author" content="(.*?)"/>'
        # The original duplicated this identical get_info call in both
        # branches of the resume check; fetch once, then slice if resuming.
        info_dict = self.get_info(book_info=book_info_regex,
                                  book_name=book_name_regex,
                                  book_author=book_author_regex,
                                  )
        if glob.glob('save_index.txt'):
            with open('save_index.txt', 'r') as ff:
                # Tolerate an empty file (e.g. a crash mid-write).
                site = int(ff.read() or 0)
            print(site)
            self._start_index = site
            info_dict['book_info'] = info_dict['book_info'][site:]
        return info_dict

    def chapter_info(self, chapter_url):
        """Fetch one chapter page; returns {'chapter_content': [raw html]}."""
        chapter_content_regex = '<dd id="contents">(.*?)</dd>'
        return Spider(chapter_url).get_info(
            chapter_content=chapter_content_regex)

    def book_save(self):
        """Drain the chapter queue, appending each chapter to the book file.

        deque.popleft() is atomic, so when several threads run this method
        (via my_thread) no chapter is processed twice; the IndexError raised
        on an empty queue is the termination signal.  (The original looped
        `while True` and crashed with IndexError once the queue emptied.)
        NOTE(review): concurrent appends to the output file may still
        interleave chapters out of order -- same as the original.

        BUG FIX: progress is written to 'save_index.txt' only AFTER a
        chapter is safely on disk, so a crash re-downloads the in-flight
        chapter instead of duplicating or skipping one.  The original
        updated the index before downloading, which duplicated the current
        chapter on a fresh-run crash and skipped one on a resumed-run crash.
        """
        out_name = '书名:{}--作者:{}.txt'.format(self.book_name,
                                             self.book_author)
        while True:
            try:
                index, (url_id, chapter_name) = self.chapter_queue.popleft()
            except IndexError:
                break  # queue drained -- all chapters handled
            print(index)
            print(chapter_name)
            chapter_url = self.url + url_id
            content = self.chapter_info(chapter_url)['chapter_content'][0]
            with open(out_name, 'a') as f:
                f.write(chapter_name)
                f.write('\n\n')
                f.write(content)
                f.write('\n\n')
            # Persist "chapters completed" as an absolute count so a resumed
            # run slices the index list correctly in book_info().
            with open('save_index.txt', 'w') as f:
                f.write(str(self._start_index + index + 1))

    def my_thread(self, workers=1):
        """Run book_save() on `workers` threads.

        `workers` defaults to 1, preserving the original hard-coded
        `range(1)` behaviour while allowing real fan-out.
        """
        pool = [threading.Thread(target=self.book_save)
                for _ in range(workers)]
        for t in pool:
            t.start()
if __name__ == '__main__':
    # Single-threaded run; call my_thread() instead for the threaded variant.
    downloader = Duoxian()
    downloader.book_save()
# NOTE(review): the two lines that were here were Gitee content-moderation
# footer text ("content may be unsuitable for display... you may appeal")
# accidentally saved into the source. They are not Python and broke the
# module, so they have been reduced to this comment.