# 代码拉取完成,页面将自动刷新
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import re
import urlparse
class HtmlParser(object):
    """Extract follow-up links and entry data from a fetched HTML page.

    The link filter (``/item``) and the selectors for the title
    (``dd.lemmaWgt-lemmaTitle-title > h1``) and the summary
    (``div.lemma-summary``) are hard-coded in the methods below.
    """

    def _get_new_urls(self, new_url, soup):
        """Return the absolute URLs of every ``/item`` link found in *soup*.

        Args:
            new_url: URL of the page being parsed; relative hrefs are
                resolved against it.
            soup: Parsed document exposing ``find_all``.

        Returns:
            A set of absolute URL strings (duplicate links collapse).
        """
        links = soup.find_all('a', href=re.compile(r'/item'))
        return {urlparse.urljoin(new_url, link['href']) for link in links}

    @staticmethod
    def _node_text(node):
        """Return the text of *node*, or an empty string when it is None."""
        return node.get_text() if node is not None else u''

    def _get_new_data(self, new_url, soup):
        """Collect the URL, title and summary of the page held in *soup*.

        Args:
            new_url: URL of the page being parsed.
            soup: Parsed document exposing ``find``.

        Returns:
            A dict with the keys ``'url'``, ``'title'`` and ``'summary'``.
            ``'title'`` / ``'summary'`` are empty strings when the page
            does not contain the corresponding node.
        """
        res_data = {'url': new_url}

        # <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1>
        # ``find`` yields None for a missing node, so each step of the
        # lookup is guarded instead of being chained blindly.
        title_box = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        title_node = title_box.find('h1') if title_box is not None else None
        res_data['title'] = self._node_text(title_node)

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = self._node_text(summary_node)

        return res_data

    def parse(self, new_url, html_cont):
        """Parse *html_cont* and return the links and data found in it.

        Args:
            new_url: URL the content was fetched from.
            html_cont: Raw HTML of the page.

        Returns:
            ``None`` when either argument is ``None``; otherwise a tuple
            ``(new_urls, new_data)`` where ``new_urls`` is the set built by
            ``_get_new_urls`` and ``new_data`` the dict built by
            ``_get_new_data``.
        """
        if new_url is None or html_cont is None:
            return None

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(new_url, soup)
        new_data = self._get_new_data(new_url, soup)
        return new_urls, new_data
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。