master

分支 (1)

管理

管理

master

crawler_download_hd_pictures
/
html_parse.py

#! /usr/bin/env python3
# coding=utf-8

from bs4 import BeautifulSoup
import re

class Parse:
	"""Extract links from downloaded HTML pages using BeautifulSoup.

	Two entry points are provided:

	* ``Parser``  - collects one token from every ``<span class="plid">``
	  element on a page and returns them as a set.
	* ``Parser2`` - returns the ``href`` of the
	  ``<a class="original-file-changed" id="highres">`` element on a page.
	"""

	def _make_soup(self, html):
		"""Parse *html* with the built-in ``html.parser`` backend."""
		# NOTE(review): per the bs4 docs, from_encoding only matters when
		# *html* is bytes; for str input it is ignored -- confirm what the
		# downloader actually passes in.
		return BeautifulSoup(html, "html.parser", from_encoding="utf-8")

	def Parser(self, new_url, html):
		"""Return the set of links found in the ``plid`` spans of *html*.

		Args:
			new_url: Address of the page being parsed. Currently unused;
				kept so existing callers keep working.
			html: Markup of the page.

		Returns:
			A set of strings (only the set -- no second value is returned).
		"""
		return self._get_new_urls(self._make_soup(html))

	def _get_new_urls(self, soup):
		"""Collect the second whitespace-separated token of each ``plid`` span.

		Spans whose text has fewer than two tokens are skipped instead of
		raising ``IndexError``, so one malformed entry does not discard the
		rest of the page.
		"""
		new_urls = set()

		for span in soup.find_all("span", class_="plid"):
			tokens = span.get_text().split()
			# presumably the text is "<label> <url>"; verify against the site
			if len(tokens) > 1:
				new_urls.add(tokens[1])

		return new_urls

	def Parser2(self, url, html):
		"""Return the ``highres`` link target found in *html*.

		Args:
			url: Address of the page being parsed. Currently unused;
				kept so existing callers keep working.
			html: Markup of the page.

		Returns:
			The ``href`` value as a string, or ``None`` when the page has no
			matching anchor (or the anchor has no ``href``).
		"""
		return self._get_new_urls2(self._make_soup(html))

	def _get_new_urls2(self, soup):
		"""Return the ``href`` of ``<a class="original-file-changed" id="highres">``.

		``soup.find`` yields ``None`` when nothing matches; guard against it
		so a page without the anchor gives ``None`` rather than a
		``TypeError`` from subscripting ``None``.
		"""
		link = soup.find("a", class_="original-file-changed", id="highres")
		if link is None:
			return None

		# .get() also covers an anchor that lacks the href attribute.
		return link.get("href")