# -*- coding: utf-8 -*-
# @Time : 2021/1/26 22:25
# @Author : Zyh
# @File : test4.py
# @Software : PyCharm
import urllib.request, urllib.error
from bs4 import BeautifulSoup
import re
import xlwt
from operator import itemgetter, attrgetter  # only referenced by the commented-out experiments below
def main():
    url1 = "https://www.dota2.com.cn/heroes"
    url2 = "http://dotamax.com/hero/"
    # askURl(url)
    # x = getData(url)
    html = findHtml(url1)          # hero page links from the official site
    heroname = FindHeroName(url2)  # hero names and portrait images from dotamax
    savePath(heroname, html)
    # x = getData(heroUrllist)
# --------------------- regular expression patterns ---------------
# re.findall only returns what is captured inside the parentheses
# ------------------ hero link ---------------
# <a
# findLink = re.compile(r'<a class="heroPickerIconLink" href="(.*?)" id="(.*?)" target="_blank">')
findLink = re.compile(r'<a class="heroPickerIconLink.*" href="(.*?)".*>', re.S)
# findLink = re.compile(r'<div.*class="hero-list-hero Unused-Hero".*onclick="DoNav();">',re.S)
# <option
# findLink = re.compile(r'<select onchange="updateFilters()" id="filterName" class="filterSelect noCaps" name="">',re.S)
# -----------------------------------
# --------------------- portrait image -------------
findImgSrc = re.compile(r'<img class="hero-hover opacity-img img-shadow" src="(.*?)">', re.S)  # re.S lets "." match newlines, so tags that wrap across lines still match
# --------------------------------------
# --------------------- hero name
# findHero = re.compile(r'<option value="(.*?)">(.*)</option>')
findHero = re.compile(r'div class="hero-list-hero Unused-Hero" id="(.*)" onclick=".*/hero/detail/(.*?)/.*">', re.S)
# -----------------------------------------------------------------
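# Hypothetical markup fragments the patterns above are assumed to match; the live
# pages may differ, so treat these as illustrative only:
#   findLink  : <a class="heroPickerIconLink" href="https://www.dota2.com.cn/hero/xxx" ...>
#   findImgSrc: <img class="hero-hover opacity-img img-shadow" src="http://.../xxx.jpg">
#   findHero  : <div class="hero-list-hero Unused-Hero" id="英雄中文名"
#                onclick="...'/hero/detail/english-name/'...">
#               -> captures (Chinese name from id, English slug from the detail URL)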
# Scrape the hero overview page and collect the hero links
def findHtml(baseurl):
    html = askURl(baseurl)
    datalist = []
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all("a"):
        item = str(item)
        link = re.findall(findLink, item)
        if link:
            datalist.append(link[0])  # keep the captured href string, not the whole match list
    datalist.sort(key=lambda k: k.lower())
    # print(datalist[1])
    # print(len(datalist))
    return datalist
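# Assumed shape of the list findHtml returns (the URLs below are purely illustrative,
# the real values depend on the live page):
#   ['https://www.dota2.com.cn/hero/abaddon', 'https://www.dota2.com.cn/hero/axe', ...]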
def FindHeroName(baseurl):
    html = askURl(baseurl)
    datalist = []
    soup = BeautifulSoup(html, "html.parser")
    # ------ hero name & image ------
    # i = 1
    for item in soup.find_all("div", class_="hero-list-hero Unused-Hero"):
        data = []
        item = str(item)
        name = re.findall(findHero, item)
        data.append(name[0][1])  # English name (slug from the detail URL)
        data.append(name[0][0])  # Chinese name (from the id attribute)
        imgSrc = re.findall(findImgSrc, item)
        data.append(imgSrc[0] if imgSrc else "")  # portrait URL; fall back to "" so every row has three columns
        datalist.append(data)
    # print(datalist[0][1])
    datalist.sort()
    # print(datalist)
    # print(len(datalist))
    return datalist
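# Assumed shape of each FindHeroName entry (illustrative values only):
#   ['anti-mage', '敌法师', 'http://.../anti-mage.jpg']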
# y = sorted(datalist,key=itemgetter(0).lower(),reverse=True)
# print(datalist)
# print(heroUrl)
# -----------------html-------
# data = []
# for item in soup.find_all("option"):
# item = str(item)
# heroname = re.findall(findHero,item)
# data = data + heroname
# heroName = data[17:]
# #---- sort by the first element of each tuple
# heroName.sort(key=lambda k:k[0])
# x = sorted(heroName,key=itemgetter(0),reverse=True)
# print(heroName)
# print(len(heroName))
# print(type(heroName[0]))
# print(heroName[0])
# for i,item in enumerate(heroName):
# datalist.insert(i,item)
# i+=1
# print(datalist)
# for i, item in enumerate(heroName):
# print(i,item)
# for i, item in enumerate(datalist):
# print(i,item)
# for i,item in enumerate(heroName):
# print(i,heroName.pop(),datalist.pop())
# for i,item in enumerate(datalist):
# print(i,item)
# print(datalist)
# print(heroName)
# for item,item2 in datalist,heroName:
# data = item.insert(i,item2)
# i+=1
# html.append(data)
# print(html)
# -----------------------------
# return heroUrl
def getData(urlList):
    datalist = []
    for item in urlList:
        html = askURl(item)
        # parse the data
        soup = BeautifulSoup(html, "html.parser")
        # ------- parse the hero detail html and collect the data (not implemented yet) -------
    return datalist
# --------- def getData end ----------
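# getData above is only a stub. A minimal sketch of one way it could be filled in,
# assuming each hero detail page is fetched and parsed; the fields kept here are
# hypothetical and not part of the original script:
#   for item in urlList:
#       html = askURl(item)
#       soup = BeautifulSoup(html, "html.parser")
#       title = soup.title.string if soup.title else ""
#       datalist.append([item, title])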
# ---------- fetch the page at a given URL ----------
def askURl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
    return html
# ------------------ def askURl end --------------
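# Example use of askURl, as implemented above: returns the decoded page source on
# success, or an empty string if the request raised a URLError.
#   page = askURl("https://www.dota2.com.cn/heroes")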
def savePath(hero, html):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('Dota2Hero', cell_overwrite_ok=False)
    col = ("英雄英文名", "英雄中文名", "英雄图片", "网址")  # English name, Chinese name, portrait, link
    for i in range(0, len(col)):
        sheet.write(0, i, col[i])
    for i in range(0, len(hero)):
        data = hero[i]
        for j in range(0, 3):
            sheet.write(i + 1, j, data[j])
    for i in range(0, len(html)):
        sheet.write(i + 1, 3, html[i])
    book.save('test.xls')
    print("success")
if __name__ == '__main__':
    main()