# -*- coding: utf-8 -*-
# @Time : 2021/1/26 22:25
# @Author : Zyh
# @File : test4.py
# @Software : PyCharm
import urllib.request, urllib.error
from bs4 import BeautifulSoup
import re
import xlwt
from operator import itemgetter, attrgetter  # only referenced by the commented-out experiments below
def main():
    url1 = "https://www.dota2.com.cn/heroes"
    url2 = "http://dotamax.com/hero/"
    # askURl(url)
    # x = getData(url)
    html = findHtml(url1)          # hero page links from the official site
    heroname = FindHeroName(url2)  # hero names and portrait images from dotamax
    savePath(heroname, html)
    # x = getData(heroUrllist)
# --------------------- regular expression patterns ---------------
# re.findall only returns what is captured inside the parentheses
# ------------------ hero link ---------------
# <a
# findLink = re.compile(r'<a class="heroPickerIconLink" href="(.*?)" id="(.*?)" target="_blank">')
findLink = re.compile(r'<a class="heroPickerIconLink.*" href="(.*?)".*>', re.S)
# findLink = re.compile(r'<div.*class="hero-list-hero Unused-Hero".*onclick="DoNav();">',re.S)
# <option
# findLink = re.compile(r'<select onchange="updateFilters()" id="filterName" class="filterSelect noCaps" name="">',re.S)
# -----------------------------------
# --------------------- portrait image -------------
findImgSrc = re.compile(r'<img class="hero-hover opacity-img img-shadow" src="(.*?)">', re.S)  # re.S lets "." match newlines, so tags that wrap across lines still match
# --------------------------------------
# --------------------- hero name
# findHero = re.compile(r'<option value="(.*?)">(.*)</option>')
findHero = re.compile(r'div class="hero-list-hero Unused-Hero" id="(.*)" onclick=".*/hero/detail/(.*?)/.*">', re.S)
# -----------------------------------------------------------------
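# Hypothetical markup fragments the patterns above are assumed to match; the live
# pages may differ, so treat these as illustrative only:
#   findLink  : <a class="heroPickerIconLink" href="https://www.dota2.com.cn/hero/xxx" ...>
#   findImgSrc: <img class="hero-hover opacity-img img-shadow" src="http://.../xxx.jpg">
#   findHero  : <div class="hero-list-hero Unused-Hero" id="英雄中文名"
#                onclick="...'/hero/detail/english-name/'...">
#               -> captures (Chinese name from id, English slug from the detail URL)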
# Scrape the hero overview page and collect the hero links
def findHtml(baseurl):
    html = askURl(baseurl)
    datalist = []
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all("a"):
        item = str(item)
        link = re.findall(findLink, item)
        if link:
            datalist.append(link[0])  # keep the captured href string, not the whole match list
    datalist.sort(key=lambda k: k.lower())
    # print(datalist[1])
    # print(len(datalist))
    return datalist
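# Assumed shape of the list findHtml returns (the URLs below are purely illustrative,
# the real values depend on the live page):
#   ['https://www.dota2.com.cn/hero/abaddon', 'https://www.dota2.com.cn/hero/axe', ...]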
def FindHeroName(baseurl):
    html = askURl(baseurl)
    datalist = []
    soup = BeautifulSoup(html, "html.parser")
    # ------ hero name & image ------
    # i = 1
    for item in soup.find_all("div", class_="hero-list-hero Unused-Hero"):
        data = []
        item = str(item)
        name = re.findall(findHero, item)
        data.append(name[0][1])  # English name (slug from the detail URL)
        data.append(name[0][0])  # Chinese name (from the id attribute)
        imgSrc = re.findall(findImgSrc, item)
        data.append(imgSrc[0] if imgSrc else "")  # portrait URL; fall back to "" so every row has three columns
        datalist.append(data)
    # print(datalist[0][1])
    datalist.sort()
    # print(datalist)
    # print(len(datalist))
    return datalist
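# Assumed shape of each FindHeroName entry (illustrative values only):
#   ['anti-mage', '敌法师', 'http://.../anti-mage.jpg']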
# y = sorted(datalist,key=itemgetter(0).lower(),reverse=True)
# print(datalist)
# print(heroUrl)
# -----------------html-------
# data = []
# for item in soup.find_all("option"):
# item = str(item)
# heroname = re.findall(findHero,item)
# data = data + heroname
# heroName = data[17:]
# #---- sort by the first element of each tuple
# heroName.sort(key=lambda k:k[0])
# x = sorted(heroName,key=itemgetter(0),reverse=True)
# print(heroName)
# print(len(heroName))
# print(type(heroName[0]))
# print(heroName[0])
# for i,item in enumerate(heroName):
# datalist.insert(i,item)
# i+=1
# print(datalist)
# for i, item in enumerate(heroName):
# print(i,item)
# for i, item in enumerate(datalist):
# print(i,item)
# for i,item in enumerate(heroName):
# print(i,heroName.pop(),datalist.pop())
# for i,item in enumerate(datalist):
# print(i,item)
# print(datalist)
# print(heroName)
# for item,item2 in datalist,heroName:
# data = item.insert(i,item2)
# i+=1
# html.append(data)
# print(html)
# -----------------------------
# return heroUrl
def getData(urlList):
    datalist = []
    for item in urlList:
        html = askURl(item)
        # parse the data
        soup = BeautifulSoup(html, "html.parser")
        # ------- parse the hero detail html and collect the data (not implemented yet) -------
    return datalist
# --------- def getData end ----------
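# getData above is only a stub. A minimal sketch of one way it could be filled in,
# assuming each hero detail page is fetched and parsed; the fields kept here are
# hypothetical and not part of the original script:
#   for item in urlList:
#       html = askURl(item)
#       soup = BeautifulSoup(html, "html.parser")
#       title = soup.title.string if soup.title else ""
#       datalist.append([item, title])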
# ---------- fetch the page at a given URL ----------
def askURl(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    request = urllib.request.Request(url=url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
    return html
# ------------------ def askURl end --------------
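# Example use of askURl, as implemented above: returns the decoded page source on
# success, or an empty string if the request raised a URLError.
#   page = askURl("https://www.dota2.com.cn/heroes")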
def savePath(hero, html):
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('Dota2Hero', cell_overwrite_ok=False)
    col = ("英雄英文名", "英雄中文名", "英雄图片", "网址")  # English name, Chinese name, portrait, link
    for i in range(0, len(col)):
        sheet.write(0, i, col[i])
    for i in range(0, len(hero)):
        data = hero[i]
        for j in range(0, 3):
            sheet.write(i + 1, j, data[j])
    for i in range(0, len(html)):
        sheet.write(i + 1, 3, html[i])
    book.save('test.xls')
    print("success")
if __name__ == '__main__':
    main()