zhihu-reptile
/
ZhihuSpider.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import xlwt
from openpyxl import Workbook
import ddddocr, requests
import cv2
from faker import Factory
from openpyxl import load_workbook
import os
from user_agents import parse
import logging
import urllib.parse as urlParse


######## 个人本地配置 ########
保存文件的路径 = '/Users/flyingwhale/Downloads/知乎文章爬取/' # 末尾需要包含/
免登陆插件的路径 = '/Users/flyingwhale/Downloads/crx/知乎免登陆插件.crx' # 免登陆插件完整绝对路径
是否扫描登录 = '是' #枚举：是、否；如果是，则需要在页面打开30秒内扫描登录，如果否，则启用免登陆插件
是否无头爬取 = '否' #枚举：是、否，如果是，则不展示游览器页面无头爬取（此时无法扫码登录）
是否模拟点击下一页 = '是' #枚举：是、否；如果是，则模拟点击下一页爬取文章（小概率无法点击跳转下一页），如果否，则通过拼页码参数直接跳转实现翻页（个别用户可能无法爬取）
爬取的用户文章地址列表 = [
    # "https://www.zhihu.com/people/40-70-53-32-43/posts",
    # "https://www.zhihu.com/people/mu-yi-95-56/posts",
    # "https://www.zhihu.com/people/bao-xian-da-shi-97/posts",
    # "https://www.zhihu.com/people/wang-meng-meng-58-69/posts",
    # "https://www.zhihu.com/people/alicehu-xing/posts",
    # "https://www.zhihu.com/people/vobaowo-bao/posts",
    # "https://www.zhihu.com/people/liang-shu-shuo-bao-xian/posts",
    # "https://www.zhihu.com/people/fan-xiao-bei-bao-xian-jing-ji/posts",
    # "https://www.zhihu.com/people/xiao-zhou-lao-shi-shuo-bao-xian/posts",
    # "https://www.zhihu.com/people/li-zhan-sheng-44/posts",
    "https://www.zhihu.com/people/xie-xian-sheng-51-65/posts",

    # "https://www.zhihu.com/people/jia-zhuang-bao-biao-46/posts",
    # "https://www.zhihu.com/people/frankchm/posts",
    # "https://www.zhihu.com/people/glad-20-70/posts",
    # "https://www.zhihu.com/people/xiao-yu-er-32-25-57/posts",
    "https://www.zhihu.com/people/pengxian-sen-3/posts",
    "https://www.zhihu.com/people/93-95-95-31/posts",


    # "https://www.zhihu.com/people/luo-xing-hai-30/posts",
    "https://www.zhihu.com/people/84-14-85-10-4/posts",
    "https://www.zhihu.com/people/baoxianjiaofu/posts",


    "https://www.zhihu.com/people/qian-qian-qian-qian-77-93/posts",

    # "https://www.zhihu.com/people/96-94-68-21/posts",
    # "https://www.zhihu.com/people/gao-bao-xian-de-ao-te-man/posts",
    # "https://www.zhihu.com/people/zhang-ling-61-42/posts",
    # "https://www.zhihu.com/people/bao-xian-da-bai-hua/posts",
    # "https://www.zhihu.com/people/duochao/posts",
    # "https://www.zhihu.com/people/li-ying-92-67-68/posts",
    # "https://www.zhihu.com/people/jing-bo-ran-83/posts",
    # "https://www.zhihu.com/people/hua-jie-de-bao-xian-ji/posts",
    # "https://www.zhihu.com/people/tang-chen-yang-89/posts",
    # "https://www.zhihu.com/people/17-56-95-65/posts",
    # "https://www.zhihu.com/people/xiang-gang-bao-xian-ying-bao/posts",
    # "https://www.zhihu.com/people/chen-jun-55-97-37/posts",
    # "https://www.zhihu.com/people/xiang-gang-bao-xian-ke-pu/posts",
    # "https://www.zhihu.com/people/chen-dori/posts",
    # "https://www.zhihu.com/people/carolinetao/posts",
    # "https://www.zhihu.com/people/yang-sheng-5-38/posts",
    # "https://www.zhihu.com/people/chi-hai-yuan/posts",
    # "https://www.zhihu.com/people/xiao-xiong-40-84/posts",
    # "https://www.zhihu.com/people/alice-5-92-15-49/posts",
    # "https://www.zhihu.com/people/7hao-49-92/posts",
    # "https://www.zhihu.com/people/shouxihebaoren/posts",
    # "https://www.zhihu.com/people/reseted1506068936/posts",
    # "https://www.zhihu.com/people/xiao-ke-ai-9-88-43/posts",
    "https://www.zhihu.com/people/ma-zha-44-37/posts",
    "https://www.zhihu.com/people/shui-mi-tao-96-46/posts",
    "https://www.zhihu.com/people/ceng-peng-xin-13431810119/posts",
    "https://www.zhihu.com/people/zaki-56-67/posts",
    "https://www.zhihu.com/people/huang-xiao-ze-4/posts",
    "https://www.zhihu.com/people/zouml/posts",
    "https://www.zhihu.com/people/wang-da-fu-45-48/posts",
    "https://www.zhihu.com/people/zhang-jian-1-41-50/posts",
    "https://www.zhihu.com/people/renee-hou-64/posts",
    "https://www.zhihu.com/people/zhongguogepai/posts",
    "https://www.zhihu.com/people/er-xiao-jie-shuo-xian/posts",
    "https://www.zhihu.com/people/jdmbest-43/posts",
    "https://www.zhihu.com/people/bao-xian-cha-cha-50/posts",
    "https://www.zhihu.com/people/chao-ji-ma-li-2021/posts",
    "https://www.zhihu.com/people/chao-ji-ma-li-2020/posts",
    "https://www.zhihu.com/people/chao-ji-ma-li-2022/posts"
    "https://www.zhihu.com/people/bao-xian-ji-ke-92/posts",
    "https://www.zhihu.com/people/da-tong-bao-xian-fu-wu-7/posts",
    "https://www.zhihu.com/people/qi-e-wen-bao/posts",
    "https://www.zhihu.com/people/ping-an-jian-kang-bao-xian/posts",
    "https://www.zhihu.com/people/hui-ze-bao-xian-wang-35/posts",
    "https://www.zhihu.com/people/nai-ba-bao-cheng-xin-bao-xian/posts",
    "https://www.zhihu.com/people/yang-guang-bao-xian-sui-ebao/posts"
    "https://www.zhihu.com/people/tbo-shi-jiao-ni-mai-bao-xian/posts",
    "https://www.zhihu.com/people/zhong-min-bao-xian-wang/posts",
    "https://www.zhihu.com/people/shen-lan-bao-zhuan-xin-bao-xian/posts",
    "https://www.zhihu.com/people/zhuan-xin-bao-xian-ce-ping/posts"

    # 用户被限制访问
    # "https://www.zhihu.com/people/da-er-wen-2hao/posts",
    # 页面加载不全，已爬取页面所有内容 todo
    # "https://www.zhihu.com/people/liao-an-ping-37/posts/posts_by_votes",
    # 爬取中断
    # "https://www.zhihu.com/people/bao-xian-cha-cha/posts?page=96",  # 可以直接拼接page参数实现跳页爬取
    # 爬取中断
    # "https://www.zhihu.com/people/jiang-xiao-bai-16-94-98/posts?page=138",  # 正常不拼接page参数，从第一页开始爬取
]

########


def getPcUserAgent():
    userAgent = ''
    count = 0
    maxCount = 1000
    isGetPcUserAgent = False
    while(count < maxCount):
        userAgent = Factory.create().user_agent()
        userAgentModel = parse(userAgent)
        if userAgentModel != None and not userAgentModel.is_mobile and userAgentModel.is_pc:
            isGetPcUserAgent = True
            break
        count += 1
    return userAgent if isGetPcUserAgent else 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; Touch)'


chrome_options = webdriver.ChromeOptions()
# 爬虫伪装
chrome_options.add_argument(
    'user-agent=' + getPcUserAgent()
)
if 是否扫描登录 == '否':
    chrome_options.add_extension(免登陆插件的路径) #自己下载的crx路径
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('Accept-Encoding=gzip,deflate,br')
chrome_options.add_argument('Accept-Language=zh-CN,zh;q=0.9')
if 是否无头爬取 == '是':
    chrome_options.add_argument('--headless=new')
chrome_options.add_experimental_option('excludeSwitches',['enable-automation'])
driver = webdriver.Chrome(chrome_options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  "source": """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
})

os.environ["http_proxy"] = "HO4T8ANLNN7F680D:63B54387B2DE50D6@http-dyn.abuyun.com:9020"
# os.environ["http_proxy"] = "60.168.81.171:9999"

'''
1、登录知乎网站
2、获取爬取不同用户文章地址列表
3、爬取每个用户文章内容
'''
def work():
    # 1、登录知乎网站
    if 是否扫描登录 == '是':
        login()
    # 2、获取爬取不同用户文章地址列表
    urls = getDifferentUserArticleUrlList()
    # 3、爬取每个用户文章内容
    crawlingUserArticles(urls)

def login():
    # 进入登录页面
    driver.get('https://www.zhihu.com/signin?next=%2F')
    # 等待扫码登录成功
    try:
        userInfoTag = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "AppHeader-userInfo"))
        )
    except Exception:
        print('没有找到用户信息标签')
        raise


    # 通过手机号验证码登录需要滑动验证，不采用此方案
    # 输入手机号
    # mobile = '15579293801'
    # mobileInputElement = driver.find_element("xpath", '//input[@name="username"]')
    # mobileInputElement.send_keys(mobile)
    # time.sleep(1)
    # # 点击获取验证码
    # getVerifyCodeElement = driver.find_element("xpath", '//button[text()="获取短信验证码"]')
    # getVerifyCodeElement.click()
    # time.sleep(20)

def getDifferentUserArticleUrlList():
    return 爬取的用户文章地址列表


'''
    1、对每个用户文章循环处理
    2、滑动屏幕，点击所有阅读全文按钮
    3、爬取当前页面所有文章内容
    4、保存每页文章数据到excel
    5、跳转到下一页继续爬取
'''
def crawlingUserArticles(urlList):
    time.sleep(5)
    # 1、对每个用户文章循环处理
    # 处理每个用户的文章地址
    for url in urlList:
        try:
            crawlingUserArticle(url)
        except Exception as e:
            print('爬取文章失败！url：' + url)
            print('爬取文章异常%s' % e)
            logging.exception(e)


def crawlingUserArticle(url):
    # 跳转到用户文章tab栏
    print('=' * 30)
    print('进入用户文章地址：' + url)
    driver.get(url)
    # 2、滑动屏幕，点击所有阅读全文按钮
    # 一页一页地滑动屏幕，并尝试点击阅读全文
    head = ['标题', '文章链接', '赞同数', '头图链接', '正文内容', '正文html']
    data = []
    urlQueryDict = urlParse.parse_qs(urlParse.urlsplit(url).query)
    pageNo = 1 if urlQueryDict.get('page') == None else int(urlQueryDict.get('page')[0])
    totalPage = pageNo
    usernameTag = find_element(driver, "//h1[@class='ProfileHeader-title']/span[@class='ProfileHeader-name']")
    username = usernameTag.text if usernameTag != None else '未知'
    account = url.split('?')[0].split('/')[-2]
    isCalTotalPage = False
    try:
        while (pageNo <= totalPage):
            # 等到文章加载完成后开始进行爬取
            try:
                articleTitleTag = WebDriverWait(driver, 20).until(
                    EC.visibility_of_all_elements_located(
                        (By.XPATH, '/descendant-or-self::H2[contains(@class,"ContentItem-title")]//A[1]'))
                )
                time.sleep(5)
            except Exception:
                articleTag = find_element(driver,
                                          '//DIV[@id="Profile-posts"]/DIV[2]/div[not(contains(@class,"Pagination"))]')
                if not (articleTag != None and articleTag.text == '还没有文章'):
                    print('未正常加载出来文章')
                    isLogin = find_element(driver, "//div[@class='signFlowModal-container']")
                    if isLogin != None:
                        print("有登录弹窗")
                    raise
                print('没有更多文章')
                break

            # 获取当前页面总分页数
            paginationTagList = find_elements(driver, "//div[@class='Pagination']/button")
            if paginationTagList != None and len(paginationTagList) > 0 and isCalTotalPage == False:
                totalPage = int(paginationTagList[-1].text) if paginationTagList[-1].text != '下一页' else int(
                    paginationTagList[-2].text)
                isCalTotalPage = True
            # 读取当前文章分页面的数据内容，并计算需要爬取的文章
            totalArticleTagList = find_elements(driver,
                                           '//DIV[@id="Profile-posts"]/DIV[2]/div[not(contains(@class,"Pagination"))]')
            articleTagList = calNeedCrawlingArticleList(totalArticleTagList, username)
            if len(articleTagList) == 0:
                print('当前页%s内容均已爬取，跳过！' % str(pageNo))
                # 提前跳转下一页
                pageNo = goToNextPage(pageNo, totalPage, url)
                continue

            # 获取阅读全文按钮
            readMoreButtonList = find_elements(driver,
                                               "//main[@class='App-main']//div[@id='Profile-posts']/div[2]/div[not(contains(@class,'Pagination'))]//div[@class='RichContent-inner']/button")
            print('阅读全文按钮个数：' + str(len(readMoreButtonList)))
            for i in range(len(readMoreButtonList)):
                readMoreButton = readMoreButtonList[i]
                driver.execute_script("arguments[0].scrollIntoView();", readMoreButton)
                try:
                    readMoreTag = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, "//div[@class='RichContent-inner']/button"))
                    )
                    readMoreButton.click()
                    time.sleep(1)
                except Exception:
                    print('没有找到阅读全文标签')
                    raise
                print('获取点击阅读全文' + str(i))
                driver.implicitly_wait(0.5)
            # 3、爬取当前页面所有文章内容
            haveNoArticle = False
            print('文章总页数：' + str(totalPage))
            print('当前页面文章数量：%s，爬取的文章数量：%s' % (str(len(totalArticleTagList)), str(len(articleTagList))))
            for articleTag in articleTagList:
                if articleTag.text == '还没有文章':
                    haveNoArticle = True
                    break
                articleTitleTag = find_element(articleTag,
                                               './descendant-or-self::H2[contains(@class,"ContentItem-title")]//A[1]')
                agreeCountTag = find_element(articleTag,
                                             './descendant-or-self::BUTTON[contains(@class,"Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte")]')
                articleContentTag = find_element(articleTag, ".//div[@class='RichContent']")
                articleHeadImgTag = find_element(articleTag, './descendant-or-self::IMG[contains(@class,"css-1phd9a0")]')

                title = articleTitleTag.text if articleTitleTag != None else ''
                articleUrl = articleTitleTag.get_property('href') if articleTitleTag != None else ''
                agreeCount = agreeCountTag.text.replace('赞同', '').replace(' ',
                                                                            '') if agreeCountTag != None and agreeCountTag != '' else ''
                articleContent = articleContentTag.text if articleContentTag != None else ''
                articleHeadImgUrl = articleHeadImgTag.get_property('src') if articleHeadImgTag != None else ''
                articleHtml = articleTag.get_attribute("outerHTML")
                articleTotalHtml = getHtml(articleUrl) if articleUrl != '' else ''
                saveHtml(username, account, title, articleUrl.split('/')[-1], articleTotalHtml)
                data.append([title, articleUrl, agreeCount, articleHeadImgUrl, articleContent, articleHtml])
            if haveNoArticle:
                break
            # 4、保存每个用户文章数据到excel
            saveExcel2(username, account, head, data)
            # 5、跳转到下一页继续爬取
            pageNo = goToNextPage(pageNo, totalPage, url)


    except Exception as e:
        print('爬取文章发生异常%s' % e)
        logging.exception(e)


def goToNextPage(pageNo, totalPage, url):
    # 跳转到下一页，如果已到最后一页则终止
    pageNo += 1
    if pageNo > totalPage:
        return totalPage + 1
    nextPageUrl = url + '?page=' + str(pageNo) if '?page=' not in url else url.split('?')[0] + '?page=' + str(pageNo)
    print('跳转下一页：' + nextPageUrl)
    print('=' * 15)
    if 是否模拟点击下一页 == '否':
        driver.get(nextPageUrl)
    else:
        # 滑动到页面底部，点击下一页按钮，跳转下一页（找不到下一页，则说明爬取结束）
        nextPageTag = find_element(driver,
                                   "//BUTTON[contains(@class,'Button PaginationButton PaginationButton-next')]")
        if nextPageTag == None:
            print('当前用户文章已爬取完成！')
            return totalPage + 1
        driver.execute_script("arguments[0].scrollIntoView();", nextPageTag)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        nextPageTag2 = ''
        try:
            nextPageTag2 = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//BUTTON[contains(@class,'Button PaginationButton PaginationButton-next')]"))
            )
            time.sleep(1)
            nextPageTag2.click()
        except Exception:
            print('没有找到下一页标签，尝试script方式点击')
            driver.execute_script("arguments[0].click();", nextPageTag2)

    return pageNo


def find_element(elementDriver, xpath):
    element = None
    try:
        element = elementDriver.find_element("xpath", xpath)
    except Exception:
        print('未找到元素！xpath' + xpath)
    return element

def find_elements(elementDriver, xpath):
    elements = None
    try:
        elements = elementDriver.find_elements("xpath", xpath)
    except Exception:
        print('未找到元素！xpath' + xpath)
    return elements


def calNeedCrawlingArticleList(articleTagList, username):
    filePath = 保存文件的路径 + username + '-知乎文章.xlsx'

    if not os.path.exists(filePath):
        return


    wb = load_workbook(filePath)
    table = wb.active
    articleUrlList = getExcelColText(table, 1)[1:]
    articleTagOutputList = []
    for articleTag in articleTagList:
        articleTitleTag = find_element(articleTag,
                                       './descendant-or-self::H2[contains(@class,"ContentItem-title")]//A[1]')
        if articleTitleTag == None:
            print('计算需要爬取的文章项异常！username:%s' % username)
            return articleTagList
        articleUrl = articleTitleTag.get_property('href') if articleTitleTag != None else ''
        if articleUrl != '' and articleUrl not in articleUrlList:
            articleTagOutputList.append(articleTag)
    return articleTagOutputList


def get_distance(bg_url, front_url):
    slide = ddddocr.DdddOcr(det=False, ocr=False)
    background_bytes = requests.get(bg_url).content
    target_bytes = requests.get(front_url).content
    res = slide.slide_match(target_bytes, background_bytes, simple_target=True)
    print(res)
    slider_distance = res.get("target")[0]
    # 这里需要在识别出来的距离上+10，应该是跟图片大小有关导致的，需要自己去进行调整
    slider_distance = slider_distance + 10
    return slider_distance

def test():
    bg_url = "https://necaptcha.nosdn.127.net/e0c39e43553447eb9fdb8a47f4e3084f@2x.jpg"
    front_url = "https://necaptcha.nosdn.127.net/27453be6d47f4468884257df88aab056@2x.png"
    slider_distance = get_distance(bg_url, front_url)
    print(slider_distance)

def cal(targetX, width):
    return targetX/(320 / width) - 22

def cal2(targetX, width):
    return targetX/(width / 320) - 81


def saveHtml(username, account, title, urlId, html):
    saveHtmlPath = 保存文件的路径 + 'html/' + username + '-' + account + '/'
    fileAbsolutePath = saveHtmlPath + username + '-' + urlId + '-知乎文章.html'
    fileSecondAbsolutePath = saveHtmlPath + username + '-' + title + '-' + urlId + '-知乎文章.html'
    if not os.path.exists(saveHtmlPath):
        os.makedirs(saveHtmlPath)

    if html == '':
        print('爬取的文章html为空,username:%s,title:%s' % (username, title) )

    if os.path.exists(fileAbsolutePath) or os.path.exists(fileSecondAbsolutePath):
        return

    fileHandle = open(fileAbsolutePath, mode='w')
    fileHandle.write(html)
    fileHandle.close()

def saveExcel(username, head, data):
    workbook = xlwt.Workbook(encoding='utf-8')
    table = workbook.add_sheet('知乎文章')
    for i in range(len(head)):
        table.write(0, i, head[i])

    for i in range(len(data)):
        for j in range(len(data[i])):
            table.write(i + 1, j, data[i][j])
    workbook.save(保存文件的路径)

def saveExcel2(username, account, head, data):
    if len(data) == 0:
        return

    fileAbsolutePath = 保存文件的路径 + username + '-知乎文章.xlsx'

    if os.path.exists(fileAbsolutePath):
        updateExistExcel(fileAbsolutePath, username, head, data)
    else:
        saveNewExcel(fileAbsolutePath, username, head, data)

def saveNewExcel(filePath, username, head, data):
    wb = Workbook()
    table = wb.active
    for i in range(1, len(head) + 1):
        if head[i-1] != '':
            table.cell(row = 1, column = i, value = head[i-1])

    for i in range(2, 1 + len(data)):
        for j in range(1, 1 + len(data[i-2])):
            if data[i-2][j-1] != '':
                table.cell(row = i, column = j, value = data[i-2][j-1])
    wb.save(filePath)

def updateExistExcel(filePath, username, head, data):
    wb = load_workbook(filePath)
    table = wb.active
    head = getExcelRowText(table, 0)
    maxRow = table.max_row
    maxColumn = table.max_column
    writeNewStartRow = maxRow + 1
    articleUrlList = getExcelColText(table, 1)[1:]
    for i in range(len(data)):
        articleUrl = data[i][1]
        if (articleUrl in articleUrlList
                and table.cell(articleUrlList.index(articleUrl) + 2, 6).value != None
                and table.cell(articleUrlList.index(articleUrl) + 2, 6).value != ''):
            continue
        for j in range(len(data[i])):
            if articleUrl in articleUrlList and not (
                    table.cell(articleUrlList.index(articleUrl) + 2, 6).value != None
                    and table.cell(articleUrlList.index(articleUrl) + 2, 6).value != ''
            ):
                oldDataRow = articleUrlList.index(articleUrl) + 2
                table.cell(oldDataRow, j+1, data[i][j])
            else:
                table.cell(writeNewStartRow, j + 1, data[i][j])
                articleUrlList.append(articleUrl)
                writeNewStartRow += 1
    wb.save(filePath)

'''
    获取excel列数据，坐标从0开始
'''
def getExcelColText(table, colNum):
    return [col.value for col in [col for col in table.columns][colNum]]

'''
    获取excel行数据，坐标从0开始
'''
def getExcelRowText(table, rowNum):
    return [row.value for row in [row for row in table.rows][rowNum]]


def getHtml(url):
    response = requests.get(url)
    html = response.text
    return html

if __name__ == '__main__':
    print('start')
    work()
    driver.quit()
    print('ok')