叶乐敏/知乎爬虫 (Zhihu crawler)

This repository declares no open-source license file (LICENSE); check the project description and its upstream code dependencies before use.
ZhihuSpider.py 22.73 KB
yelemin committed on 2023-09-13 15:21: improve the crawl-and-fix logic
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import xlwt
from openpyxl import Workbook
import ddddocr, requests
import cv2
from faker import Factory
from openpyxl import load_workbook
import os
from user_agents import parse
import logging
import urllib.parse as urlParse
######## Personal local configuration ########
保存文件的路径 = '/Users/flyingwhale/Downloads/知乎文章爬取/' # must end with a /
免登陆插件的路径 = '/Users/flyingwhale/Downloads/crx/知乎免登陆插件.crx' # full absolute path to the login-free .crx extension
是否扫描登录 = '是' # enum: 是 / 否; if 是, scan the QR code to log in within 30 seconds of the page opening; if 否, the login-free extension is used instead
是否无头爬取 = '否' # enum: 是 / 否; if 是, crawl headlessly without showing the browser window (QR-code login is then impossible)
是否模拟点击下一页 = '是' # enum: 是 / 否; if 是, page forward by simulating clicks on the next-page button (occasionally the click fails); if 否, page forward by appending a page parameter to the URL (a few users cannot be crawled this way)
爬取的用户文章地址列表 = [
# "https://www.zhihu.com/people/40-70-53-32-43/posts",
# "https://www.zhihu.com/people/mu-yi-95-56/posts",
# "https://www.zhihu.com/people/bao-xian-da-shi-97/posts",
# "https://www.zhihu.com/people/wang-meng-meng-58-69/posts",
# "https://www.zhihu.com/people/alicehu-xing/posts",
# "https://www.zhihu.com/people/vobaowo-bao/posts",
# "https://www.zhihu.com/people/liang-shu-shuo-bao-xian/posts",
# "https://www.zhihu.com/people/fan-xiao-bei-bao-xian-jing-ji/posts",
# "https://www.zhihu.com/people/xiao-zhou-lao-shi-shuo-bao-xian/posts",
# "https://www.zhihu.com/people/li-zhan-sheng-44/posts",
"https://www.zhihu.com/people/xie-xian-sheng-51-65/posts",
# "https://www.zhihu.com/people/jia-zhuang-bao-biao-46/posts",
# "https://www.zhihu.com/people/frankchm/posts",
# "https://www.zhihu.com/people/glad-20-70/posts",
# "https://www.zhihu.com/people/xiao-yu-er-32-25-57/posts",
"https://www.zhihu.com/people/pengxian-sen-3/posts",
"https://www.zhihu.com/people/93-95-95-31/posts",
# "https://www.zhihu.com/people/luo-xing-hai-30/posts",
"https://www.zhihu.com/people/84-14-85-10-4/posts",
"https://www.zhihu.com/people/baoxianjiaofu/posts",
"https://www.zhihu.com/people/qian-qian-qian-qian-77-93/posts",
# "https://www.zhihu.com/people/96-94-68-21/posts",
# "https://www.zhihu.com/people/gao-bao-xian-de-ao-te-man/posts",
# "https://www.zhihu.com/people/zhang-ling-61-42/posts",
# "https://www.zhihu.com/people/bao-xian-da-bai-hua/posts",
# "https://www.zhihu.com/people/duochao/posts",
# "https://www.zhihu.com/people/li-ying-92-67-68/posts",
# "https://www.zhihu.com/people/jing-bo-ran-83/posts",
# "https://www.zhihu.com/people/hua-jie-de-bao-xian-ji/posts",
# "https://www.zhihu.com/people/tang-chen-yang-89/posts",
# "https://www.zhihu.com/people/17-56-95-65/posts",
# "https://www.zhihu.com/people/xiang-gang-bao-xian-ying-bao/posts",
# "https://www.zhihu.com/people/chen-jun-55-97-37/posts",
# "https://www.zhihu.com/people/xiang-gang-bao-xian-ke-pu/posts",
# "https://www.zhihu.com/people/chen-dori/posts",
# "https://www.zhihu.com/people/carolinetao/posts",
# "https://www.zhihu.com/people/yang-sheng-5-38/posts",
# "https://www.zhihu.com/people/chi-hai-yuan/posts",
# "https://www.zhihu.com/people/xiao-xiong-40-84/posts",
# "https://www.zhihu.com/people/alice-5-92-15-49/posts",
# "https://www.zhihu.com/people/7hao-49-92/posts",
# "https://www.zhihu.com/people/shouxihebaoren/posts",
# "https://www.zhihu.com/people/reseted1506068936/posts",
# "https://www.zhihu.com/people/xiao-ke-ai-9-88-43/posts",
"https://www.zhihu.com/people/ma-zha-44-37/posts",
"https://www.zhihu.com/people/shui-mi-tao-96-46/posts",
"https://www.zhihu.com/people/ceng-peng-xin-13431810119/posts",
"https://www.zhihu.com/people/zaki-56-67/posts",
"https://www.zhihu.com/people/huang-xiao-ze-4/posts",
"https://www.zhihu.com/people/zouml/posts",
"https://www.zhihu.com/people/wang-da-fu-45-48/posts",
"https://www.zhihu.com/people/zhang-jian-1-41-50/posts",
"https://www.zhihu.com/people/renee-hou-64/posts",
"https://www.zhihu.com/people/zhongguogepai/posts",
"https://www.zhihu.com/people/er-xiao-jie-shuo-xian/posts",
"https://www.zhihu.com/people/jdmbest-43/posts",
"https://www.zhihu.com/people/bao-xian-cha-cha-50/posts",
"https://www.zhihu.com/people/chao-ji-ma-li-2021/posts",
"https://www.zhihu.com/people/chao-ji-ma-li-2020/posts",
"https://www.zhihu.com/people/chao-ji-ma-li-2022/posts"
"https://www.zhihu.com/people/bao-xian-ji-ke-92/posts",
"https://www.zhihu.com/people/da-tong-bao-xian-fu-wu-7/posts",
"https://www.zhihu.com/people/qi-e-wen-bao/posts",
"https://www.zhihu.com/people/ping-an-jian-kang-bao-xian/posts",
"https://www.zhihu.com/people/hui-ze-bao-xian-wang-35/posts",
"https://www.zhihu.com/people/nai-ba-bao-cheng-xin-bao-xian/posts",
"https://www.zhihu.com/people/yang-guang-bao-xian-sui-ebao/posts"
"https://www.zhihu.com/people/tbo-shi-jiao-ni-mai-bao-xian/posts",
"https://www.zhihu.com/people/zhong-min-bao-xian-wang/posts",
"https://www.zhihu.com/people/shen-lan-bao-zhuan-xin-bao-xian/posts",
"https://www.zhihu.com/people/zhuan-xin-bao-xian-ce-ping/posts"
# user access is restricted
# "https://www.zhihu.com/people/da-er-wen-2hao/posts",
# page does not fully load; everything on the pages that did load has been crawled todo
# "https://www.zhihu.com/people/liao-an-ping-37/posts/posts_by_votes",
# crawl interrupted
# "https://www.zhihu.com/people/bao-xian-cha-cha/posts?page=96", # a page parameter can be appended directly to resume from a given page
# crawl interrupted
# "https://www.zhihu.com/people/jiang-xiao-bai-16-94-98/posts?page=138", # normally no page parameter is appended and crawling starts from page 1
]
########
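# getPcUserAgent: generate a random user agent with faker and verify it with user_agents, keeping only
# desktop (PC) user agents; fall back to a fixed desktop UA string if none is found within maxCount attempts.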
def getPcUserAgent():
userAgent = ''
count = 0
maxCount = 1000
isGetPcUserAgent = False
while(count < maxCount):
userAgent = Factory.create().user_agent()
userAgentModel = parse(userAgent)
if userAgentModel != None and not userAgentModel.is_mobile and userAgentModel.is_pc:
isGetPcUserAgent = True
break
count += 1
return userAgent if isGetPcUserAgent else 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; Touch)'
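# Browser setup: spoofed desktop user agent, optional login-free extension, and flags that reduce automation fingerprints.
# Note: the Accept-Encoding / Accept-Language values below are passed as plain Chrome arguments; Chrome most likely
# ignores them, since request headers cannot be set through add_argument.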
chrome_options = webdriver.ChromeOptions()
# disguise the crawler with a randomized desktop user agent
chrome_options.add_argument(
'user-agent=' + getPcUserAgent()
)
if 是否扫描登录 == '否':
chrome_options.add_extension(免登陆插件的路径) # path to the .crx extension you downloaded yourself
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('Accept-Encoding=gzip,deflate,br')
chrome_options.add_argument('Accept-Language=zh-CN,zh;q=0.9')
if 是否无头爬取 == '是':
chrome_options.add_argument('--headless=new')
chrome_options.add_experimental_option('excludeSwitches',['enable-automation'])
driver = webdriver.Chrome(chrome_options)
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
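# The http_proxy environment variable only affects the requests-based calls below (getHtml, get_distance);
# the Selenium-driven Chrome instance typically does not pick it up. The commented-out line is an alternative proxy.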
os.environ["http_proxy"] = "HO4T8ANLNN7F680D:63B54387B2DE50D6@http-dyn.abuyun.com:9020"
# os.environ["http_proxy"] = "60.168.81.171:9999"
'''
1. Log in to zhihu.com
2. Get the list of article-page URLs for the different users to crawl
3. Crawl every user's article content
'''
def work():
# 1. log in to zhihu.com
if 是否扫描登录 == '是':
login()
# 2. get the list of per-user article-page URLs to crawl
urls = getDifferentUserArticleUrlList()
# 3. crawl each user's article content
crawlingUserArticles(urls)
def login():
# go to the login page
driver.get('https://www.zhihu.com/signin?next=%2F')
# wait for QR-code login to succeed
try:
userInfoTag = WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.CLASS_NAME, "AppHeader-userInfo"))
)
except Exception:
print('没有找到用户信息标签')
raise
# logging in with a phone number + SMS code requires slider verification, so that approach is not used
# enter the phone number
# mobile = '15579293801'
# mobileInputElement = driver.find_element("xpath", '//input[@name="username"]')
# mobileInputElement.send_keys(mobile)
# time.sleep(1)
# # click to request the SMS verification code
# getVerifyCodeElement = driver.find_element("xpath", '//button[text()="获取短信验证码"]')
# getVerifyCodeElement.click()
# time.sleep(20)
def getDifferentUserArticleUrlList():
return 爬取的用户文章地址列表
'''
1. Loop over every user's article page
2. Scroll the screen and click every 阅读全文 (read more) button
3. Crawl all article content on the current page
4. Save each page's article data to Excel
5. Jump to the next page and keep crawling
'''
def crawlingUserArticles(urlList):
time.sleep(5)
# 1. loop over every user's article page
# process each user's article-page URL
for url in urlList:
try:
crawlingUserArticle(url)
except Exception as e:
print('爬取文章失败!url:' + url)
print('爬取文章异常%s' % e)
logging.exception(e)
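# crawlingUserArticle: crawl every article page of a single user profile, page by page.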
def crawlingUserArticle(url):
# go to the user's articles tab
print('=' * 30)
print('进入用户文章地址:' + url)
driver.get(url)
# 2. scroll the screen and click every 阅读全文 (read more) button
# scroll screen by screen and try to click the read-more buttons
head = ['标题', '文章链接', '赞同数', '头图链接', '正文内容', '正文html']
data = []
urlQueryDict = urlParse.parse_qs(urlParse.urlsplit(url).query)
pageNo = 1 if urlQueryDict.get('page') == None else int(urlQueryDict.get('page')[0])
totalPage = pageNo
usernameTag = find_element(driver, "//h1[@class='ProfileHeader-title']/span[@class='ProfileHeader-name']")
username = usernameTag.text if usernameTag != None else '未知'
account = url.split('?')[0].split('/')[-2]
isCalTotalPage = False
try:
while (pageNo <= totalPage):
# wait for the articles to finish loading before crawling
try:
articleTitleTag = WebDriverWait(driver, 20).until(
EC.visibility_of_all_elements_located(
(By.XPATH, '/descendant-or-self::H2[contains(@class,"ContentItem-title")]//A[1]'))
)
time.sleep(5)
except Exception:
articleTag = find_element(driver,
'//DIV[@id="Profile-posts"]/DIV[2]/div[not(contains(@class,"Pagination"))]')
if not (articleTag != None and articleTag.text == '还没有文章'):
print('未正常加载出来文章')
isLogin = find_element(driver, "//div[@class='signFlowModal-container']")
if isLogin != None:
print("有登录弹窗")
raise
print('没有更多文章')
break
# get the total number of pages from the pagination bar
paginationTagList = find_elements(driver, "//div[@class='Pagination']/button")
if paginationTagList != None and len(paginationTagList) > 0 and isCalTotalPage == False:
totalPage = int(paginationTagList[-1].text) if paginationTagList[-1].text != '下一页' else int(
paginationTagList[-2].text)
isCalTotalPage = True
# read the articles on the current page and work out which ones still need crawling
totalArticleTagList = find_elements(driver,
'//DIV[@id="Profile-posts"]/DIV[2]/div[not(contains(@class,"Pagination"))]')
articleTagList = calNeedCrawlingArticleList(totalArticleTagList, username)
if len(articleTagList) == 0:
print('当前页%s内容均已爬取,跳过!' % str(pageNo))
# jump ahead to the next page
pageNo = goToNextPage(pageNo, totalPage, url)
continue
# collect the read-more buttons
readMoreButtonList = find_elements(driver,
"//main[@class='App-main']//div[@id='Profile-posts']/div[2]/div[not(contains(@class,'Pagination'))]//div[@class='RichContent-inner']/button")
print('阅读全文按钮个数:' + str(len(readMoreButtonList)))
for i in range(len(readMoreButtonList)):
readMoreButton = readMoreButtonList[i]
driver.execute_script("arguments[0].scrollIntoView();", readMoreButton)
try:
readMoreTag = WebDriverWait(driver, 5).until(
EC.element_to_be_clickable((By.XPATH, "//div[@class='RichContent-inner']/button"))
)
readMoreButton.click()
time.sleep(1)
except Exception:
print('没有找到阅读全文标签')
raise
print('获取点击阅读全文' + str(i))
driver.implicitly_wait(0.5)
# 3. crawl all article content on the current page
haveNoArticle = False
print('文章总页数:' + str(totalPage))
print('当前页面文章数量:%s,爬取的文章数量:%s' % (str(len(totalArticleTagList)), str(len(articleTagList))))
for articleTag in articleTagList:
if articleTag.text == '还没有文章':
haveNoArticle = True
break
articleTitleTag = find_element(articleTag,
'./descendant-or-self::H2[contains(@class,"ContentItem-title")]//A[1]')
agreeCountTag = find_element(articleTag,
'./descendant-or-self::BUTTON[contains(@class,"Button VoteButton VoteButton--up FEfUrdfMIKpQDJDqkjte")]')
articleContentTag = find_element(articleTag, ".//div[@class='RichContent']")
articleHeadImgTag = find_element(articleTag, './descendant-or-self::IMG[contains(@class,"css-1phd9a0")]')
title = articleTitleTag.text if articleTitleTag != None else ''
articleUrl = articleTitleTag.get_property('href') if articleTitleTag != None else ''
agreeCount = agreeCountTag.text.replace('赞同', '').replace(' ',
'') if agreeCountTag != None and agreeCountTag != '' else ''
articleContent = articleContentTag.text if articleContentTag != None else ''
articleHeadImgUrl = articleHeadImgTag.get_property('src') if articleHeadImgTag != None else ''
articleHtml = articleTag.get_attribute("outerHTML")
articleTotalHtml = getHtml(articleUrl) if articleUrl != '' else ''
saveHtml(username, account, title, articleUrl.split('/')[-1], articleTotalHtml)
data.append([title, articleUrl, agreeCount, articleHeadImgUrl, articleContent, articleHtml])
if haveNoArticle:
break
# 4. save the user's article data to Excel
saveExcel2(username, account, head, data)
# 5. jump to the next page and keep crawling
pageNo = goToNextPage(pageNo, totalPage, url)
except Exception as e:
print('爬取文章发生异常%s' % e)
logging.exception(e)
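# goToNextPage: advance to the next page either by rebuilding the URL with a page parameter
# or by clicking the pagination next-page button, depending on 是否模拟点击下一页.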
def goToNextPage(pageNo, totalPage, url):
# go to the next page; stop once the last page has been passed
pageNo += 1
if pageNo > totalPage:
return totalPage + 1
nextPageUrl = url + '?page=' + str(pageNo) if '?page=' not in url else url.split('?')[0] + '?page=' + str(pageNo)
print('跳转下一页:' + nextPageUrl)
print('=' * 15)
if 是否模拟点击下一页 == '否':
driver.get(nextPageUrl)
else:
# scroll to the bottom of the page and click the next-page button (no next-page button means crawling is finished)
nextPageTag = find_element(driver,
"//BUTTON[contains(@class,'Button PaginationButton PaginationButton-next')]")
if nextPageTag == None:
print('当前用户文章已爬取完成!')
return totalPage + 1
driver.execute_script("arguments[0].scrollIntoView();", nextPageTag)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
nextPageTag2 = None
try:
nextPageTag2 = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(By.XPATH, "//BUTTON[contains(@class,'Button PaginationButton PaginationButton-next')]"))
)
time.sleep(1)
nextPageTag2.click()
except Exception:
print('没有找到下一页标签,尝试script方式点击')
# fall back to a JavaScript click; if the wait timed out, nextPageTag2 was never assigned, so use nextPageTag
driver.execute_script("arguments[0].click();", nextPageTag2 if nextPageTag2 != None else nextPageTag)
return pageNo
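# Safe lookup helpers: find_element returns None and find_elements returns an empty list
# when the lookup fails, instead of letting the exception propagate.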
def find_element(elementDriver, xpath):
element = None
try:
element = elementDriver.find_element("xpath", xpath)
except Exception:
print('未找到元素!xpath' + xpath)
return element
def find_elements(elementDriver, xpath):
# return an empty list instead of None on failure, so callers can safely call len()
elements = []
try:
elements = elementDriver.find_elements("xpath", xpath)
except Exception:
print('未找到元素!xpath' + xpath)
return elements
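# calNeedCrawlingArticleList: compare the article links on the current page with the URLs already stored
# in the user's xlsx file and keep only the articles that have not been crawled yet.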
def calNeedCrawlingArticleList(articleTagList, username):
filePath = 保存文件的路径 + username + '-知乎文章.xlsx'
if not os.path.exists(filePath):
# no workbook saved yet for this user, so every article on the page still needs crawling
return articleTagList
wb = load_workbook(filePath)
table = wb.active
articleUrlList = getExcelColText(table, 1)[1:]
articleTagOutputList = []
for articleTag in articleTagList:
articleTitleTag = find_element(articleTag,
'./descendant-or-self::H2[contains(@class,"ContentItem-title")]//A[1]')
if articleTitleTag == None:
print('计算需要爬取的文章项异常!username:%s' % username)
return articleTagList
articleUrl = articleTitleTag.get_property('href') if articleTitleTag != None else ''
if articleUrl != '' and articleUrl not in articleUrlList:
articleTagOutputList.append(articleTag)
return articleTagOutputList
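# get_distance: slider-captcha helper that uses ddddocr's slide_match to locate the puzzle piece on the
# background image and estimate the drag distance; it is only used by test(), which is not called from the main flow.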
def get_distance(bg_url, front_url):
slide = ddddocr.DdddOcr(det=False, ocr=False)
background_bytes = requests.get(bg_url).content
target_bytes = requests.get(front_url).content
res = slide.slide_match(target_bytes, background_bytes, simple_target=True)
print(res)
slider_distance = res.get("target")[0]
# add 10 to the recognized distance; this seems to be related to the image size and may need manual tuning
slider_distance = slider_distance + 10
return slider_distance
def test():
bg_url = "https://necaptcha.nosdn.127.net/e0c39e43553447eb9fdb8a47f4e3084f@2x.jpg"
front_url = "https://necaptcha.nosdn.127.net/27453be6d47f4468884257df88aab056@2x.png"
slider_distance = get_distance(bg_url, front_url)
print(slider_distance)
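# cal / cal2: scale the recognized slider x-offset to the rendered captcha width; the constants 22 and 81
# look like hand-tuned offsets for two different layouts.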
def cal(targetX, width):
return targetX/(320 / width) - 22
def cal2(targetX, width):
return targetX/(width / 320) - 81
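# saveHtml: write the full article HTML under 保存文件的路径/html/<username>-<account>/, skipping files that
# already exist under either naming scheme.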
def saveHtml(username, account, title, urlId, html):
saveHtmlPath = 保存文件的路径 + 'html/' + username + '-' + account + '/'
fileAbsolutePath = saveHtmlPath + username + '-' + urlId + '-知乎文章.html'
fileSecondAbsolutePath = saveHtmlPath + username + '-' + title + '-' + urlId + '-知乎文章.html'
if not os.path.exists(saveHtmlPath):
os.makedirs(saveHtmlPath)
if html == '':
print('爬取的文章html为空,username:%s,title:%s' % (username, title) )
if os.path.exists(fileAbsolutePath) or os.path.exists(fileSecondAbsolutePath):
return
fileHandle = open(fileAbsolutePath, mode='w', encoding='utf-8')
fileHandle.write(html)
fileHandle.close()
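# saveExcel: xlwt-based .xls writer; it is not called anywhere in this script (saveExcel2 is used instead).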
def saveExcel(username, head, data):
workbook = xlwt.Workbook(encoding='utf-8')
table = workbook.add_sheet('知乎文章')
for i in range(len(head)):
table.write(0, i, head[i])
for i in range(len(data)):
for j in range(len(data[i])):
table.write(i + 1, j, data[i][j])
workbook.save(保存文件的路径 + username + '-知乎文章.xls')
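# saveExcel2: create a new xlsx for the user on the first run, otherwise merge the new rows into the existing workbook.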
def saveExcel2(username, account, head, data):
if len(data) == 0:
return
fileAbsolutePath = 保存文件的路径 + username + '-知乎文章.xlsx'
if os.path.exists(fileAbsolutePath):
updateExistExcel(fileAbsolutePath, username, head, data)
else:
saveNewExcel(fileAbsolutePath, username, head, data)
def saveNewExcel(filePath, username, head, data):
wb = Workbook()
table = wb.active
for i in range(1, len(head) + 1):
if head[i-1] != '':
table.cell(row = 1, column = i, value = head[i-1])
for i in range(2, 1 + len(data)):
for j in range(1, 1 + len(data[i-2])):
if data[i-2][j-1] != '':
table.cell(row = i, column = j, value = data[i-2][j-1])
wb.save(filePath)
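# updateExistExcel: rows whose article URL already exists and already has HTML in column 6 are skipped;
# rows whose URL exists but lacks HTML are overwritten in place; new URLs are appended at the bottom.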
def updateExistExcel(filePath, username, head, data):
wb = load_workbook(filePath)
table = wb.active
head = getExcelRowText(table, 0)
maxRow = table.max_row
maxColumn = table.max_column
writeNewStartRow = maxRow + 1
articleUrlList = getExcelColText(table, 1)[1:]
for i in range(len(data)):
articleUrl = data[i][1]
if (articleUrl in articleUrlList
and table.cell(articleUrlList.index(articleUrl) + 2, 6).value != None
and table.cell(articleUrlList.index(articleUrl) + 2, 6).value != ''):
continue
for j in range(len(data[i])):
if articleUrl in articleUrlList and not (
table.cell(articleUrlList.index(articleUrl) + 2, 6).value != None
and table.cell(articleUrlList.index(articleUrl) + 2, 6).value != ''
):
oldDataRow = articleUrlList.index(articleUrl) + 2
table.cell(oldDataRow, j+1, data[i][j])
else:
table.cell(writeNewStartRow, j + 1, data[i][j])
articleUrlList.append(articleUrl)
writeNewStartRow += 1
wb.save(filePath)
'''
Get the values of an Excel column; the index is 0-based
'''
def getExcelColText(table, colNum):
return [col.value for col in [col for col in table.columns][colNum]]
'''
Get the values of an Excel row; the index is 0-based
'''
def getExcelRowText(table, rowNum):
return [row.value for row in [row for row in table.rows][rowNum]]
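# getHtml: fetch the raw article HTML with requests (this goes through the http_proxy set above and carries
# no login cookies, so the returned content can differ from what the logged-in browser sees).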
def getHtml(url):
response = requests.get(url)
html = response.text
return html
if __name__ == '__main__':
print('start')
work()
driver.quit()
print('ok')
https://gitee.com/flying2whale/zhihu-reptile.git
git@gitee.com:flying2whale/zhihu-reptile.git