2 Star 5 Fork 14

ayuliao/AntiCrawlers

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
案例6-网页动态渲染反爬.py 2.91 KB
一键复制 编辑 原始数据 按行查看 历史
二两的分身 提交于 2021-06-28 15:02 . enjoy code
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
# Target page: a demo site whose content is rendered dynamically in the browser.
url = "http://47.103.13.124:8001/many_sign"
# Request headers the script *intends* to send. NOTE(review): only the
# User-Agent can actually influence Chrome via command-line switches; the
# other entries are kept for documentation of the intended request shape.
headers = {
'Connection': 'keep-alive',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
'Upgrade-Insecure-Requests': '1',
# Baidu spider UA — presumably used to bypass a UA-based anti-crawl check
# on the target site; TODO confirm against the server's behavior.
'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
# 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_16_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
def add_header():
    """Build a Chrome ``Options`` object carrying the desired User-Agent.

    Returns:
        Options: Chrome options with the ``--user-agent`` switch set.

    NOTE(review): the original loop passed every header as a bare
    ``Key=Value`` argument, but Chrome command-line switches cannot set
    arbitrary HTTP headers — those strings are silently ignored.  Only
    the User-Agent is honoured, and only via the ``--user-agent=``
    switch, so the other headers are skipped here.  Injecting the rest
    would require a proxy or Chrome DevTools Protocol.
    """
    options = Options()
    for key, value in headers.items():
        if key.lower() == 'user-agent':
            # The one header Chrome accepts as a command-line switch.
            options.add_argument(f'--user-agent={value}')
    return options
def disable_img_css(options):
    """Disable image and stylesheet loading to speed up page fetches.

    Mutates *options* in place by registering Chrome content-setting
    preferences (value 2 means "block"); returns None.
    """
    options.add_experimental_option(
        "prefs",
        {
            "profile.managed_default_content_settings.images": 2,  # block images
            'permissions.default.stylesheet': 2,  # block CSS
        },
    )
# Wait for an element to finish loading.
def wait_element(brower, element_id, wait_time=10):
    """Block until the element with id *element_id* is present in the DOM.

    Args:
        brower: the WebDriver instance to poll (name kept as-is for
            compatibility with existing callers).
        element_id: value of the ``id`` attribute to wait for.
        wait_time: maximum seconds to wait; presence is re-checked
            every 1 second (the third WebDriverWait argument).

    Raises:
        Exception: if the element has not appeared within *wait_time*
            seconds.  The selenium timeout is chained with ``from`` so
            the original traceback is preserved instead of being
            swallowed by the re-wrap.
    """
    try:
        WebDriverWait(brower, wait_time, 1).until(
            EC.presence_of_element_located((By.ID, element_id))
        )
    except Exception as e:
        # Element did not load within wait_time; keep the original
        # exception as the explicit cause.
        raise Exception(e) from e
# ---- main script: load the page with a session cookie, then scrape the
# ---- dynamically rendered movie cards.
options = add_header()
disable_img_css(options)
# Selenium 4 removed the `executable_path`/`chrome_options` kwargs; pass the
# Options via `options=` and let chromedriver be resolved from PATH (which is
# what executable_path='chromedriver' did anyway).
brower = webdriver.Chrome(options=options)
try:
    brower.get(url)
    # Cookies can only be set for the currently loaded domain, so the first
    # get() establishes the domain before the session cookie is attached.
    brower.add_cookie({"name": "session", "value": ".eJyrViotTi1SsqpWyiyOT0zJzcxTsjLQUcrJTwexSopKU3WUcvOTMnNSlayUDM3gQEkHrDE-M0XJyhjCzkvMBSmKKTVNMjMDkiamFkq1tQDfeR3n.YLOC4w.Xbnx1QbrvUh8OUPb5jauC_Aau9U"})
    brower.get(url)  # Visit again, now sending the loaded cookie.
    apps = ['app1', 'app2', 'app3', 'app4', 'app5']
    result = []
    for app in apps:
        # Each card is rendered client-side; wait for it before querying.
        wait_element(brower, app)
        # Selenium 4 removed find_element_by_* helpers; use find_element(By.*).
        movie = brower.find_element(By.ID, app)
        # img_url = movie.find_element(By.TAG_NAME, 'img').get_attribute('src')
        title = movie.find_element(By.CLASS_NAME, 'mb-1').text
        desc = movie.find_element(By.TAG_NAME, 'p').text
        smalls = movie.find_elements(By.TAG_NAME, 'small')
        score = smalls[0].text
        desc2 = smalls[1].text
        result.append({
            # 'img_url': img_url,
            'title': title,
            'desc': desc,
            'score': score,
            'desc2': desc2
        })
    print(result)
finally:
    time.sleep(5)
    # quit(), not close(): close() only closes the window and would leave an
    # orphaned chromedriver process; quit() shuts the driver down too.
    brower.quit()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/ayuLiao/anti-crawlers.git
git@gitee.com:ayuLiao/anti-crawlers.git
ayuLiao
anti-crawlers
AntiCrawlers
master

搜索帮助