# [web-page residue, not part of the script] 代码拉取完成,页面将自动刷新
import time
import json
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service # 新版本将executable_path被重构,放在Service函数里
from selenium.webdriver.common.by import By # 隐式等待,定位元素的By类
from selenium.webdriver.support import expected_conditions as EC # 预期条件,监控相关
from selenium.webdriver.support.wait import WebDriverWait # 等待;页面可能是动态的,Ajax加载的不完全
from selenium.webdriver.chrome.options import Options # 浏览器参数设置:如导入Options模块,用于加载headers
# Target page to scrape. The address must be a strictly formatted URL with an
# explicit scheme (the original note mentions https://; this one uses http://).
url = 'http://42.194.197.95:8001/webdriver_ajax'
# Request-header values; add_header() turns each pair into a "key=value"
# Chrome argument.
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
# Cookie header left disabled; the session cookie is set through
# driver.add_cookie() in the main script instead.
# 'Cookie': 'session=.eJyrViotTi1SsqpWyiyOT0zJzcxTsjLQUcrJTwexSopKU3WUcvOTMnNSlayUDM3gQEkHrDE-M0XJyhjCzkvMBSmKKTU3NbKIKTUzMjZXqq0FAN1MHbY.ZPMkQg.MeUf3o_GgoHaGX1K_kSP_CEN0ng; auth=508OGQJI368827WWN1693662187',
'Pragma': 'no-cache',
'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36',
}
# Build the Chrome Options object that carries the request-header settings.
def add_header():
    """Return a new ``Options`` holding one ``key=value`` argument per header.

    The pairs are read from the module-level ``headers`` dict.

    NOTE(review): Chrome presumably only acts on switches it recognises
    (such as ``user-agent``); the other entries may have no effect -- confirm.
    """
    chrome_options = Options()
    header_arguments = [f'{name}={value}' for name, value in headers.items()]
    for argument in header_arguments:
        chrome_options.add_argument(argument)
    return chrome_options
# Turn off image and CSS loading through browser preferences.
def disable_img_css(__options):
    """Attach a ``prefs`` experimental option to ``__options`` (mutated in place).

    Both preference keys are set to 2 and nothing is returned.

    NOTE(review): 'permissions.default.stylesheet' looks like a Firefox-style
    preference name; confirm that Chrome actually honours it.
    """
    preference_keys = (
        "profile.managed_default_content_settings.images",
        'permissions.default.stylesheet',
    )
    blocked_content = dict.fromkeys(preference_keys, 2)
    __options.add_experimental_option("prefs", blocked_content)
# Explicit (conditional) wait for an element, located by id, to be present.
def wait_element(_driver, element_id, wait_time=10):
    """Block until the element with id ``element_id`` is present in the DOM.

    Args:
        _driver: the WebDriver instance to poll.
        element_id: value of the ``id`` attribute to look for.
        wait_time: maximum number of seconds to wait (default 10).

    Returns:
        The value returned by ``WebDriverWait.until`` for the presence
        condition, i.e. the located element.

    Raises:
        Whatever ``WebDriverWait.until`` raises (a timeout error when the
        element does not show up in time). The exception is propagated
        unchanged instead of being re-wrapped in a bare ``Exception``, so
        callers keep the concrete type and the original traceback.
    """
    # Locator tuple: strategy plus the id value.
    locator = (By.ID, element_id)
    # Poll once per second for at most ``wait_time`` seconds.
    waiter = WebDriverWait(_driver, wait_time, 1)
    return waiter.until(EC.presence_of_element_located(locator))
# Service object pointing at the local chromedriver binary.
s = Service("chromedriver.exe")
# Alternative: browser = webdriver.chrome.service.Service("chromedriver.exe")
# Options carrying the header arguments, with image/CSS loading disabled.
options = add_header()
disable_img_css(options)
# Start Chrome with the configured service and options.
driver = webdriver.Chrome(service=s, options=options)
# Everything after the browser has started runs inside the try block so the
# browser is always shut down, even if loading the stealth script fails.
try:
    # Load the stealth script; the encoding is explicit so the read does not
    # depend on the platform's default code page.
    with open("stealth.min.js", encoding="utf-8") as f:
        js = f.read()
    # Register the script to be evaluated on every new document.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": js
    })
    driver.get(url)  # first visit, made before the session cookie is added
    driver.add_cookie({"name": "session",
                       "value": ".eJyrViotTi1SsqpWyiyOT0zJzcxTsjLQUcrJTwexSopKU3WUcvOTMnNSlayUDM3gQEkHrDE-M0XJyhjCzkvMBSmKKTU3NbKIKTUzMjZXqq0FAN1MHbY.ZPMkQg.MeUf3o_GgoHaGX1K_kSP_CEN0ng"
                       })
    # Once added, the browser sends the cookie with its following requests.
    driver.get(url)  # second visit, now carrying the session cookie
    # Full page HTML, searched for the embedded "movies" array.
    html = driver.page_source
    pattern = r'"movies":\[.*?\]'
    match = re.search(pattern, html)
    if match:
        json_str = match.group()
        print(json_str)
finally:
    # Keep the window visible briefly, then end the whole WebDriver session.
    # quit() is used instead of close(): close() only closes the current
    # window and leaves the driver session running.
    time.sleep(5)
    driver.quit()
# Disabled alternate script, kept as an unassigned string literal so it never
# runs. Per its own text it starts headless Chrome with the stealth script
# injected, opens https://bot.sannysoft.com/, saves a screenshot to
# workaround.png and writes the page source to result.html.
"""国外网址,使用js避免特征被捕获
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
url = "http://42.194.197.95:8001/brower"
chrome_options = Options()
# 无头浏览器配置
chrome_options.add_argument("--headless")
# 添加请求头
chrome_options.add_argument("user-agent=Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Mobile Safari/537.36")
# 配置浏览器启动工具
s = Service("chromedriver.exe")
driver = Chrome(service=s, options=chrome_options)
# js脚本加载
with open("stealth.min.js") as f:
js = f.read()
# 在打印具体的网页前,执行隐藏浏览器特征的JavaScript
# 执行js脚本
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": js
})
# 访问url
#driver.get(url)
driver.get('https://bot.sannysoft.com/')
time.sleep(5)
driver.save_screenshot('workaround.png')
# 保存网址完整html,方便查阅
source = driver.page_source
with open('result.html', 'w') as f:
f.write(source)
"""
# [web-page residue, not part of the script] 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# [web-page residue, not part of the script] 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。