代码拉取完成,页面将自动刷新
import requests
from playwright.sync_api import sync_playwright
from loguru import logger
import pymysql
class Spider:
    """Scrape movie data from the spa6.scrape.center API into MySQL.

    A headless Chromium page is kept open so that the site's own JavaScript
    can be called to produce the request token and the encrypted movie id.
    The index and detail endpoints are then fetched with ``requests`` and
    each movie is upserted into the ``movies`` table.

    The instance is single-use: ``main()`` releases the browser and the
    database connection when it finishes, whether it succeeded or not.
    """

    # Seconds to wait for an HTTP response before giving up, so that a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Start the browser, load the site and open the DB connection."""
        self.BASEURL = 'https://spa6.scrape.center'
        self.MAX_PAGE = 10
        self.LIMIT = 10
        self.INDEX_URL = self.BASEURL + \
            '/api/movie?limit={limit}&offset={offset}&token={token}'
        self.DETAIL_URL = self.BASEURL + '/api/movie/{id}/?token={token}'

        # Keep the Playwright handle so close() can stop the driver too.
        self._playwright = sync_playwright().start()
        try:
            self.browser = self._playwright.chromium.launch()
            self.page = self.browser.new_page()
            # Serve two of the site's JS chunks from local files instead of
            # the network. Presumably the local copies are patched to expose
            # window.encrypt / window.encrypt_id -- verify against the files.
            self.page.route(
                self.BASEURL + '/js/chunk-19c920f8.c3a1129d.js',
                lambda route: route.fulfill(path='./项目6/chunk.js'))
            self.page.route(
                self.BASEURL + '/js/chunk-4dec7ef0.e4c2b130.js',
                lambda route: route.fulfill(path='./项目6/chunk-id.js'))
            self.page.goto(self.BASEURL)

            # NOTE(review): credentials are hard-coded in source; they should
            # be moved to environment variables or a config file and rotated.
            self.db = pymysql.connect(
                host='cq13292957303.mysql.rds.aliyuncs.com', user='qianqian',
                password='Chenqian1234', database='test1')
            self.cursor = self.db.cursor()
        except Exception:
            # Do not leave a browser/driver process behind on a failed setup.
            self.close()
            raise

    def close(self):
        """Release the cursor, DB connection, browser and Playwright driver.

        Cleanup is best-effort: a failure to release one resource is logged
        and does not prevent the remaining ones from being released.
        """
        # Reverse order of acquisition.
        for name, closer in (
            ('cursor', 'close'),
            ('db', 'close'),
            ('browser', 'close'),
            ('_playwright', 'stop'),
        ):
            resource = getattr(self, name, None)
            if resource is None:
                continue
            try:
                getattr(resource, closer)()
            except Exception as exc:
                logger.warning('failed to release {}: {}', name, exc)

    def get_token(self, params):
        """Return ``window.encrypt(params)`` evaluated in the loaded page.

        :param params: API path the token is generated for.
        """
        # Pass the value as an evaluate() argument rather than splicing it
        # into the script text, so quotes/backslashes cannot break the JS.
        return self.page.evaluate('(p) => window.encrypt(p)', str(params))

    def download(self, url):
        """GET ``url`` and return the ``requests`` response.

        :raises requests.HTTPError: if the server answers with a 4xx/5xx.
        :raises requests.Timeout: if no response arrives in REQUEST_TIMEOUT.
        """
        res = requests.get(url=url, timeout=self.REQUEST_TIMEOUT)
        logger.info(res.url)
        res.raise_for_status()
        return res

    def parse_getId(self, json):
        """Yield the ``id`` of every entry in ``json['results']``.

        :param json: decoded index-page payload.
        """
        for movie in json['results']:
            yield movie['id']

    def get_id(self, params):
        """Return ``window.encrypt_id(params)`` evaluated in the loaded page.

        :param params: plain movie id; it is handed to the page as a string.
        """
        return self.page.evaluate('(p) => window.encrypt_id(p)', str(params))

    def parse_getData(self, json):
        """Extract ``[name, score, drama]`` from a decoded detail payload.

        :param json: decoded detail-page payload.
        :returns: list of title, rating and synopsis, in that order.
        """
        data = [json['name'], json['score'], json['drama']]
        logger.info(data)
        return data

    def save(self, data):
        """Insert one movie row, updating it if the key already exists.

        :param data: sequence of (title, rating_num, brif).
        """
        sql = ('insert into movies(title,rating_num,brif) values(%s,%s,%s) '
               'on duplicate key update title=%s,rating_num=%s,brif=%s')
        # The same three values feed both the INSERT and the UPDATE part.
        self.cursor.execute(sql, list(data) * 2)
        self.db.commit()

    def main(self):
        """Crawl MAX_PAGE index pages and store every movie found."""
        try:
            for page_index in range(self.MAX_PAGE):
                index_url = self.INDEX_URL.format(
                    limit=self.LIMIT,
                    # Offset follows the page size instead of a literal 10.
                    offset=page_index * self.LIMIT,
                    token=self.get_token('/api/movie'))
                listing = self.download(url=index_url)
                for movie_id in self.parse_getId(listing.json()):
                    # Encrypt once; the URL and the token must use the same
                    # encrypted id.
                    encrypted_id = self.get_id(movie_id)
                    detail_url = self.DETAIL_URL.format(
                        id=encrypted_id,
                        token=self.get_token('/api/movie/' + encrypted_id))
                    detail = self.download(url=detail_url)
                    self.save(self.parse_getData(detail.json()))
        finally:
            # Always release the browser and DB, even if a request failed.
            self.close()
        logger.info('over!')
# Script entry point: build the spider and run the full crawl.
if __name__ == "__main__":
    Spider().main()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。