1 Star 0 Fork 0

xyislove/python-test

Create your Gitee Account
Explore and code with more than 12 million developers. Free private repositories! :)
Sign up
This repository doesn't specify a license. Please pay attention to the specific project description and its upstream code dependencies when using it.
Clone or Download
dytt.py 3.10 KB
Copy Edit Raw Blame History
xyislove authored 2021-11-14 17:47 . 梨视频第一版完成!
# 爬取电影天堂磁力链接 版本1 只爬取top180
# author: xyislove
# 欢迎各位大神修改指导
import requests
import re
from fake_useragent import UserAgent
import csv
import codecs #修正写入csv乱码问题
import os
import time
from requests.api import get
# 获取网页源代码
def get_source(url, timeout=10):
    """Download a page and return its body as text decoded from GB2312.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body as ``str``, decoded with the ``gb2312`` charset.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # A fresh random User-Agent header is generated for every request.
    headers = {'User-Agent': UserAgent().random}
    # NOTE(review): verify=False turns off TLS certificate verification; it is
    # kept because the original code relied on it -- confirm it is still needed.
    with requests.get(url, headers=headers, verify=False, timeout=timeout) as resp:
        # The charset is set explicitly before .text is read, so the page is
        # decoded as GB2312 rather than with whatever requests would guess.
        resp.encoding = 'gb2312'
        # Leaving the ``with`` block closes the response even if decoding fails.
        return resp.text
def analyes_fa_page(fa_page):
    """Collect the detail-page URLs listed in the index page's "latest 180" block.

    The function looks for the text ``最新发布180部影视`` followed by a ``<ul>``
    element and reads every single-quoted ``<a href='...'>`` inside that list.

    Args:
        fa_page: HTML source of the index ("father") page.

    Returns:
        A list of absolute URLs, in page order. The list is empty when the
        section is not present in ``fa_page``.
    """
    # Local import: this is the only function that needs urljoin.
    from urllib.parse import urljoin

    site_root = 'https://www.ygdy8.com'
    section_re = re.compile(r"最新发布180部影视.*?<ul>(?P<ul>.*?)</ul>", re.S)
    anchor_re = re.compile(r"<a href='(?P<son_link>.*?)'>")

    list_link = []
    for section in section_re.finditer(fa_page):
        for anchor in anchor_re.finditer(section.group('ul')):
            # urljoin gives the same result as plain concatenation for
            # "/path" hrefs, and also copes with absolute URLs and with
            # relative paths that lack the leading slash.
            list_link.append(urljoin(site_root, anchor.group('son_link')))

    # Progress output: number of links found.
    print(len(list_link))
    return list_link
# 分析子页
def analyes_son_page(list_link, start=134):
    """Fetch each detail page and extract its title, poster URL and magnet link.

    Args:
        list_link: Detail-page URLs, as produced by ``analyes_fa_page``.
        start: Index of the first link to process. The default keeps the
            offset that was previously hard-coded; pass ``0`` to process
            every link.

    Returns:
        A list of dicts with the keys ``'name'``, ``'pic'`` and ``'magnet'``
        (in that order), one per page on which all three values were found.
    """
    name_re = re.compile(r'charset=gb2312">.*?《(?P<name>.*?)》', re.S)
    poster_re = re.compile(r'https://img9.doubanio.com/view/photo/l_ratio_poster/public/.*?jpg', re.S)
    magnet_re = re.compile(r'magnet:.*?fannounce', re.S)

    info_list = []
    for count, link in enumerate(list_link[start:], start=1):
        son_page = get_source(link)
        name_match = name_re.search(son_page)
        poster_match = poster_re.search(son_page)
        magnet_match = magnet_re.search(son_page)

        if name_match is None or poster_match is None or magnet_match is None:
            # A page lacking any field used to raise AttributeError on None
            # and discard everything collected so far; skip it instead.
            print('skipped (incomplete page):', link, count)
        else:
            name = name_match.group('name')
            print(name, count)
            info_list.append({
                # '/' is replaced because the name is later used as a file
                # name when the poster is saved.
                'name': name.replace('/', 'or'),
                'pic': poster_match.group(),
                'magnet': magnet_match.group(),
            })

        # Pause between requests so the site is not hit in a tight loop.
        time.sleep(1)
    return info_list
# 信息写入csv
def write_data(info_list, path='data.csv'):
    """Append the scraped records to a GB2312-encoded CSV file.

    Args:
        info_list: Iterable of dicts; each dict's values are written as one
            row, in the dict's own key order.
        path: Destination file. Defaults to ``data.csv`` in the current
            working directory, as before.

    Raises:
        UnicodeEncodeError: If a value contains a character that cannot be
            encoded as GB2312.
    """
    # The ``with`` block guarantees the file is flushed and closed; the
    # previous version never closed its handle. newline='' is what the csv
    # module asks for so that it controls the line endings itself.
    with open(path, mode='a+', encoding='gb2312', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(record.values() for record in info_list)
#下载图片
def download_pic(info_list):
    """Download the poster image of every record into ``./pic``.

    Args:
        info_list: Records with at least the keys ``'name'`` and ``'pic'``
            (the poster URL), as built by ``analyes_son_page``.

    Raises:
        requests.exceptions.RequestException: If a download fails or the
            server answers with an error status.
    """
    if not info_list:
        return

    # The target directory has to exist before files can be opened inside it.
    os.makedirs('./pic', exist_ok=True)

    # The loop reads the ``info_list`` argument and its ``'pic'`` key; the
    # earlier version referenced an undefined name and a missing ``'link'`` key.
    for record in info_list:
        pic_url = record['pic']
        print(pic_url)
        headers = {'User-Agent': UserAgent().random}
        # The image is fetched directly because raw bytes are needed here,
        # whereas get_source returns decoded text.
        with requests.get(pic_url, headers=headers, timeout=10) as resp:
            # Do not save an HTTP error page under a .jpg name.
            resp.raise_for_status()
            pic = resp.content
        name = record['name']
        # 'wb' replaces any earlier file; appending would glue a second copy
        # of the image onto the first when the script is re-run.
        with open(f'./pic/{name}.jpg', 'wb') as f:
            f.write(pic)
if __name__=='__main__':
    # Index page that carries the list of latest releases.
    father_link='https://www.ygdy8.com/index1.htm'
    # Fetch the index page source.
    fa_page=get_source(father_link)
    # Extract the detail-page links from the index page.
    list_link=analyes_fa_page(fa_page)
    # Fetch every detail page and extract its record.
    info_list=analyes_son_page(list_link)
    # Append the records to the CSV file.
    write_data(info_list)
# requests.exceptions.ConnectionError
# https://www.ygdy8.com/html/gndy/dyzz/20211111/62026.html
# Extract the sub-page links
# Strings that start with "magnet" are magnet links
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/xyislove/python-test.git
git@gitee.com:xyislove/python-test.git
xyislove
python-test
python-test
master

Search