# Repository fetched successfully.
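# Scrape magnet links from Dytt (电影天堂, ygdy8.com) -- version 1: only the top-180 list is crawled
# author: xyislove
# Corrections and suggestions are welcome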
import requests
import re
from fake_useragent import UserAgent
import csv
import codecs #修正写入csv乱码问题
import os
import time
from requests.api import get
# 获取网页源代码
def get_source(url, timeout=10):
    """Download *url* and return its body decoded as gb2312 text.

    A random User-Agent string from ``fake_useragent`` is sent with the
    request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would
            block the crawl forever.

    Returns:
        The response body as ``str``.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    headers = {'User-Agent': UserAgent().random}
    # NOTE(review): verify=False disables TLS certificate checking. It is
    # kept because the original relied on it, but it leaves the crawl open
    # to tampering in transit -- confirm it is still needed.
    # The ``with`` block releases the connection even if decoding fails.
    with requests.get(url, headers=headers, verify=False,
                      timeout=timeout) as resp:
        resp.encoding = 'gb2312'  # force the charset used to decode the body
        return resp.text
def analyes_fa_page(fa_page):
    """Collect detail-page URLs from the index page's "latest 180" list.

    The function looks for the marker text ``最新发布180部影视``, takes the
    first ``<ul>...</ul>`` that follows it, and resolves every
    single-quoted ``<a href='...'>`` inside that list against the site
    root.

    Args:
        fa_page: HTML source of the index (parent) page.

    Returns:
        A list of absolute URLs in document order; empty when the marker or
        the list is not present.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    base_url = 'https://www.ygdy8.com'
    section_re = re.compile(r"最新发布180部影视.*?<ul>(?P<ul>.*?)</ul>", re.S)
    href_re = re.compile(r"<a href='(?P<son_link>.*?)'>")

    list_link = []
    for section in section_re.finditer(fa_page):
        # urljoin (rather than plain concatenation) also copes with hrefs
        # that lack a leading slash or are already absolute.
        list_link.extend(
            urljoin(base_url, anchor.group('son_link'))
            for anchor in href_re.finditer(section.group('ul'))
        )
    print(len(list_link))
    return list_link
# 分析子页
def analyes_son_page(list_link, start=134):
    """Fetch detail pages and extract title, poster URL and magnet link.

    Args:
        list_link: Detail-page URLs, as returned by ``analyes_fa_page``.
        start: Index of the first link to process; earlier links are
            skipped. The default of 134 is the offset that was hard-coded
            in the original slice -- pass ``start=0`` to crawl the whole
            list.

    Returns:
        A list of dicts with the keys ``'name'``, ``'pic'`` and
        ``'magnet'`` (in that order). Pages on which the title or the
        magnet link cannot be found are skipped; a missing poster is stored
        as an empty string.
    """
    name_re = re.compile(r'charset=gb2312">.*?《(?P<name>.*?)》', re.S)
    pic_re = re.compile(r'https://img9.doubanio.com/view/photo/l_ratio_poster/public/.*?jpg', re.S)
    magnet_re = re.compile(r'magnet:.*?fannounce', re.S)

    info_list = []
    for count, link in enumerate(list_link[start:], start=1):
        son_page = get_source(link)
        name_match = name_re.search(son_page)
        pic_match = pic_re.search(son_page)
        magnet_match = magnet_re.search(son_page)

        if name_match is None or magnet_match is None:
            # A page with an unexpected layout used to raise AttributeError
            # here and throw away everything collected so far.
            print(f'skipped {link}: title or magnet link not found')
        else:
            name = name_match.group('name')
            print(name, count)
            info_list.append({
                # '/' is replaced because the name is later used as a file name.
                'name': name.replace('/', 'or'),
                'pic': pic_match.group() if pic_match else '',
                'magnet': magnet_match.group(),
            })
        time.sleep(1)  # pause between requests
    return info_list
# 信息写入csv
def write_data(info_list):
    """Append one CSV row per record to ``data.csv`` in the working directory.

    Args:
        info_list: Iterable of dicts; each dict's values are written as one
            row, in the dict's own key order.

    Raises:
        UnicodeEncodeError: If a value contains a character that the
            gb2312 codec cannot encode.
    """
    # NOTE(review): gb2312 is kept from the original; a wider codec may be
    # needed if titles contain characters outside it -- confirm with the
    # consumer of the file.
    # ``with`` closes the file (the original leaked the handle), and
    # newline='' is what the csv module requires so that its own '\r\n'
    # row terminator is written unchanged.
    with open('data.csv', mode='a', encoding='gb2312', newline='') as f:
        csv.writer(f).writerows(record.values() for record in info_list)
#下载图片
def download_pic(info_list):
    """Download each record's poster image to ``./pic/<name>.jpg``.

    Args:
        info_list: Records with the keys ``'name'`` and ``'pic'`` (the
            poster URL), as built by ``analyes_son_page``.

    Records without a poster URL, and downloads that fail, are reported on
    stdout and skipped so that one bad image does not stop the rest.
    """
    if not info_list:
        return

    os.makedirs('./pic', exist_ok=True)  # open() would fail without the folder
    headers = {'User-Agent': UserAgent().random}
    for record in info_list:
        pic_url = record['pic']
        print(pic_url)
        if not pic_url:
            continue
        try:
            # Fetch raw bytes directly: get_source() returns decoded text,
            # which has no ``.content`` and would mangle binary image data.
            resp = requests.get(pic_url, headers=headers, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print(f'failed to download {pic_url}: {exc}')
            continue
        name = record['name']
        # 'wb' rather than 'ab+': appending to an existing file on a re-run
        # would produce a corrupt image.
        with open(f'./pic/{name}.jpg', 'wb') as f:
            f.write(resp.content)
if __name__ == '__main__':
    # Script entry point: index page -> detail links -> records -> data.csv.
    father_link = 'https://www.ygdy8.com/index1.htm'
    # Fetch the index (parent) page source.
    fa_page = get_source(father_link)
    # Extract the detail-page links from the index page.
    list_link = analyes_fa_page(fa_page)
    # Fetch the detail pages and extract their fields.
    info_list = analyes_son_page(list_link)
    # Append the records to the CSV file.
    write_data(info_list)
# requests.exceptions.ConnectionError
# https://www.ygdy8.com/html/gndy/dyzz/20211111/62026.html
# 提取子页面链接
# magnet开头为磁力链接
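# [Hosting-site notice] This location may contain content that is unsuitable for display, so the page does not show it. You can review and revise it yourself using the relevant editing features.
# If you confirm that the content contains no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, and nothing that violates applicable national laws and regulations, you may click submit to appeal, and we will handle it for you as soon as possible.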