1 Star 0 Fork 2

renshuhao/抖音

forked from 晴云孤魂/抖音 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
douyin.py 14.07 KB
一键复制 编辑 原始数据 按行查看 历史
二毛 提交于 2021-03-17 20:22 . 1
# -*- encoding: utf-8 -*-
'''
@File : douyin.py
@Time : 2021年03月12日 18:16:57 星期五
@Author : erma0
@Version : 1.0
@Link : https://erma0.cn
@Desc : 抖音用户作品采集
'''
import json
import os
import time
from urllib.parse import parse_qs, urlparse
import requests
from download import Download
class Douyin(object):
    """
    Collector for the videos of one Douyin target.

    The target is either a raw ID or a share URL. Depending on which
    ``crawling_*`` method is called it is treated as a user (posts / likes),
    a hashtag challenge, or a music track. Collected entries accumulate in
    ``self.videos`` and can then be queued for download with
    ``download_all``.
    """

    # Paging stops after this many consecutive failed requests, so a dead
    # network or a changed API cannot keep the crawl loop spinning forever.
    MAX_ERRORS = 5

    def __init__(self, param: str, limit: int = 0):
        """
        Set up the session and work out whether ``param`` is a URL or an ID.

        :param param: share URL or raw ID of the target.
        :param limit: maximum number of videos to collect; 0 means no limit.
        """
        self.limit = limit
        self.http = requests.Session()
        self.url = ''
        self.id = ''
        self.type = 'unknow'
        self.download_path = '暂未定义目录'
        # The attributes above are predefined so that later reads never hit
        # an undefined attribute.
        self.param = param.strip()
        self.sign = 'TG2uvBAbGAHzG19a.rniF0xtrq'  # the signature can be a fixed value
        self.__get_type()  # classify the target: URL or ID
        # The download directory is not given here; it is supplied as part of
        # each file name when tasks are added.
        self.aria2 = Download()
        self.has_more = True
        self.videos = []

    def __get_type(self):
        """Classify ``self.param``: a URL is resolved, anything else is the ID."""
        if '://' in self.param:
            self.__url2redirect()
        else:
            self.id = self.param

    def __url2redirect(self):
        """
        Resolve a short link to its redirect target and store it in ``self.url``.

        When the request fails or the response carries no ``Location``
        header, ``self.param`` is used unchanged.
        """
        headers = {  # a mobile UA; kept from the time watermark removal needed it
            'User-Agent':
            'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1 Edg/89.0.4389.82'
        }
        try:
            r = self.http.head(self.param, headers=headers, allow_redirects=False)
            self.url = r.headers['Location']
        except Exception:
            self.url = self.param

    def __url2id(self):
        """Take the 4th path segment of ``self.url`` as the ID ('' if absent)."""
        try:
            self.id = urlparse(self.url).path.split('/')[3]
        except IndexError:
            self.id = ''

    def __url2uid(self):
        """Take the ``sec_uid`` query parameter of ``self.url`` as the ID ('' if absent)."""
        try:
            query = urlparse(self.url).query
            self.id = parse_qs(query)['sec_uid'][0]
        except (KeyError, IndexError):
            self.id = ''

    def get_sign(self):
        """
        Return the request signature.

        The web signing algorithm is no longer needed, so a fixed value is
        stored in ``self.sign`` and returned.
        """
        self.sign = 'TG2uvBAbGAHzG19a.rniF0xtrq'
        return self.sign

    def __fetch_info(self, url, key):
        """
        GET ``url`` and return the dict stored under ``key`` in the JSON reply.

        Any failure, or a value under ``key`` that is not a dict (for example
        ``null``), yields an empty dict so callers can always use ``.get``.
        """
        try:
            info = self.http.get(url).json().get(key)
        except Exception:
            return dict()
        return info if isinstance(info, dict) else dict()

    def __set_download_path(self, ident, title):
        """Build ``<ident>_<title>_<type>`` and store it, sanitised, as the download path."""
        username = '{}_{}_{}'.format(ident, title, self.type)
        self.download_path = Download.title2path(username)  # illegal characters must be handled first

    def get_user_info(self):
        """
        Fetch the user's profile into ``self.user_info`` and set the download path.
        """
        if self.url:
            self.__url2uid()
        url = 'https://www.iesdouyin.com/web/api/v2/user/info/?sec_uid=' + self.id
        self.user_info = self.__fetch_info(url, 'user_info')
        self.__set_download_path(self.user_info.get('short_id', '0'),
                                 self.user_info.get('nickname', '无昵称'))

    def get_challenge_info(self):
        """
        Fetch the hashtag challenge's details into ``self.challenge_info``
        and set the download path.
        """
        if self.url:
            self.__url2id()
        url = 'https://www.iesdouyin.com/web/api/v2/challenge/info/?ch_id=' + self.id
        self.challenge_info = self.__fetch_info(url, 'ch_info')
        self.__set_download_path(self.challenge_info.get('cid', '0'),
                                 self.challenge_info.get('cha_name', '无标题'))

    def get_music_info(self):
        """
        Fetch the music track's details into ``self.music_info`` and set the
        download path.
        """
        if self.url:
            self.__url2id()
        url = 'https://www.iesdouyin.com/web/api/v2/music/info/?music_id=' + self.id
        self.music_info = self.__fetch_info(url, 'music_info')
        self.__set_download_path(self.music_info.get('mid', '0'),
                                 self.music_info.get('title', '无标题'))

    def crawling_users_post(self):
        """Collect the videos a user has posted."""
        self.type = 'post'
        self.__crawling_user()

    def crawling_users_like(self):
        """Collect the videos a user has liked."""
        self.type = 'like'
        self.__crawling_user()

    def crawling_challenge(self):
        """Collect the videos of a hashtag challenge."""
        self.type = 'challenge'
        self.get_challenge_info()  # also sets the download directory
        # Example request:
        # https://www.iesdouyin.com/web/api/v2/challenge/aweme/?ch_id=1570693184929793&count=9&cursor=9&aid=1128&screen_limit=3&download_click_limit=0&_signature=AXN-GQAAYUTpqVxkCT6GHQFzfg
        params = {
            "ch_id": self.id,
            "count": "21",  # may be raised; the site's own value is 9
            "cursor": '0',
            "aid": "1128",
            "screen_limit": "3",
            "download_click_limit": "0",
            "_signature": self.sign
        }
        self.__crawl_pages('https://www.iesdouyin.com/web/api/v2/challenge/aweme/', params, 'cursor', '话题挑战')

    def crawling_music(self):
        """Collect the videos that use a music track."""
        self.type = 'music'
        self.get_music_info()  # also sets the download directory
        # Example request:
        # https://www.iesdouyin.com/web/api/v2/music/list/aweme/?music_id=6928362875564067592&count=9&cursor=18&aid=1128&screen_limit=3&download_click_limit=0&_signature=5ULmIQAAhRYNmMRcpDm2COVC5j
        params = {
            "music_id": self.id,
            "count": "21",  # may be raised; the site's own value is 9
            "cursor": '0',
            "aid": "1128",
            "screen_limit": "3",
            "download_click_limit": "0",
            "_signature": self.sign
        }
        self.__crawl_pages('https://www.iesdouyin.com/web/api/v2/music/list/aweme/', params, 'cursor', '音乐原声')

    def __crawling_user(self):
        """Collect a user's posts or likes, depending on ``self.type``."""
        self.get_user_info()  # the nickname becomes part of the download directory
        # Example request:
        # https://www.iesdouyin.com/web/api/v2/aweme/post/?sec_uid=MS4wLjABAAAAaJO9L9M0scJ_njvXncvoFQj3ilCKW1qQkNGyDc2_5CQ&count=21&max_cursor=0&aid=1128&_signature=DrXeeAAAbwPmb.wFM3e63w613m&dytk=
        url = 'https://www.iesdouyin.com/web/api/v2/aweme/{}/'.format(self.type)
        params = {
            "sec_uid": self.id,
            "count": "21",
            "max_cursor": 0,
            "aid": "1128",
            "_signature": self.sign,
            "dytk": ""
        }
        self.__crawl_pages(url, params, 'max_cursor', '作品')

    def __crawl_pages(self, url, params, cursor_key, label):
        """
        Page through a list endpoint until ``self.has_more`` becomes falsy.

        :param url: list endpoint to query.
        :param params: query parameters of the first page; updated in place
            with the cursor taken from each reply.
        :param cursor_key: name of the cursor field, used both as the request
            parameter and as the key read from the reply.
        :param label: Chinese name of what is being collected, used in the
            progress messages.

        A failed page is reported and retried, but after ``MAX_ERRORS``
        failures in a row the loop gives up instead of retrying forever.
        ``except Exception`` is used so that Ctrl-C still interrupts the crawl.
        """
        failures = 0
        while self.has_more:
            try:
                res = self.http.get(url, params=params).json()
                params[cursor_key] = res[cursor_key]
                self.has_more = res['has_more']
                self.__append_videos(res)
            except Exception as err:
                failures += 1
                print('{}采集出错'.format(label), err)
                if failures >= self.MAX_ERRORS:
                    print('连续失败次数过多,停止采集')
                    break
            else:
                failures = 0
        print('{}采集完成'.format(label))

    def __append_videos(self, res):
        """
        Append the videos of one response page to ``self.videos``.

        Each stored entry is the item's ``statistics`` dict (minus
        ``forward_count`` and ``play_count``) extended with ``desc``, ``uri``,
        ``play_addr`` and ``dynamic_cover``. When ``self.limit`` is reached the
        list is truncated to the limit and ``self.has_more`` is cleared.
        """
        aweme_list = res.get('aweme_list')
        if not aweme_list:
            # The page came back empty (possibly while more videos remain).
            print('未采集完成,但返回作品列表为空')
            return
        for item in aweme_list:
            info = item['statistics']
            # Tolerate replies that omit these counters.
            info.pop('forward_count', None)
            info.pop('play_count', None)
            video = item['video']
            info['desc'] = Download.title2path(item['desc'])  # illegal characters must be handled first
            info['uri'] = video['play_addr']['uri']
            info['play_addr'] = video['play_addr']['url_list'][0]
            info['dynamic_cover'] = video['dynamic_cover']['url_list'][0]
            self.videos.append(info)
            # Download tasks could be queued right here, but downloading while
            # crawling would compete for bandwidth, so that is done afterwards.
            if self.limit and len(self.videos) >= self.limit:
                # A limit was requested and has been reached: stop collecting.
                self.has_more = False
                self.videos = self.videos[:self.limit]
                return

    def __on_finish(self, gid):
        """
        Callback for a task that completed, stopped or failed.

        Once no task is active or waiting, listening and the status loop are
        stopped, which lets the program end.
        """
        print(self.aria2.get_files(gid)[0]['path'], '任务完成(成功/停止/失败)')
        stat = self.aria2.get_stat()
        print('当前下载信息:', stat.__dict__)
        if stat.num_active + stat.num_waiting == 0:  # nothing left in the queue
            self.aria2.stop_listening()
            self.aria2.stop_loop()
            print('当前任务队列下载完成,3秒后结束当前任务')
            time.sleep(3)
            print('任务已完成')

    def __on_loop(self, info: list):
        """
        Callback of the status loop; ``info`` describes the active downloads
        (per the original notes: gid, files, totalLength, completedLength,
        downloadSpeed).

        Intentionally a no-op for now; kept as a hook for a future UI.
        """
        pass

    def __on_pause(self, gid):
        """Callback for a paused task."""
        print(gid, '任务暂停')

    def download_all(self):
        """
        Queue every collected video for download and start listening for
        task events.

        ``__on_finish`` stops listening by itself once the queue is empty.
        Per the original author's notes, a paused task prevents that
        automatic stop, and listening that is never stopped keeps the
        process from exiting, so in that case it must be stopped externally.
        """
        for video in self.videos:
            self.aria2.download(url=video['play_addr'],
                                filename='{}/{}_{}.mp4'.format(self.download_path, video['aweme_id'],
                                                               video['desc']))
        # NOTE(review): on_start is also wired to __on_finish, so a starting
        # task prints the "finished" message — confirm against the Download
        # wrapper whether that is intended.
        self.aria2.start_listening(on_start=self.__on_finish,
                                   on_stop=self.__on_finish,
                                   on_complete=self.__on_finish,
                                   on_error=self.__on_finish,
                                   on_pause=self.__on_pause)
        # The status loop is meant to feed a UI once one exists.
        self.aria2.start_loop(on_loop=self.__on_loop)
        print('下载任务投递完成')
class Task(object):
    """
    Command-line front end: collect one category of videos for a target,
    save the collected list as JSON and hand the videos to the downloader.
    """

    def __init__(self, type='user', limit=0):
        """
        :param type: what to collect: 'user', 'like', 'challenge' or 'music'
            (default 'user').
        :param limit: maximum number of videos per target; 0 (default) means
            everything.

        Creates the '下载' output directory if it does not exist yet.
        """
        self.__type = type
        self.__limit = limit
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs('下载', exist_ok=True)

    def download(self, target):
        """
        Collect and download a single target.

        :param target: share URL or raw ID.

        Prints an error and returns without doing anything else when the
        task type is not one of the supported values.
        """
        douyin = Douyin(target, self.__limit)
        print('开始采集')
        crawlers = {
            'user': douyin.crawling_users_post,  # a user's posts
            'like': douyin.crawling_users_like,  # a user's likes (marked unavailable by the author)
            'challenge': douyin.crawling_challenge,  # hashtag challenge
            'music': douyin.crawling_music,  # music track
        }
        crawl = crawlers.get(self.__type)
        if crawl is None:
            print('输入格式错误')
            return
        crawl()
        with open('下载/{}.json'.format(douyin.download_path), 'w', encoding='utf-8') as f:
            json.dump(douyin.videos, f, ensure_ascii=False)  # keep Chinese text readable
        print('开始下载')
        douyin.download_all()
        print('当前任务流程结束,等待下载完成')

    def download_batch(self, target):
        """
        Collect and download every target listed in a text file.

        :param target: path of a UTF-8 file with one URL or ID per line.

        Surrounding whitespace is stripped and blank lines are skipped, so an
        empty line no longer starts a crawl with an empty ID.
        """
        with open(target, 'r', encoding='utf-8') as f:
            for line in f:
                entry = line.strip()
                if entry:
                    self.download(entry)
if __name__ == "__main__":
    # Step 1: create the task object. With no arguments it collects a
    # user's posts. Other categories:
    #   Task(type='like', limit=10)        user's likes (not available)
    #   Task(type='music', limit=10)       music track
    #   Task(type='challenge', limit=10)   hashtag challenge
    task = Task()

    # Step 2a: download a single target. A long share link such as
    #   https://www.iesdouyin.com/share/user/110812020268?u_code=16ak94dc7&sec_uid=MS4wLjABAAAAaJO9L9M0scJ_njvXncvoFQj3ilCKW1qQkNGyDc2_5CQ&utm_campaign=client_share&app=aweme&utm_medium=ios&tt_from=copy&utm_source=copy
    # or a short link such as
    #   https://v.douyin.com/e8hHxQf/
    # can be used in place of the bare sec_uid below.
    target = 'MS4wLjABAAAAaJO9L9M0scJ_njvXncvoFQj3ilCKW1qQkNGyDc2_5CQ'
    task.download(target)

    # Step 2b: batch download from a file with one link/ID per line, e.g.
    #   task.download_batch('user.txt')
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/renshuhao/douyin.git
git@gitee.com:renshuhao/douyin.git
renshuhao
douyin
抖音
master

搜索帮助