1 Star 0 Fork 1

sinkids/douyin_video

forked from 萧石/douyin_video 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
抖音基于关键词获取视频.py 6.25 KB
一键复制 编辑 原始数据 按行查看 历史
萧石 提交于 2024-11-21 08:57 . 首次上传
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :test.py
# @Time :2024/4/27
# @Author :coding
# @email :
from multiprocessing import Pool
from DrissionPage import ChromiumPage
from DataRecorder import Recorder
import time
import random
import os
import datetime
from util import mongo_manager
# Module-level storage handle that run() uses to insert every scraped row.
# It is created at import time, so importing this module already calls
# mongo_manager().
# NOTE(review): mongo_manager comes from the project-local `util` module;
# presumably this binds to the 'douyin_videos' collection of the
# 'public_data' database - confirm against util.mongo_manager.
douyin_videos = mongo_manager('douyin_videos', db='public_data')
def countdown(n):
    """Show a single-line countdown on stdout, one tick per second.

    Args:
        n: Number of seconds to count down from. For ``n <= 0`` no tick is
            printed and only the closing message is emitted.
    """
    for remaining in range(n, 0, -1):
        # '\r' moves the cursor back to the start of the line and end=''
        # suppresses the newline, so each tick overwrites the previous one.
        # flush=True is needed because a terminal's stdout is line-buffered:
        # without a newline the ticks would not be shown until the very end.
        print(f'\r倒计时{remaining}秒', end='', flush=True)
        time.sleep(1)
    # Always printed (the original used a break-less for/else for this).
    print('\r倒计时结束')
def sign_in():
    """Open the Douyin home page and pause so the user can log in by QR code.

    The function only opens the page, prints a prompt and then waits a fixed
    30 seconds via countdown(); it does not check whether the login succeeded.
    """
    login_url = 'https://www.douyin.com/?recommend=1'
    browser = ChromiumPage()
    browser.get(login_url)
    print('请扫码登录')
    # Fixed window in which the user is expected to finish scanning.
    countdown(30)
def delete_file(file_path):
    """Delete the initial (placeholder) Excel file and report the outcome.

    Args:
        file_path: Path of the file to remove.

    A missing file is not an error: a message is printed instead. Any other
    failure from ``os.remove`` (e.g. the path is a directory, or permission
    is denied) propagates to the caller.
    """
    # Try the removal directly instead of checking os.path.exists() first:
    # that avoids the window in which the file could disappear between the
    # check and the delete.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"文件不存在:{file_path} ")
    else:
        print(f"已删除初始化excel文件:{file_path}")
def _parse_video(v, keyword):
    """Flatten one search-result item into the row stored in Excel and MongoDB.

    Raises:
        KeyError / TypeError / ValueError / AttributeError: when the item
            does not have the expected ``aweme_info`` structure.
    """
    aweme = v['aweme_info']
    author = aweme['author']
    stats = aweme['statistics']
    aweme_id = aweme['aweme_id']
    # create_time is passed to fromtimestamp(), i.e. treated as a Unix
    # timestamp and rendered in local time.
    published = datetime.datetime.fromtimestamp(aweme['create_time'])
    return {
        '关键词': keyword,
        '视频标题': aweme['desc'].strip(),
        '发布时间': published.strftime('%Y-%m-%d %H:%M:%S'),
        '点赞': stats['digg_count'],
        '评论': stats['comment_count'],
        '收藏': stats['collect_count'],
        '转发': stats['share_count'],
        '作者名称': author['nickname'],
        '作者粉丝数': author['follower_count'],
        '视频ID': aweme_id,
        '视频链接': "https://www.douyin.com/video/" + aweme_id,
        '作者主页地址': "https://www.douyin.com/user/" + author['sec_uid'],
        # The raw duration is divided by 1000 to fill the "seconds" column.
        '时长(秒)': int(aweme['video']['duration']) / 1000,
        '作者ID': author['uid'],
        '作者认证信息': author['enterprise_verify_reason'],
        'data_type': '舱内',
    }


def _capture_payload(page, timeout=5):
    """Wait for the next intercepted search packet and return its body, or None."""
    packet = page.listen.wait(timeout=timeout)
    try:
        # Same content as the "response" pane of the browser dev tools.
        return packet.response.body
    except Exception as e:
        # NOTE(review): listen.wait() appears to return a non-packet value on
        # timeout, which lands here - confirm against the DrissionPage docs.
        print('没有监听到更多视频,结束运行。', e)
        return None


def _video_list(payload):
    """Return the list stored under payload['data'], or None if there is none."""
    if isinstance(payload, dict) and isinstance(payload.get('data'), list):
        return payload['data']
    return None


def _collect_videos(page, recorder, keyword, max_videos):
    """Scroll the result page and store videos until a stop condition is hit.

    Returns the number of rows successfully inserted into MongoDB.
    """
    count = 0
    while count <= max_videos:
        # Scroll to the bottom twice, pausing the same random 2-3 s each time.
        pause = random.uniform(2, 3)
        for _ in range(2):
            page.scroll.to_bottom()
            time.sleep(pause)

        payload = _capture_payload(page)
        if payload is None:
            time.sleep(10)
            break

        videos = _video_list(payload)
        if videos is None:
            # The response carried no video list: wait and try exactly once
            # more. The original retried here without any guard and crashed
            # when the second wait produced nothing.
            print('响应中没有视频列表,10秒后重试一次。')
            time.sleep(10)
            payload = _capture_payload(page)
            videos = _video_list(payload)
            if videos is None:
                if payload is not None:
                    print('没有监听到更多视频,结束运行。')
                break

        for v in videos:
            # Fixed 10 s pause before each item, kept from the original.
            time.sleep(10)
            try:
                info = _parse_video(v, keyword)
            except (KeyError, TypeError, ValueError, AttributeError) as e:
                # One malformed item must not abort the whole crawl.
                print('解析失败,跳过该条', e)
                continue
            print(info['视频ID'], info['视频标题'], info['点赞'],
                  info['评论'], info['收藏'], info['转发'])
            recorder.add_data(info)
            try:
                # NOTE(review): '_id' is added to the same dict that was just
                # handed to the recorder; kept as in the original.
                info["_id"] = info['视频ID']
                douyin_videos.insertOne(info)
                count += 1
            except Exception as e:
                print('插入失败', e)
                # NOTE(review): indentation was lost in the source; this 3 s
                # pause is assumed to belong to the failure branch.
                time.sleep(3)

        # Continue only while the response says more results are available.
        if payload.get('has_more') != 1:
            break
    return count


def run(keyword='纯人声朗诵', max_videos=300):
    """Scrape Douyin video search results for one keyword.

    Opens the search page, listens for the search API responses while
    scrolling, and writes every parsed video to an Excel file (via Recorder)
    and to the module-level ``douyin_videos`` store. Finally the Excel file is
    saved again under a name that contains the number of stored videos and
    the initial file is deleted.

    Args:
        keyword: Search keyword; also used in the output file names.
        max_videos: The crawl keeps requesting new batches while the number
            of successfully inserted rows is ``<= max_videos``. A batch that
            has been started is always processed completely.
    """
    page = ChromiumPage()
    try:
        url = f'https://www.douyin.com/search/{keyword}?type=video'
        # Start listening before navigating so the first response is captured.
        page.listen.start('www.douyin.com/aweme/v1/web/search/item', method='GET')
        page.get(url)

        # Create the Excel file that receives the rows.
        init_file_path = f'数据/抖音搜索-{keyword}.xlsx'
        os.makedirs(os.path.dirname(init_file_path), exist_ok=True)
        r = Recorder(path=init_file_path, cache_size=100)

        try:
            lens = _collect_videos(page, r, keyword, max_videos)
        finally:
            # Force a save so cached rows are not lost, even if the crawl raised.
            r.record()

        print('**' * 10)
        print(
            f'抖音关键词【{keyword}】总计获取到【{lens}】条视频')
        # Save under the final name, then drop the initial file.
        final_file_path = f'数据/抖音搜索-{keyword}-{lens}条视频.xlsx'
        r.record(final_file_path)
        print(f'成功将文件另存为:{final_file_path}')
        delete_file(init_file_path)
    finally:
        # Close the page even when something above failed.
        page.close()
def main(keywords=None, processes=4):
    """Run ``run`` for several keywords in parallel worker processes.

    Args:
        keywords: Iterable of search keywords. When omitted, the module-level
            ``keywords`` variable is used if it exists (that is what the
            original implementation read implicitly; it is only assigned when
            the file is executed as a script).
        processes: Number of worker processes in the pool.

    Returns:
        None. Does nothing when there are no keywords.
    """
    if keywords is None:
        # Backward compatibility with the implicit global lookup, but without
        # a NameError when the module is imported instead of run as a script.
        keywords = globals().get('keywords')
    if not keywords:
        return
    pool = Pool(processes=processes)
    try:
        pool.map(run, keywords)
    finally:
        # Release the workers even if one of the tasks raised.
        pool.close()
        pool.join()
if __name__ == '__main__':
    # Step 1: log in. This call can be commented out from the second run on.
    sign_in()

    # Kept as a module-level name because main() looks it up.
    keywords = ['狗 车内视频', '猫 车内视频']

    # Step 2: crawl the keywords one after another, pausing 100 s after each.
    for kw in keywords:
        run(keyword=kw)
        print(kw, '采集完毕,休息100秒')
        time.sleep(100)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/sinkids/douyin_video.git
git@gitee.com:sinkids/douyin_video.git
sinkids
douyin_video
douyin_video
master

搜索帮助