1 Star 3 Fork 0

萧石/public_data

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
新华广播.py 8.27 KB
一键复制 编辑 原始数据 按行查看 历史
undefined 提交于 2024-11-20 15:15 . 新华广播音频采集
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName :新华广播.py
# @Time :2024/5/10
# @Author :CL
# @email :1037654919@qq.com
# Source page: http://www.xinhuanet.com/video/xinhuaradio/xhfm/index.html
# Downloads Xinhua Radio audio files and the text of the accompanying article pages
import os
import requests
from bs4 import BeautifulSoup # xpath re
from util import mongo_manager
# Default request headers shared by get_list() and download_yinpin().
# They describe an XHR-style request: JSON is preferred in "Accept" and
# "X-Requested-With" is set to XMLHttpRequest.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Referer": "http://www.xinhuanet.com/video/xinhuaradio/qwfb/index.html",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
}

# Default cookies sent together with the headers above.
cookies = {
    "wdcid": "5fca58f6c43ae0aa",
    "uid": "f9468db4524b4d32911c51edb0f19ca9",
    "wdlast": "1715320951",
}

# Handle returned by the project-local mongo_manager helper for the
# 'xinhuaguangbo_url' collection in the 'public_data' database; main() uses
# it to look up, insert and update crawl records.
xinhuaguangbo_url = mongo_manager("xinhuaguangbo_url", db="public_data")
def get_list(
    url="http://www.xinhuanet.com/video/xinhuaradio/qwfb/ds_0ba2c0504aff46d4ab66266baad46720.json",
    timeout=30,
):
    """Fetch a channel's JSON feed and return the decoded payload.

    Args:
        url: Address of the JSON feed to download.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            server cannot block the crawl indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (via ``raise_for_status``). Errors raised by
            ``response.json()`` for a non-JSON body also propagate.
    """
    # verify=False disables TLS certificate verification, as in the original.
    response = requests.get(
        url,
        headers=headers,
        cookies=cookies,
        verify=False,
        timeout=timeout,
    )
    print(response.url, response)
    # Surface 4xx/5xx as an explicit HTTP error instead of letting an error
    # page reach the JSON decoder.
    response.raise_for_status()
    return response.json()
def download_yinpin(
    pathname=None,
    url="https://vodpub1.v.news.cn/original/20200521/c2a6e0459b114aacb7710b516a560c14.mp3",
    timeout=(10, 120),
):
    """Download the audio file at *url* to ``pathname`` plus the URL's extension.

    Args:
        pathname: Target path WITHOUT extension. Required despite the
            ``None`` default, which is kept only for signature compatibility.
        url: Direct link to the audio file.
        timeout: Timeout passed to ``requests.get``; a
            ``(connect, read)`` tuple or a single number of seconds.

    Returns:
        1 when the server answered 200 and the file was written, otherwise 0
        (non-200 status or a network error). On failure any file at the
        target path is removed.

    Raises:
        ValueError: If *pathname* is None.
        OSError: If the target file cannot be written; a partially written
            file is removed before the error propagates.
    """
    from urllib.parse import urlsplit

    if pathname is None:
        # Fail before any network traffic instead of with a TypeError later.
        raise ValueError("pathname is required (target path without extension)")

    # Take the extension from the URL *path* only, so a query string or
    # fragment never leaks into the file name. An extension-less URL yields
    # an empty suffix and the file is written to `pathname` as given.
    suffix = os.path.splitext(urlsplit(url).path)[1]
    target = pathname + suffix

    try:
        # stream=True avoids holding the whole audio file in memory.
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            stream=True,
            timeout=timeout,
        )
        try:
            print(response.url, response)
            if response.status_code == 200:
                with open(target, "wb") as f:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
                return 1
        finally:
            response.close()
    except requests.RequestException as exc:
        # Must be listed before OSError: RequestException derives from it.
        print(f"download failed for {url}: {exc}")
    except OSError:
        # Local write failure: drop the partial file, then let it propagate.
        if os.path.exists(target):
            os.remove(target)
        raise

    # Failure path (non-200 or network error): remove any file at the target,
    # matching the original clean-up behaviour.
    if os.path.exists(target):
        os.remove(target)
    return 0
def get_txt(
    url='http://www.xinhuanet.com/video/20211015/C99015FF8110000124621000A9AACC00/c.html',
    timeout=30,
):
    """Fetch an article page and return the text of its content paragraphs.

    Args:
        url: Address of the article page.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The text of the selected ``<p>`` elements inside
        ``<span id="detailContent">``, joined with newlines and stripped;
        ``None`` when the request fails or the page has no such span.
    """
    # These locals intentionally shadow the module-level headers/cookies:
    # the article page is requested with an HTML "Accept" header and without
    # the XHR marker used for the JSON feeds.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Referer": "http://www.xinhuanet.com/video/xinhuaradio/qwfb/index.html",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    }
    cookies = {
        "wdcid": "5fca58f6c43ae0aa",
        "uid": "f9468db4524b4d32911c51edb0f19ca9",
        "wdlast": "1715324835",
    }

    try:
        # verify=False disables TLS certificate verification, as before.
        response = requests.get(
            url,
            headers=headers,
            cookies=cookies,
            verify=False,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Network problems are an expected outcome for a crawler: report them
        # and signal "no text" rather than silently swallowing every error.
        print(f'get_txt request failed for {url}: {exc}')
        return None

    print(response.url, response)
    soup = BeautifulSoup(response.text, 'html.parser')

    container = soup.find('span', id='detailContent')
    if container is None:
        # Page layout without the expected content span.
        return None

    # style=None presumably restricts the match to <p> tags that carry no
    # inline style attribute -- TODO confirm against the BeautifulSoup docs.
    paragraphs = container.find_all('p', style=None)
    return '\n'.join(p.text for p in paragraphs).strip()
def main():
    """Crawl the enabled channel feeds and archive every item not yet stored.

    For each (tag, feed URL) pair in the local table the feed is fetched with
    get_list(). Every entry of its 'datasource' list whose contentId is not
    found in the xinhuaguangbo_url collection is turned into a record: the
    article text comes from get_txt(), the audio is saved by
    download_yinpin(), and on success the text is written next to the audio
    as '<contentId>.txt'. The record is then stored with status 'done' or
    'failed'.
    """
    # Full list of channel tags; kept for reference only, nothing below reads it.
    tags = [
        "qwfb", "xhfm", "lxlsj", "jqjszjj", "gdsn", "hzo", "sjlz", "xdgsxczx", "zgmp", "zfhss",
        "hbzs", "mdgw", "zs",
        "yjz", "jsz", "lwsai", "zmqn", "cjzs", "cjtop",
    ]

    # Channel tag -> JSON feed URL. Commented-out entries are disabled;
    # uncomment one to include that channel in the crawl.
    feed_urls = {
        # "qwfb": "http://www.xinhuanet.com/video/xinhuaradio/qwfb/ds_0ba2c0504aff46d4ab66266baad46720.json",
        # "xhfm": "http://xinhuanet.com/video/xinhuaradio/xhfm/ds_eb0e811e69044f35884443ef8b03da1f.json",
        # "lxlsj": "http://xinhuanet.com/video/xinhuaradio/lxlsj/ds_cd3e28e5d9fd455183b00c549fcdd425.json",
        # "jqjszjj":'http://xinhuanet.com/video/xinhuaradio/jqjszjj/ds_168a8cd1c9644747b3d201a009de4418.json',
        # "gdsn":'http://xinhuanet.com/video/xinhuaradio/gdsn/ds_d2df34ed976347acb42e2a7c9ab0632c.json',
        # "hzo":'http://xinhuanet.com/video/xinhuaradio/hzo/ds_26b132a7e45442eab07bf9bae388b735.json',
        # "sjlz":'http://xinhuanet.com/video/xinhuaradio/sjlz/ds_f18e26ad1f4b4363986899f08831a8f9.json',
        # 'xdgsxczx':'http://xinhuanet.com/video/xinhuaradio/xdgsxczx/ds_87a912ebb2464011a9a171245417293e.json',
        # 'zgmp':'http://xinhuanet.com/video/xinhuaradio/zgmp/ds_b8316458e31c4402b5741030b2e1b317.json',
        # 'zfhss':'http://xinhuanet.com/video/xinhuaradio/zfhss/ds_4c7124cc75b9457b8100e4bcc2f7f53d.json',
        # 'hbzs':'http://xinhuanet.com/video/xinhuaradio/hbzs/ds_321350eb1d91422f9863bc63098fecb7.json',
        # 'mdgw':'https://xinhuanet.com/video/xinhuaradio/mdgw/ds_e033a06bf423430997fdf375b1dc1619.json',
        # 'zs':'http://xinhuanet.com/video/xinhuaradio/zs/ds_4e87ef489d0640cd994469e71143a56b.json',
        "yjz": 'http://xinhuanet.com/video/xinhuaradio/yjz/ds_b446851d86e84e0a9b4a002035960575.json',
        "jsz": 'http://xinhuanet.com/video/xinhuaradio/jsz/ds_3181847f80254ffabcfddae05df4c4d0.json',
        "lwsai": 'http://xinhuanet.com/video/xinhuaradio/lwsai/ds_10f0e15e68df4f5288fbdc910e92a688.json',
        "zmqn": 'http://xinhuanet.com/video/xinhuaradio/zmqn/ds_2b4a8942680b4d338b011dcdfece51e7.json',
        "cjzs": 'http://xinhuanet.com/video/xinhuaradio/cjzs/ds_d94fcb9890f3465e893c161a067abff3.json',
        "cjtop": 'http://xinhuanet.com/video/xinhuaradio/cjtop/ds_ee2d9e3d43ac4595a231fd1d5ced8944.json',
    }

    for tag, url in feed_urls.items():
        print(f'begin {tag},url:{url}')
        # One output directory per channel tag.
        path = f"/home/chenglei3/work/data/xinhuaradio/{tag}/"
        os.makedirs(path, exist_ok=True)

        listing = get_list(url=url)
        for item in listing['datasource']:
            content_id = item['contentId']
            # Skip items that already have a record in the collection.
            if xinhuaguangbo_url.findOne({'_id': content_id}) is not None:
                continue

            record = {
                'title': item['title'],
                'publishUrl': 'http://www.xinhuanet.com' + item['publishUrl'],
                '_id': content_id,
                'keywords': item['keywords'],
                'summary': item['summary'],
                'publishTime': item['publishTime'],
            }
            # Prefer 'quote' as the audio link; fall back to 'multimediaLink'
            # when 'quote' is the empty string.
            quote = item['quote']
            record['voice_url'] = quote if quote != '' else item['multimediaLink']
            record['text'] = get_txt(record['publishUrl'])

            # Save the audio (this step could be run separately).
            print(f'{record["_id"]} 音频地址为{record["voice_url"]}')
            has_inputs = record['voice_url'] != '' and record['text'] is not None
            if not has_inputs:
                print(f'{record["_id"]} 音频地址为空')
                record['status'] = 'failed'
            elif download_yinpin(pathname=path + record['_id'], url=record['voice_url']) == 1:
                record['status'] = 'done'
                # Store the article text alongside the audio file.
                with open(path + record['_id'] + '.txt', 'w', encoding='utf-8') as f:
                    f.write(record['text'])
            else:
                record['status'] = 'failed'

            try:
                xinhuaguangbo_url.insertOne(record)
            except Exception as e:
                # Insert failed (presumably a duplicate _id -- verify against
                # mongo_manager); fall back to updating the existing record.
                print(e)
                xinhuaguangbo_url.updateOne({'_id': record['_id']}, record)
# Script entry point: emit one blank line, then run the crawl.
if __name__ == "__main__":
    print()
    main()
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/beihai_xiaoshi/public_data.git
git@gitee.com:beihai_xiaoshi/public_data.git
beihai_xiaoshi
public_data
public_data
master

搜索帮助