# Repository path: public_data/新华广播.py (branch: master)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# @FileName  :main.py
# @Time      :2024/5/10
# @Author    :CL
# @email     :1037654919@qq.com
# Source page: http://www.xinhuanet.com/video/xinhuaradio/xhfm/index.html
# Downloads Xinhua Radio audio files together with their article text.
import os


import requests
from bs4 import BeautifulSoup  # xpath  re
from util import mongo_manager

# Default HTTP request headers used by get_list() and download_yinpin().
# get_txt() builds its own local header set and does not read this one.
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Referer": "http://www.xinhuanet.com/video/xinhuaradio/qwfb/index.html",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
# Cookies sent alongside the headers above by get_list() and download_yinpin().
# NOTE(review): these look like values captured from a single browser
# session - confirm whether the site actually requires them.
cookies = {
    "wdcid": "5fca58f6c43ae0aa",
    "uid": "f9468db4524b4d32911c51edb0f19ca9",
    "wdlast": "1715320951"
}
# Storage handle created by the project-local mongo_manager helper.
# main() calls findOne / insertOne / updateOne on it to de-duplicate and
# persist one record per content id.
# NOTE(review): presumably wraps the 'xinhuaguangbo_url' collection in the
# 'public_data' database - confirm against util.mongo_manager.
xinhuaguangbo_url  = mongo_manager('xinhuaguangbo_url',db='public_data')
def get_list(url="http://www.xinhuanet.com/video/xinhuaradio/qwfb/ds_0ba2c0504aff46d4ab66266baad46720.json",
             timeout=30):
    """Fetch one channel's JSON audio index and return it decoded.

    Args:
        url: Address of the channel's ``ds_*.json`` data source.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded JSON payload (main() reads its ``datasource`` key).

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # verify=False is kept from the original call: TLS certificate checks
    # are disabled for this request.
    response = requests.get(
        url,
        headers=headers,
        cookies=cookies,
        verify=False,
        timeout=timeout,
    )
    print(response.url, response)

    # Fail with a clear HTTP error instead of a confusing JSON decode error
    # when the server answers with an error page.
    response.raise_for_status()
    return response.json()

def download_yinpin(pathname=None,
                    url="https://vodpub1.v.news.cn/original/20200521/c2a6e0459b114aacb7710b516a560c14.mp3",
                    timeout=60):
    """Download the audio file at *url* and save it next to *pathname*.

    The file is written to ``pathname + <extension of the URL path>``, e.g.
    ``pathname + '.mp3'``. On any download failure no file is left at that
    location.

    Args:
        pathname: Target path without extension. When ``None`` the file
            name found in the URL is used, relative to the current working
            directory.
        url: Direct link to the audio file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        1 if the file was downloaded and written, 0 if the request failed
        or the server did not answer with HTTP 200.

    Raises:
        OSError: If the target file cannot be written (the partially
            written file is removed first).
    """
    from urllib.parse import urlparse

    # Take the extension from the URL *path* only, so a query string or a
    # dot in the host name cannot end up in the file name.
    url_path = urlparse(url).path
    suffix = os.path.splitext(url_path)[1]
    if pathname is None:
        pathname = os.path.splitext(os.path.basename(url_path))[0] or "audio"
    target = pathname + suffix

    try:
        # Stream the body so large audio files are not held in memory.
        with requests.get(url, headers=headers, cookies=cookies,
                          stream=True, timeout=timeout) as response:
            print(response.url, response)
            if response.status_code == 200:
                with open(target, "wb") as f:
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
                return 1
    except requests.RequestException as exc:
        # Must be caught before OSError: RequestException derives from it.
        print(f'{url} download failed: {exc}')
    except OSError:
        # Local write problem: do not leave a truncated file behind, but let
        # the caller see the error.
        if os.path.exists(target):
            os.remove(target)
        raise

    # Failure path: remove a stale or partially written file, if any.
    if os.path.exists(target):
        os.remove(target)
    return 0

def get_txt(url='http://www.xinhuanet.com/video/20211015/C99015FF8110000124621000A9AACC00/c.html',
            timeout=30):
    """Fetch an article page and return the text of its body paragraphs.

    The text is taken from the ``<p>`` elements inside
    ``<span id="detailContent">``, joined with newlines and stripped.
    NOTE(review): the ``style=None`` filter presumably restricts the match
    to paragraphs without a ``style`` attribute - confirm against the bs4
    version in use.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The extracted text (possibly an empty string), or ``None`` when the
        page cannot be fetched, does not answer with HTTP 200, or has no
        ``detailContent`` element.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "Referer": "http://www.xinhuanet.com/video/xinhuaradio/qwfb/index.html",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    cookies = {
        "wdcid": "5fca58f6c43ae0aa",
        "uid": "f9468db4524b4d32911c51edb0f19ca9",
        "wdlast": "1715324835"
    }

    # Only network problems are treated as "no text"; programming errors and
    # KeyboardInterrupt are no longer swallowed by a bare except.
    try:
        response = requests.get(url, headers=headers, cookies=cookies,
                                verify=False, timeout=timeout)
    except requests.RequestException as exc:
        print(f'{url} text fetch failed: {exc}')
        return None
    print(response.url, response)
    if response.status_code != 200:
        return None

    # When the server sends no charset, requests falls back to ISO-8859-1,
    # which garbles Chinese text; use the encoding detected from the body.
    if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('span', id='detailContent')
    if container is None:
        return None

    paragraphs = container.find_all('p', style=None)
    return '\n'.join(p.text for p in paragraphs).strip()
def _build_record(data):
    """Map one raw entry of a channel list to the document that gets stored."""
    from urllib.parse import urljoin

    return {
        'title': data['title'],
        # urljoin gives the same result as plain concatenation for
        # site-relative paths and also copes with absolute URLs.
        'publishUrl': urljoin('http://www.xinhuanet.com', data['publishUrl']),
        '_id': data['contentId'],
        'keywords': data['keywords'],
        'summary': data['summary'],
        'publishTime': data['publishTime'],
        # Prefer 'quote'; fall back to 'multimediaLink' when it is empty.
        'voice_url': data['quote'] if data['quote'] != '' else data['multimediaLink'],
    }


def _store_media(path, record):
    """Save audio and text for *record* under *path*; return 'done' or 'failed'."""
    if record['voice_url'] == '':
        print(f'{record["_id"]} 音频地址为空')
        return 'failed'
    if record['text'] is None:
        print(f'{record["_id"]} text fetch failed')
        return 'failed'

    base = os.path.join(path, record['_id'])
    if download_yinpin(pathname=base, url=record['voice_url']) != 1:
        return 'failed'

    # The text is written only once the audio is safely on disk, so every
    # .txt file has a matching audio file.
    with open(base + '.txt', 'w', encoding='utf-8') as f:
        f.write(record['text'])
    return 'done'


def main(base_dir="/home/chenglei3/work/data/xinhuaradio/"):
    """Crawl the configured Xinhua Radio channels and archive new items.

    For every channel the JSON index is fetched; each entry whose content id
    is not yet stored gets its article text and audio downloaded into
    ``<base_dir>/<tag>/`` and a record (including a 'done' / 'failed'
    status) saved through ``xinhuaguangbo_url``.

    Args:
        base_dir: Root directory for the downloaded files; one
            sub-directory per channel tag is created below it.
    """
    # Channel tag -> JSON index URL. Commented-out entries are disabled.
    tags_json = {
        # "qwfb": "http://www.xinhuanet.com/video/xinhuaradio/qwfb/ds_0ba2c0504aff46d4ab66266baad46720.json",
        # "xhfm": "http://xinhuanet.com/video/xinhuaradio/xhfm/ds_eb0e811e69044f35884443ef8b03da1f.json",
        # "lxlsj": "http://xinhuanet.com/video/xinhuaradio/lxlsj/ds_cd3e28e5d9fd455183b00c549fcdd425.json",
        # "jqjszjj":'http://xinhuanet.com/video/xinhuaradio/jqjszjj/ds_168a8cd1c9644747b3d201a009de4418.json',
        # "gdsn":'http://xinhuanet.com/video/xinhuaradio/gdsn/ds_d2df34ed976347acb42e2a7c9ab0632c.json',
        # "hzo":'http://xinhuanet.com/video/xinhuaradio/hzo/ds_26b132a7e45442eab07bf9bae388b735.json',
        # "sjlz":'http://xinhuanet.com/video/xinhuaradio/sjlz/ds_f18e26ad1f4b4363986899f08831a8f9.json',
        # 'xdgsxczx':'http://xinhuanet.com/video/xinhuaradio/xdgsxczx/ds_87a912ebb2464011a9a171245417293e.json',
        # 'zgmp':'http://xinhuanet.com/video/xinhuaradio/zgmp/ds_b8316458e31c4402b5741030b2e1b317.json',
        # 'zfhss':'http://xinhuanet.com/video/xinhuaradio/zfhss/ds_4c7124cc75b9457b8100e4bcc2f7f53d.json',
        # 'hbzs':'http://xinhuanet.com/video/xinhuaradio/hbzs/ds_321350eb1d91422f9863bc63098fecb7.json',
        # 'mdgw':'https://xinhuanet.com/video/xinhuaradio/mdgw/ds_e033a06bf423430997fdf375b1dc1619.json',
        # 'zs':'http://xinhuanet.com/video/xinhuaradio/zs/ds_4e87ef489d0640cd994469e71143a56b.json',
        "yjz": 'http://xinhuanet.com/video/xinhuaradio/yjz/ds_b446851d86e84e0a9b4a002035960575.json',
        "jsz": 'http://xinhuanet.com/video/xinhuaradio/jsz/ds_3181847f80254ffabcfddae05df4c4d0.json',
        "lwsai": 'http://xinhuanet.com/video/xinhuaradio/lwsai/ds_10f0e15e68df4f5288fbdc910e92a688.json',
        "zmqn": 'http://xinhuanet.com/video/xinhuaradio/zmqn/ds_2b4a8942680b4d338b011dcdfece51e7.json',
        "cjzs": 'http://xinhuanet.com/video/xinhuaradio/cjzs/ds_d94fcb9890f3465e893c161a067abff3.json',
        "cjtop": 'http://xinhuanet.com/video/xinhuaradio/cjtop/ds_ee2d9e3d43ac4595a231fd1d5ced8944.json',
    }

    for tag, url in tags_json.items():
        print(f'begin {tag},url:{url}')
        path = os.path.join(base_dir, tag)
        os.makedirs(path, exist_ok=True)

        # A single unreachable or malformed channel index must not abort the
        # remaining channels.
        try:
            datas = get_list(url=url)
        except (requests.RequestException, ValueError) as exc:
            print(f'{tag} list fetch failed: {exc}')
            continue

        for data in datas['datasource']:
            # Skip items that already have a stored record.
            if xinhuaguangbo_url.findOne({'_id': data['contentId']}) is not None:
                continue

            record = _build_record(data)
            record['text'] = get_txt(record['publishUrl'])
            print(f'{record["_id"]} 音频地址为{record["voice_url"]}')
            record['status'] = _store_media(path, record)

            # Fall back to an update when the insert is rejected.
            try:
                xinhuaguangbo_url.insertOne(record)
            except Exception as e:
                print(e)
                xinhuaguangbo_url.updateOne({'_id': record['_id']}, record)


# Script entry point: print a blank line, then start the crawl.
if __name__ == "__main__":
    print()
    main()