# Source: tianyancha/spider_enterprise_report.py (branch: master)

import json
import time

import requests
from bs4 import BeautifulSoup
from jsonsearch import JsonSearch
import pymysql
import random

# Open the MySQL connection the crawler works against.
# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration or the environment.
_DB_SETTINGS = {
    "host": "localhost",  # MySQL server address
    "user": "root",  # user name
    "password": "root",  # password
    "database": "robot",  # database (schema) name
}
db = pymysql.connect(**_DB_SETTINGS)
# Cursor object used to execute SQL statements on that connection.
cursor = db.cursor()

# Session identifiers copied from the Tianyancha website: open the site, press
# F12 and look up token / tycid / _userId. They can be read whether or not you
# are logged in.
# NOTE(review): these are session credentials committed to source; _userId is
# not referenced anywhere in the part of the file visible here.
tycid = "120f86f061bf11efa459c77b35d5a34d"
_userId = 1724468142018
token = "eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYwMDY4OTkyNCIsImlhdCI6MTcyNDQ2NjY3MywiZXhwIjoxNzI3MDU4NjczfQ.TijmbIePzHgjLesMdpoPW2C563-Fmgp3IfEKMwcRgnGXhVFnewvstDCm07WMK2th1s-NR4zClIHMowaf7eCtJw"

# Literals shared by both header sets below.
_SITE_ORIGIN = "https://www.tianyancha.com"
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
)
_AUTH_HEADERS = {
    "X-Auth-Token": token,
    "X-Tycid": tycid,
}

# Request headers for the HTML search page on www.tianyancha.com.
HEADERS_DATA = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Content-Type": "application/json",
    "Host": "www.tianyancha.com",
    "Origin": _SITE_ORIGIN,
    **_AUTH_HEADERS,
    "User-Agent": _BROWSER_USER_AGENT,
    "Connection": "close",
}

# Request headers for the JSON annual-report-list endpoint.
# NOTE(review): the two access-control-* entries are normally response
# headers; they are kept because the original sends them.
# A 'connection': 'keep-alive' entry was tried previously and is disabled.
annualReportListHeader = {
    "access-control-allow-origin": _SITE_ORIGIN,
    "access-control-allow-credentials": "true",
    "content-type": "application/json",
    "Accept": "application/json, text/plain, */*",
    "Origin": _SITE_ORIGIN,
    "Referer": f"{_SITE_ORIGIN}/",
    "Version": "TYC-Web",
    "User-Agent": _BROWSER_USER_AGENT,
    **_AUTH_HEADERS,
    "Connection": "close",
}

## Proxy-based crawling. A paid proxy is needed: few free proxies connect
## successfully, which badly hurts crawl throughput.
proxy_count = 0  # counter the crawl loop uses to decide when to refresh
proxies = None  # current proxy mapping; None until proxy_request() is called


def proxy_request():
    """Build the proxy mapping for the tunnel proxy.

    The account name and password are the ones issued by the proxy platform
    (placeholders in this file) and the tunnel address is fixed, so no network
    call is made. An earlier variant that fetched the tunnel address from the
    provider's HTTP API at runtime has been disabled.

    Returns:
        dict with "http" and "https" keys, both pointing at the same
        ``http://user:password@host:port/`` proxy URL.
    """
    tunnel_host = 'x702.kdltps.com:15818'
    account = "t12******"  # user name issued by the proxy platform
    secret = "***"  # password issued by the proxy platform
    proxy_url = f"http://{account}:{secret}@{tunnel_host}/"
    # Both schemes are routed through the same HTTP tunnel endpoint.
    return dict.fromkeys(("http", "https"), proxy_url)

## Crawl configuration.
# Running total of recoverable failures; the crawl stops once it exceeds
# max_fail_count.
fail_count = 0
max_fail_count = 10
# Filing year being checked: an enterprise counts as "filed" when a matching
# report was released in this year.
spider_year = "2024"
# Reporting period looked for in that filing. The script previously hard-coded
# '2023' next to spider_year "2024"; deriving it keeps the two in step.
report_year = str(int(spider_year) - 1)
# Seconds before an HTTP request is abandoned instead of hanging forever.
request_timeout = 30
# Seconds to pause after a recoverable failure before trying again.
retry_delay = 5

# One pending row per iteration; LIMIT 1 avoids pulling every match to the client.
find_all_sql = (
    "SELECT * FROM enterprise WHERE status = 0 "
    "and (phone != '-' or  other_phone != '-') LIMIT 1"
)
# Parameterised so the report text cannot break (or inject into) the statement.
update_sql = "update enterprise set status=%s, annual_report=%s where credit_code=%s"

# Outcomes reported by _crawl_enterprise to the main loop.
_OUTCOME_DONE = "done"  # row updated and committed
_OUTCOME_RETRY = "retry"  # recoverable failure, row left untouched
_OUTCOME_REFRESH_PROXY = "refresh_proxy"  # site asked for login; rotate proxy
_OUTCOME_ABORT = "abort"  # search page rejected the request; stop crawling


def _crawl_enterprise(spider_data):
    """Look up one enterprise row and store its annual-report status.

    Searches the site by the row's credit code (column index 2), takes the
    first company hit, fetches its annual-report list and writes back:
    status 1 (not filed yet), 2 (filed), 3 (no data) or 500 (unexpected
    payload), together with the raw response text.

    Args:
        spider_data: row tuple fetched from the ``enterprise`` table.

    Returns:
        One of the ``_OUTCOME_*`` constants.

    Raises:
        Exception: network, JSON or database errors propagate to the caller.
    """
    credit_code = spider_data[2]
    url = "https://www.tianyancha.com/nsearch?key=%s" % credit_code
    print("搜索的企业Url:", url)
    # NOTE(review): verify=False disables TLS certificate verification, and the
    # `proxies` mapping maintained by the main loop is never passed to requests
    # (the proxy credentials in this file are placeholders) — confirm both are
    # intended.
    with requests.session() as session:  # close the session's connections
        resource = session.get(url=url, headers=HEADERS_DATA, verify=False,
                               timeout=request_timeout)
    if resource.status_code != 200:
        print("搜索的企业响应失败:", resource.status_code)
        return _OUTCOME_ABORT

    _soup = BeautifulSoup(resource.text, 'html.parser')
    next_data_tag = _soup.select_one("script#__NEXT_DATA__")
    if next_data_tag is None or not next_data_tag.string:
        # select_one returns None when the page has no such script tag.
        print("执行失败", "search page has no __NEXT_DATA__ script")
        return _OUTCOME_RETRY
    next_json_data = json.loads(next_data_tag.string)
    jsondata = JsonSearch(object=next_json_data, mode='j')
    companyList = jsondata.search_all_value(key='companyList')
    # Only the first company of the first result list is used.
    first_hits = companyList[0] if companyList else None
    company = next(iter(first_hits or ()), None)
    if company is None:
        # NOTE(review): a credit code with no search hit stays at status 0 and
        # will be picked again; consider giving it a dedicated status.
        return _OUTCOME_RETRY

    annualReportListUrl = f'https://capi.tianyancha.com/cloud-company-background/company/annualReportList?gid={company["id"]}&pageSize=20&pageNum=1'
    print("千眼查企业年报信息Url：", annualReportListUrl)
    resource = requests.get(url=annualReportListUrl, headers=annualReportListHeader,
                            verify=False, timeout=request_timeout)
    if resource.status_code != 200:
        print("获取年报错误", resource.text)
        return _OUTCOME_RETRY

    report_json_data = json.loads(resource.text)
    message = report_json_data.get('message')
    if message == "请登录以使用完整功能":
        return _OUTCOME_REFRESH_PROXY
    if message == "无数据":
        is_report = 3  # no data
    elif report_json_data.get('data'):
        is_report = 1  # not filed yet
        for report in report_json_data['data']:
            print("report::")
            released = str(report.get('releaseDate') or '')
            if report_year in str(report.get('reportYear') or '') and released.startswith(spider_year):
                is_report = 2  # already filed
                break
    else:
        is_report = 500  # unexpected payload

    # Stored text is str(dict), the same representation the script always wrote.
    cursor.execute(update_sql, (is_report, str(report_json_data), credit_code))
    # PyMySQL does not autocommit by default; without this the update is lost
    # when the connection closes.
    db.commit()
    wait_time = random.randint(1, 1)
    print("停留等待时间:", wait_time)
    time.sleep(wait_time)
    return _OUTCOME_DONE


while True:
    if fail_count > max_fail_count:
        print("失败了已经超过限制", fail_count)
        break
    try:
        cursor.execute(find_all_sql)
        spider_data = cursor.fetchone()
        if not spider_data:
            print("数据爬取完成")
            break
        # Refresh the proxy mapping on first use and after every 8 rows.
        if proxy_count >= 8 or proxies is None:
            proxies = proxy_request()
            proxy_count = 0
        proxy_count += 1
        if proxies is None:
            print("获取代理信息失败:", proxies)
            break
        outcome = _crawl_enterprise(spider_data)
    except Exception as e:  # top-level boundary: report, count, retry
        print("执行失败", e)
        outcome = _OUTCOME_REFRESH_PROXY

    if outcome == _OUTCOME_ABORT:
        break
    if outcome == _OUTCOME_DONE:
        continue
    if outcome == _OUTCOME_REFRESH_PROXY:
        proxies = proxy_request()
        proxy_count = 0
    # Recoverable failure: count it so the loop cannot spin forever on one
    # row, and back off before retrying.
    fail_count += 1
    time.sleep(retry_delay)