1 Star 0 Fork 0

辣姜/天眼查企业年报

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
spider_enterprise_report.py 6.84 KB
一键复制 编辑 原始数据 按行查看 历史
辣姜 提交于 2024-11-11 23:59 . [update]项目上传
import json
import time
import requests
from bs4 import BeautifulSoup
from jsonsearch import JsonSearch
import pymysql
import random
# Connection parameters for the MySQL database used by this script.
_DB_SETTINGS = {
    "host": "localhost",   # MySQL server address
    "user": "root",        # user name
    "password": "root",    # password
    "database": "robot",   # database (schema) name
}

# Open the database connection.
db = pymysql.connect(**_DB_SETTINGS)

# Cursor object used to execute the SQL statements below.
cursor = db.cursor()
# Open the Tianyancha website and use the browser dev tools (F12) to look up
# token / tycid / _userId; they can be obtained whether or not you are logged in.
token = (
    "eyJhbGciOiJIUzUxMiJ9"
    ".eyJzdWIiOiIxNzYwMDY4OTkyNCIsImlhdCI6MTcyNDQ2NjY3MywiZXhwIjoxNzI3MDU4NjczfQ"
    ".TijmbIePzHgjLesMdpoPW2C563-Fmgp3IfEKMwcRgnGXhVFnewvstDCm07WMK2th1s-NR4zClIHMowaf7eCtJw"
)
tycid = "120f86f061bf11efa459c77b35d5a34d"
_userId = 1724468142018

# Literals shared by both header sets.
_SITE_ORIGIN = "https://www.tianyancha.com"
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
)

# Request headers for the company search page on www.tianyancha.com.
HEADERS_DATA = dict([
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"),
    ("Accept-Encoding", "gzip, deflate, br, zstd"),
    ("Accept-Language", "zh-CN,zh;q=0.9"),
    ("Content-Type", "application/json"),
    ("Host", "www.tianyancha.com"),
    ("Origin", _SITE_ORIGIN),
    ("X-Auth-Token", token),
    ("X-Tycid", tycid),
    ("User-Agent", _USER_AGENT),
    ("Connection", "close"),
])

# Request headers for the annual-report list API.
annualReportListHeader = dict([
    ("access-control-allow-origin", _SITE_ORIGIN),
    ("access-control-allow-credentials", "true"),
    ("content-type", "application/json"),
    ("Accept", "application/json, text/plain, */*"),
    ("Origin", _SITE_ORIGIN),
    ("Referer", _SITE_ORIGIN + "/"),
    ("Version", "TYC-Web"),
    ("User-Agent", _USER_AGENT),
    ("X-Auth-Token", token),
    ("X-Tycid", tycid),
    ("Connection", "close"),
])
# Proxy-based access: a paid proxy is required; very few free proxies connect
# successfully, which severely hurts crawl efficiency.
proxy_count = 0  # incremented once per crawl-loop iteration; reset when the proxy settings are rebuilt
proxies = None   # proxy mapping currently held by the crawl loop; None until first requested


def proxy_request(username="t12******", password="***", tunnel='x702.kdltps.com:15818'):
    """Build the ``requests``-style proxy mapping for the tunnel proxy.

    An earlier variant of this function fetched the tunnel address from the
    provider's ``gettps`` HTTP API instead of using a fixed tunnel; that code
    (and the API secret it embedded) has been removed.

    Args:
        username: User name issued by the proxy provider.
        password: Password issued by the proxy provider.
        tunnel: ``host:port`` of the tunnel proxy endpoint.

    Returns:
        A dict with ``"http"`` and ``"https"`` keys, both pointing at the same
        ``http://user:password@host:port/`` proxy URL.
    """
    # Local import: the file's import block does not bring in urllib.parse.
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot corrupt the user-info part of the proxy URL.
    user = quote(username, safe="")
    pwd = quote(password, safe="")
    proxy_url = f"http://{user}:{pwd}@{tunnel}/"
    return {"http": proxy_url, "https": proxy_url}
# The crawl stops once more than max_fail_count failed attempts have accumulated.
fail_count = 0
max_fail_count = 10
# Year in which the annual report must have been released to count as "done".
spider_year = "2024"
# Reporting year that the annual report must cover to count as "done".
report_year = "2023"


def classify_annual_report(report_json_data, release_year, fiscal_year):
    """Map an annual-report API payload to the status code stored in the DB.

    Args:
        report_json_data: Decoded JSON body of the annual-report list API.
        release_year: A report only counts when its ``releaseDate`` starts
            with this string.
        fiscal_year: A report only counts when its ``reportYear`` contains
            this string.

    Returns:
        3   -- the API answered "no data" (``message == "无数据"``);
        2   -- a matching report exists (already filed);
        1   -- reports exist but none matches (not filed yet);
        500 -- the payload has no report list and is not a "no data" answer.
    """
    if report_json_data.get('message') == "无数据":
        return 3
    reports = report_json_data.get('data')
    if not reports:
        return 500
    for report in reports:
        print("report::")
        # Coerce to str so a numeric year or a null date cannot raise here.
        year_text = str(report.get('reportYear') or '')
        released = str(report.get('releaseDate') or '')
        if fiscal_year in year_text and released.startswith(release_year):
            return 2
    return 1


def main():
    """Crawl annual-report status for every pending row of ``enterprise``.

    Each iteration takes one row with ``status = 0``, searches Tianyancha by
    the row's credit code, queries the annual-report list of the first search
    hit, and writes the resulting status plus the raw payload back to the row.
    The loop ends when no pending row is left, when the search page answers
    with a non-200 status, when too many failures have accumulated, or when
    an unexpected exception occurs.
    """
    global fail_count, proxy_count, proxies

    find_all_sql = "SELECT * FROM enterprise WHERE status = 0 and (phone != '-' or other_phone != '-')"
    # Placeholders let the driver do the quoting; the payload text may contain
    # any mix of quote characters.
    update_sql = "update enterprise set status=%s,annual_report=%s where credit_code=%s"

    while True:
        try:
            cursor.execute(find_all_sql)
            spider_data = cursor.fetchone()
            if fail_count > max_fail_count:
                print("失败了已经超过限制", fail_count)
                break

            # Rebuild the proxy settings every 8 iterations.
            # NOTE(review): `proxies` is maintained here but is not passed to
            # the HTTP calls below; add `proxies=proxies` to them once real
            # proxy credentials are configured.
            if proxy_count >= 8 or proxies is None:
                proxies = proxy_request()
                proxy_count = 0
            proxy_count += 1
            if proxies is None:
                print("获取代理信息失败:", proxies)
                break

            if not spider_data:
                print("数据爬取完成")
                break

            credit_code = spider_data[2]
            url = "https://www.tianyancha.com/nsearch?key=%s" % credit_code
            print("搜索的企业Url:", url)
            # NOTE(review): verify=False turns off TLS certificate checking.
            with requests.session() as session:
                resource = session.get(url=url, headers=HEADERS_DATA, verify=False)
            if resource.status_code != 200:
                print("搜索的企业响应失败:", resource.status_code)
                break

            # The search result is embedded in the page as JSON inside the
            # <script id="__NEXT_DATA__"> element.
            _soup = BeautifulSoup(resource.text, 'html.parser')
            next_data = _soup.select_one("script#__NEXT_DATA__").string
            next_json_data = json.loads(next_data)
            jsondata = JsonSearch(object=next_json_data, mode='j')
            companyList = jsondata.search_all_value(key='companyList')
            if not companyList or not companyList[0]:
                # No search hit: count it as a failure and try again.
                fail_count += 1
                continue

            # Only the first search hit is used.
            company = companyList[0][0]
            annualReportListUrl = f'https://capi.tianyancha.com/cloud-company-background/company/annualReportList?gid={company["id"]}&pageSize=20&pageNum=1'
            print("千眼查企业年报信息Url:", annualReportListUrl)
            resource = requests.get(url=annualReportListUrl, headers=annualReportListHeader, verify=False)
            if resource.status_code != 200:
                print("获取年报错误", resource.text)
                # Counted as a failure so a persistent error cannot spin forever.
                fail_count += 1
                continue

            report_json_data = json.loads(resource.text)
            if report_json_data.get('message') == "请登录以使用完整功能":
                # The API demands a login: rebuild the proxy settings and retry
                # the same row, bounded by max_fail_count.
                proxies = proxy_request()
                proxy_count = 1
                fail_count += 1
                continue

            is_report = classify_annual_report(report_json_data, spider_year, report_year)
            # The payload is stored as its Python repr, as before.
            cursor.execute(update_sql, (is_report, str(report_json_data), credit_code))
            # PyMySQL does not autocommit by default; persist each row at once.
            db.commit()

            wait_time = random.randint(1, 1)
            print("停留等待时间:", wait_time)
            time.sleep(wait_time)
        except Exception as e:
            print("执行失败", e)
            proxies = proxy_request()
            proxy_count = 1
            time.sleep(5)
            # NOTE(review): the loop stops on any unexpected error, as in the
            # original; replace `break` with a bounded retry if that is wanted.
            break


if __name__ == "__main__":
    try:
        main()
    finally:
        # Release the database resources however the crawl ended.
        cursor.close()
        db.close()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/qTp/tianyancha.git
git@gitee.com:qTp/tianyancha.git
qTp
tianyancha
天眼查企业年报
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385