1 Star 0 Fork 2

ZSW/patent_system

forked from 魏泽桦/patent_system 
加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
patenthub_api.py 3.57 KB
一键复制 编辑 原始数据 按行查看 历史
aimerin 提交于 2021-04-26 16:47 . 2021-04-26 简单实现了LDA模型
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
import json
# Shared HTTP session reused by every API request made in this file.
s = requests.Session()
# Change the output file names every day: "patents" + date.
patent_csv_path = 'dataset/patents04-03-01.csv'
claim_path = 'dataset/claim04-03-01.csv'
# Keywords already searched: knives/scissors (刀剪), lens (透镜), abrasive tool (磨具),
# scissors (剪刀), hammer (锤子), mirror (镜子), pliers (钳子), clip (夹子),
# air pump (气筒), graphene (石墨烯).
keyword = "石墨烯"
# NOTE(review): API tokens are committed in plain text; consider loading them
# from the environment or an untracked config file. `token1` is not referenced
# by the code visible in this file; `token` is the one sent with each request.
token1='b29c9231c7c2737860e70af2e1be0e334bdf11aa'
token='06a605d92e3c2369d48532555ca165373113f5e4'
# Fetch patent search results from the patenthub API.
def get_from_api():
    """Download search results for ``keyword`` and append them to ``patent_csv_path``.

    Uses the module-level session ``s`` and the globals ``token``, ``keyword``
    and ``patent_csv_path``.  Up to 100 pages are requested (``p=1..100`` with
    ``ps=50``).  The loop stops early when the API answers with code 207 or
    when a response reports ``page == 1`` for the second time (presumably the
    API wraps around to the first page once the results are exhausted --
    confirm against the API documentation).

    Rows collected so far are always written to the CSV, even when the loop is
    cut short by an error.  Returns ``None``.
    """
    import os  # local import: only needed for the header check below

    columns = ['id', 'legalStatus', 'currentStatus', 'type', 'title', 'summary',
               'applicant', 'applicationDate', 'mainIpc', 'applicationNumber',
               'documentNumber', 'documentDate', 'inventor', 'applicantType',
               'ipc', 'cpc']
    patent_lists = []
    page = 0      # number of pages whose patents were collected
    iscircle = 0  # number of responses that reported page == 1
    try:
        for i in range(100):
            # Let requests build and percent-encode the query string so a
            # keyword containing '&', '#', spaces, etc. cannot corrupt the URL.
            r = s.get(
                'https://www.patenthub.cn/api/s',
                params={
                    'ds': 'cn',
                    't': token,
                    'q': keyword,
                    'p': i + 1,
                    'ps': 50,
                    'sort': 'relation',
                    'v': 1,
                },
                timeout=30,  # never hang forever on a stalled connection
            )
            print(r.text)
            data = json.loads(r.text)
            if data['code'] == 207:
                # 'page' may be an int; str() avoids a TypeError that used to
                # be misreported as an exhausted token.
                print('页数page=' + str(data.get('page')))
                break
            if data['page'] == 1:
                iscircle += 1
            if iscircle == 2:
                break
            page += 1
            for patent in data['patents']:
                # Missing fields are stored as empty strings.
                patent_lists.append([patent.get(column, "") for column in columns])
    except Exception as exc:
        # Best effort: keep whatever was collected. KeyboardInterrupt and
        # SystemExit are no longer swallowed, and the real cause is shown.
        print("今日该token次数已用尽")
        print(repr(exc))
    finally:
        print("page=" + str(page))
        # The file is opened in append mode, so only emit the column names
        # when the file is new/empty; otherwise a second run would insert a
        # header row in the middle of the data.
        write_header = (not os.path.exists(patent_csv_path)
                        or os.path.getsize(patent_csv_path) == 0)
        dataFrame = pd.DataFrame(data=patent_lists, columns=columns)
        dataFrame.to_csv(patent_csv_path, encoding='UTF-8', mode='a',
                         index=False, header=write_header)
# Fetch the claims text for every patent id stored in the patents CSV.
def get_claims_from_api():
    """Download claims for each id in ``patent_csv_path`` and append them to ``claim_path``.

    Uses the module-level session ``s`` and the globals ``token``,
    ``patent_csv_path`` and ``claim_path``.  Each distinct, non-empty ``id``
    is requested once, in file order.  The loop stops at the first response
    whose ``code`` is not 200.

    Rows collected so far are written to ``claim_path`` even if a request or
    JSON decode fails; the exception itself still propagates to the caller.
    Returns ``None``.
    """
    import os  # local import: only needed for the header check below

    # low_memory=False avoids pandas' mixed-dtype warning on large files.
    csv_data = pd.read_csv(patent_csv_path, low_memory=False)
    # Deduplicate while keeping a deterministic order; skip missing ids
    # instead of sending the literal string "nan" to the API.
    patent_ids = csv_data['id'].dropna().drop_duplicates().tolist()
    columns = ['id', 'applicationNumber', 'documentNumber', 'claims']
    patent_lists = []
    try:
        for patent_id in patent_ids:
            r = s.get(
                'https://www.patenthub.cn/api/patent/claims',
                params={'t': token, 'id': str(patent_id), 'v': 1},
                timeout=30,  # never hang forever on a stalled connection
            )
            print(r.text)
            data = json.loads(r.text)
            if data['code'] != 200:
                break
            if 'patent' in data:
                patent = data['patent']
                # Missing fields are stored as empty strings.
                patent_lists.append([patent.get(column, "") for column in columns])
    finally:
        # Append mode: write the column names only into a new/empty file so
        # repeated runs do not scatter header rows through the data.
        write_header = (not os.path.exists(claim_path)
                        or os.path.getsize(claim_path) == 0)
        dataFrame = pd.DataFrame(data=patent_lists, columns=columns)
        dataFrame.to_csv(claim_path, encoding='UTF-8', mode='a',
                         index=False, header=write_header)
# Report how many duplicate ids a CSV file contains.
def drop_duplicates(path):
    """Print ``<row count>--><distinct id count>`` for the CSV at ``path``.

    The file must have an ``id`` column.  Despite the name, the file is not
    modified: the statements that would actually drop the duplicates and
    rewrite the file are disabled below.  Returns ``None``.
    """
    # low_memory=False avoids pandas' mixed-dtype warning on large files.
    csv_df = pd.read_csv(path, low_memory=False)
    ids = csv_df['id']
    # len() returns ints; convert explicitly, since int + str raises TypeError.
    print(str(len(ids)) + "-->" + str(len(set(ids))))
    # Disabled de-duplication. NOTE(review): as written it would rewrite the
    # file without a header row (header=False) -- revisit before enabling.
    # csv_df.drop_duplicates(subset='id', keep='first', inplace=True)
    # csv_df.to_csv(path, encoding='UTF-8', mode='w', header=False,index=0)
# Script entry point: run the two download steps in order.
if __name__ == '__main__':
    # login()  (disabled; no login function is defined in the code shown here)
    # Step 1: download search results into patent_csv_path.
    get_from_api()
    # drop_duplicates(claim_path)  (disabled duplicate report)
    # Step 2: read the ids saved by step 1 and download their claims.
    get_claims_from_api()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/zhushangwu/patent_system.git
git@gitee.com:zhushangwu/patent_system.git
zhushangwu
patent_system
patent_system
master

搜索帮助