代码拉取完成,页面将自动刷新
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
import json
# Shared HTTP session used for the API requests made in this file.
s = requests.Session()
# Change the file names every day: patents + date.
patent_csv_path = 'dataset/patents04-03-01.csv'
claim_path = 'dataset/claim04-03-01.csv'
# Keywords already searched (English glosses of the Chinese terms):
# knives and scissors, lens, abrasive tool, scissors, hammer, mirror,
# pliers, clip, air pump, graphene.
keyword = "石墨烯"
# NOTE(review): API tokens are hard-coded in source; consider loading them
# from the environment or an untracked config file instead.
# token1 is not referenced by the code visible in this file -- presumably a
# spare token to swap in when the other one's quota runs out; confirm.
token1='b29c9231c7c2737860e70af2e1be0e334bdf11aa'
# Token sent as the `t` query parameter of every API request.
token='06a605d92e3c2369d48532555ca165373113f5e4'
# Fetch patent records from the patenthub search API.
def get_from_api():
    """Download search results for ``keyword`` and append them to ``patent_csv_path``.

    Requests up to 100 result pages (50 patents per page) from the patenthub
    search endpoint. For every patent it keeps the fields named in
    ``columns``; a field missing from the response becomes an empty string.

    Paging stops early when the response ``code`` is 207, or when page 1 is
    returned for the second time (presumably the API wraps around to the
    first page once the last page has been passed -- confirm against the API
    documentation).

    A header row followed by the collected rows is appended to
    ``patent_csv_path``. The write happens in ``finally`` so that pages
    already fetched are kept even if a later request fails or the run is
    interrupted.
    """
    columns = ['id', 'legalStatus', 'currentStatus', 'type', 'title', 'summary',
               'applicant', 'applicationDate', 'mainIpc', 'applicationNumber',
               'documentNumber', 'documentDate', 'inventor', 'applicantType',
               'ipc', 'cpc']
    patent_lists = []
    page = 0             # number of result pages actually processed
    first_page_hits = 0  # how many times the API has answered with page 1
    try:
        for i in range(100):
            # Let requests build and encode the query string instead of
            # concatenating the raw keyword into the URL.
            query = {'ds': 'cn', 't': token, 'q': keyword, 'p': i + 1,
                     'ps': 50, 'sort': 'relation', 'v': 1}
            # A timeout keeps a stalled connection from hanging the run.
            r = s.get('https://www.patenthub.cn/api/s', params=query,
                      timeout=30)
            print(r.text)
            data = json.loads(r.text)
            if data['code'] == 207:
                # str() is required: concatenating a numeric page to a
                # string raised TypeError, which was then misreported as an
                # exhausted token.
                print('页数page=' + str(data.get('page')))
                break
            if data['page'] == 1:
                first_page_hits += 1
            if first_page_hits == 2:
                # Seeing page 1 again means the results started repeating.
                break
            page += 1
            for patent in data['patents']:
                patent_lists.append(
                    [patent.get(column, "") for column in columns])
    except Exception as exc:
        # Catch Exception rather than BaseException so Ctrl-C still stops
        # the program. The original message assumes the daily quota ran
        # out; print the real cause too so other failures are visible.
        print("今日该token次数已用尽")
        print(repr(exc))
    finally:
        print("page=" + str(page))
        # The header is written as an ordinary first row (header=False),
        # matching the layout the claims step reads back.
        rows = [columns] + patent_lists
        dataFrame = pd.DataFrame(data=rows)
        dataFrame.to_csv(patent_csv_path, encoding='UTF-8', mode='a',
                         index=0, header=False)
# Fetch the claims text of each collected patent from the patenthub API.
def get_claims_from_api():
    """Fetch claims for every patent id in ``patent_csv_path`` and append them to ``claim_path``.

    Reads the ``id`` column of the patent CSV, removes repeated ids while
    keeping their order in the file, and requests the claims endpoint once
    per id. From each response's ``patent`` object it keeps the fields named
    in ``columns``; a missing field becomes an empty string.

    The loop stops at the first response whose ``code`` is not 200, and
    likewise at the first request or JSON-decoding failure. In every case
    the rows gathered so far are still written: a header row followed by the
    collected rows is appended to ``claim_path``.
    """
    # low_memory=False avoids a pandas warning (per the original comment).
    csv_df = pd.read_csv(patent_csv_path, low_memory=False)
    # dict.fromkeys de-duplicates while preserving file order, so requests
    # and output rows come out in a reproducible order (a set does not
    # guarantee one). It also avoids shadowing the builtin ``list``.
    patent_ids = dict.fromkeys(csv_df['id'].tolist())
    columns = ['id', 'applicationNumber', 'documentNumber', 'claims']
    patent_lists = []
    for patent_id in patent_ids:
        try:
            # A timeout keeps a stalled connection from hanging the run.
            r = s.get('https://www.patenthub.cn/api/patent/claims',
                      params={'t': token, 'id': str(patent_id), 'v': 1},
                      timeout=30)
            print(r.text)
            data = json.loads(r.text)
        except (requests.RequestException, ValueError) as exc:
            # Handle a failed request like a non-200 code: stop and keep
            # what was collected instead of crashing and losing it.
            print(repr(exc))
            break
        if data.get('code') != 200:
            break
        if 'patent' in data:
            patent = data['patent']
            patent_lists.append(
                [patent.get(column, "") for column in columns])
    # The header is written as an ordinary first row (header=False).
    rows = [columns] + patent_lists
    dataFrame = pd.DataFrame(data=rows)
    dataFrame.to_csv(claim_path, encoding='UTF-8', mode='a', index=0,
                     header=False)
# Report duplicate ids in a CSV file.
def drop_duplicates(path):
    """Print how many rows and how many distinct ``id`` values the CSV at *path* has.

    The output has the form ``<row count>--><distinct id count>``. Despite
    its name, the function only reports; it does not modify the file (the
    code that would rewrite it is disabled below).

    Args:
        path: Path of a CSV file whose header contains an ``id`` column.
    """
    # low_memory=False avoids a pandas warning (per the original comment).
    csv_df = pd.read_csv(path, low_memory=False)
    ids = csv_df['id']
    # Both counts are ints and must be converted before concatenation; the
    # original int + str expression always raised TypeError.
    print(str(len(ids)) + "-->" + str(len(set(ids))))
    # Disabled: actually drop the duplicate rows and write the file back.
    # csv_df.drop_duplicates(subset='id', keep='first', inplace=True)
    # csv_df.to_csv(path, encoding='UTF-8', mode='w', header=False,index=0)
if __name__ == '__main__':
    # Run the two stages in order: the claims stage reads the CSV that the
    # search stage writes. (Calls to login() and drop_duplicates(claim_path)
    # were disabled in the original and remain so.)
    for stage in (get_from_api, get_claims_from_api):
        stage()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。