1 Star 1 Fork 2

魏泽桦/patent_system

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
patent_spider.py 5.34 KB
一键复制 编辑 原始数据 按行查看 历史
aimerin 提交于 2021-04-26 16:47 . 2021-04-26 简单实现了LDA模型
# -*- coding=utf8 -*-
# 从patenthub获取专利数据
# 数据内容包括基本信息以及权利要求书
import urllib
import requests
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
import json
###
###爬虫
###
# NOTE(review): hard-coded cookie string (it carries a U_TOKEN value) committed
# to source; it is sent with every request that uses the module-level `headers`.
cookie = 'T_ID=20210208000457JjHQfCoPjTOQCWccdl; source=b64:ZGlyZWN0CW51bGwJL3VzZXIvbG9naW4uanNvbgkxMjAuMjI5LjI1NC4y; pref="ds:cn,s:score!,dm:mix_10"; C_UER_ID=; l=1; U_TOKEN=aa00329ae54cb4eff6dc2cb086c9f46eb04ba3f7; s=SBg4FgJdfFMISnBTOisAEzIZMVcyLVpTGT9ELD4tJg4SHSIzNRw7PQJhaw4ZH09JHBobIAMFCSwBHnAIA1QpR3F+dwMDU1JeFRBPWUIRUBUFHxkOXA=='

# Default request headers shared by the fetch functions below.
headers = {
    'user-agent': 'Mozilla/5.0',
    'Referer': 'https://www.patenthub.cn/s?ds=cn&q=%E5%88%80%E5%89%AA',
    'cookie': cookie,
}

# Session object, used to keep cookies between requests.
s = requests.Session()
# Simulated login
def login():
    """Log in to patenthub through the shared session ``s``.

    Posts the account and password to the login endpoint using ``s`` so the
    session can keep whatever cookies the response sets.

    Returns:
        bool: True when the POST completed without a transport error or an
        HTTP error status, False otherwise.

    NOTE(review): an HTTP success status does not prove the credentials were
    accepted; the JSON body of the response is not inspected.
    """
    login_url = 'https://www.patenthub.cn/user/login.json'
    # Local headers on purpose: the login request does not send the
    # module-level cookie.
    headers = {'user-agent': 'Mozilla/5.0', 'Referer': 'https://www.patenthub.cn/s?ds=cn&q=%E5%88%80%E5%89%AA'}
    # Account name and password.
    # NOTE(review): credentials are hard-coded in source control; they should
    # be rotated and loaded from configuration or the environment instead.
    data = {
        'account': '19927538213',
        'password': 'zhuanlihui123456',
    }
    try:
        r = s.post(login_url, headers=headers, data=data)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "login failed"; anything
        # else (e.g. KeyboardInterrupt) propagates.
        print('模拟登录失败')
        return False
    # Reached only when the request succeeded, so the success message is no
    # longer printed after a failure.
    print('登录成功')
    return True
# Batch crawl
def batch_spider(pages=10, max_delay=5):
    """Crawl several consecutive search-result pages with ``spider``.

    Args:
        pages: number of pages to crawl; page indices run from 0 to
            ``pages - 1``. Defaults to 10.
        max_delay: upper bound, in seconds, of the random pause inserted
            between two consecutive page requests. Defaults to 5.
    """
    for i in range(pages):
        if i:
            # Pause only between requests: nothing to wait for before the
            # first page or after the last one.
            time.sleep(random.random() * max_delay)
        spider(i)
def spider(page):
    """Fetch one search-result page and extract the patent fields from it.

    Args:
        page: page index, inserted into the ``p`` query parameter of the URL.

    Returns:
        list: one row per matched ``<ul class="ui items">`` element, in the
        order ``[title, documentNumber, documentDate, applicationNumber,
        applicationDate, applicant, inventor, ipcs, summary]``. The first
        eight entries are comma-joined strings of every regex match found in
        the element; ``summary`` is a list of whitespace-separated tokens.
        An empty list is returned when the request fails.
    """
    URL = 'https://www.patenthub.cn/s?p=' + str(page) + '&ds=cn&q=%E5%88%80%E5%89%AA'
    print('爬取网页的URL' + URL)
    # Fetch the page.
    try:
        r = s.get(URL, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Stop here: there is no page content to parse after a failed request.
        print('spider dead')
        return []
    contents = r.text

    # The summary patterns are loop-invariant, so compile them once.
    summary_re = re.compile('<span data-property="summary">\n(.*?)</span>', re.DOTALL)
    translated_summary_re = re.compile(
        '<span data-cc="cn" data-property="summary" data-role="translation" data-target=".*?" data-trigger-container="p" data-type="摘要">\n(.*?)</span>',
        re.DOTALL)

    # Parse the page.
    soup = BeautifulSoup(contents, features='html.parser')
    items = soup.find_all('ul', class_='ui items')
    patentList = []
    for item in items:
        element = str(item)
        title = list2str(re.findall('<span data-property="title">(.*)</span>', element))
        documentNumber = list2str(re.findall('<span data-property="documentNumber">(.*)</span>', element))
        documentDate = list2str(re.findall('<span data-property="documentDate">(.*)</span>', element))
        applicationNumber = list2str(re.findall('<span data-property="applicationNumber">(.*)</span>', element))
        applicationDate = list2str(re.findall('<span data-property="applicationDate">(.*)</span>', element))
        applicant = list2str(re.findall('<span data-property="applicant">(.*)</span>', element))
        inventor = list2str(re.findall('<span data-property="inventor">(.*)</span>', element))
        ipcs = list2str(re.findall('<span class="ipc" data-property="ipc">(.*)</span>', element))
        summary = summary_re.findall(element)
        if not summary:
            # Fall back to the markup variant that carries translation attributes.
            summary = translated_summary_re.findall(element)
        summary = list2str(summary).split()
        patent = [title, documentNumber, documentDate, applicationNumber, applicationDate, applicant, inventor, ipcs,
                  summary]
        patentList.append(patent)
        claim = 'https://www.patenthub.cn/patent/' + documentNumber + '.html?ds=cn#/claims'
        print(claim)
    # The rows are returned instead of being saved here; the original
    # save2csv(patentList) call was already commented out.
    return patentList
def get_claim(documentNumber):
    """Download one hard-coded patenthub PDF-viewer URL into ``code3.pdf``.

    Args:
        documentNumber: patent document number.
            NOTE(review): currently unused. The URL below is fixed to
            CN103537956B and the output file name is fixed to ``code3.pdf``.
            An earlier, commented-out draft fetched
            ``https://www.patenthub.cn/patent/<documentNumber>/pdf`` through
            the session, extracted the viewer ``src`` from that page and
            wrote to ``<documentNumber>.pdf``.

    The file is written only when the request succeeds; on failure
    ``'spider dead'`` is printed and nothing is written.

    NOTE(review): the URL points at ``viewer.html``, so the saved bytes may be
    the viewer page rather than the PDF itself. Confirm against the site.
    """
    url = 'https://www.patenthub.cn/pdf_browse/web/viewer.html?file=/CN/docs/2016/01/06/1/CN103537956B-%E4%B8%80%E7%A7%8D%E5%88%80%E5%89%AA%E7%A3%A8%E5%BA%8A.pdf'
    try:
        # The timeout keeps an unresponsive server from blocking forever.
        r = requests.get(url, headers=headers, timeout=30)
        # Without this check an HTTP error page would be saved as the PDF.
        r.raise_for_status()
    except requests.RequestException:
        print('spider dead')
        return
    with open("code3.pdf", "wb") as code:
        code.write(r.content)
def save2csv(list, path=None):
    """Append patent rows to a CSV file.

    Args:
        list: sequence of rows. ``spider`` builds each row in the order
            title, documentNumber, documentDate, applicationNumber,
            applicationDate, applicant, inventor, ipcs, summary.
        path: destination CSV file. When omitted, the module-level name
            ``patent_csv_path`` is used, as before.

    Rows are appended (``mode='a'``) without a header line; the DataFrame
    index is written as the first column.
    """
    if path is None:
        # NOTE(review): patent_csv_path is not defined anywhere in the visible
        # file, so calling save2csv without `path` raises NameError unless it
        # is defined elsewhere.
        path = patent_csv_path
    dataFrame = pd.DataFrame(data=list)
    dataFrame.to_csv(path, encoding='UTF-8', mode='a', header=False)
def list2str(list):
    """Return the strings in ``list`` joined into one comma-separated string."""
    separator = ","
    return separator.join(list)
class Patent(object):
    """Container for the nine fields of one patent record.

    Each constructor argument is stored unchanged on the attribute of the
    same name.
    """

    def __init__(self, title, documentNumber, documentDate, applicationNumber, applicationDate, applicant, inventor,
                 ipcs, summary):
        # Title and document (publication) identifiers.
        self.title, self.documentNumber, self.documentDate = title, documentNumber, documentDate
        # Application identifiers.
        self.applicationNumber, self.applicationDate = applicationNumber, applicationDate
        # Parties.
        self.applicant, self.inventor = applicant, inventor
        # Classification codes and summary text.
        self.ipcs, self.summary = ipcs, summary
# Script entry point: call get_claim for one hard-coded document number.
if __name__ == '__main__':
    sample_document_number = 'CN103537956B'
    get_claim(sample_document_number)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/aimerin/patent_system.git
git@gitee.com:aimerin/patent_system.git
aimerin
patent_system
patent_system
master

搜索帮助