# patent_system/patent_spider.py

# -*- coding=utf8 -*-
# 从patenthub获取专利数据
# 数据内容包括基本信息以及权利要求书
import urllib
import requests
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import random
import json

###
### Crawler
###
# NOTE(review): this cookie string (it includes a U_TOKEN value) is committed
# in source; consider loading it from configuration or the environment.
cookie='T_ID=20210208000457JjHQfCoPjTOQCWccdl; source=b64:ZGlyZWN0CW51bGwJL3VzZXIvbG9naW4uanNvbgkxMjAuMjI5LjI1NC4y; pref="ds:cn,s:score!,dm:mix_10"; C_UER_ID=; l=1; U_TOKEN=aa00329ae54cb4eff6dc2cb086c9f46eb04ba3f7; s=SBg4FgJdfFMISnBTOisAEzIZMVcyLVpTGT9ELD4tJg4SHSIzNRw7PQJhaw4ZH09JHBobIAMFCSwBHnAIA1QpR3F+dwMDU1JeFRBPWUIRUBUFHxkOXA=='
# Request headers shared by the page-fetching functions below: a generic user
# agent, a search-page referer and the cookie string defined above.
headers = {'user-agent': 'Mozilla/5.0', 'Referer': 'https://www.patenthub.cn/s?ds=cn&q=%E5%88%80%E5%89%AA',
           'cookie': cookie}
# Create the Session object, used to keep cookies across requests
s = requests.Session()


# Simulated login
def login():
    """Log in to patenthub through the module-level session ``s``.

    POSTs the account and password to the login endpoint so that whatever
    cookies the server sets are kept on ``s`` for later requests.

    Prints '登录成功' when the request succeeds and '模拟登录失败' when it
    fails. A failure is reported but not raised, and the function returns
    None either way.
    """
    login_url = 'https://www.patenthub.cn/user/login.json'
    # These local headers shadow the module-level ``headers``; unlike those,
    # they carry no cookie.
    headers = {'user-agent': 'Mozilla/5.0', 'Referer': 'https://www.patenthub.cn/s?ds=cn&q=%E5%88%80%E5%89%AA'}
    # Account name and password.
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to configuration or environment variables.
    data = {
        'account': '19927538213',
        'password': 'zhuanlihui123456',
    }
    try:
        # A timeout keeps an unresponsive server from blocking forever.
        r = s.post(login_url, headers=headers, data=data, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        print('模拟登录失败')
        # Stop here so the success message is not printed after a failure.
        return
    # NOTE(review): success is judged from the HTTP status only; the JSON
    # body of the response is not inspected.
    print('登录成功')


# Batch crawl
def batch_spider(pages=10, max_delay=5):
    """Crawl consecutive result pages, pausing randomly between requests.

    Args:
        pages: number of pages to fetch; page indices 0 .. pages-1 are
            passed to ``spider``. Defaults to 10.
        max_delay: upper bound, in seconds, of the random pause inserted
            between two consecutive pages. Defaults to 5.
    """
    for i in range(pages):
        if i:
            # Pause only between pages: no delay before the first request
            # and none after the last one.
            time.sleep(random.random() * max_delay)
        spider(i)


def spider(page):
    """Fetch one search-result page and extract the patents listed on it.

    The page is requested through the module-level session ``s`` with the
    module-level ``headers``. For every extracted patent the URL of its
    claims page is printed.

    Args:
        page: page index, placed in the ``p`` query parameter of the URL.

    Returns:
        A list with one row per matched ``<ul class="ui items">`` element.
        Each row is ``[title, documentNumber, documentDate,
        applicationNumber, applicationDate, applicant, inventor, ipcs,
        summary]``. All fields are comma-joined strings except ``summary``,
        which is a list of whitespace-separated tokens. An empty list is
        returned when the request fails.
    """
    URL = 'https://www.patenthub.cn/s?p=' + str(page) + '&ds=cn&q=%E5%88%80%E5%89%AA'
    print('爬取网页的URL' + URL)
    # Fetch the page.
    try:
        # A timeout keeps an unresponsive server from blocking forever.
        r = s.get(URL, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        print('spider dead')
        # Without a response there is nothing to parse; continuing would
        # fail on the missing page content.
        return []

    # Parse the page.
    soup = BeautifulSoup(r.text, features='html.parser')
    items = soup.find_all('ul', class_='ui items')

    # Fields that all share the same '<span data-property="NAME">' markup,
    # in the order they appear in each output row.
    simple_fields = ('title', 'documentNumber', 'documentDate', 'applicationNumber',
                     'applicationDate', 'applicant', 'inventor')
    # Summary patterns are compiled once, outside the per-item loop.
    summary_re = re.compile('<span data-property="summary">\n(.*?)</span>', re.DOTALL)
    translated_summary_re = re.compile(
        '<span data-cc="cn" data-property="summary" data-role="translation" data-target=".*?" data-trigger-container="p" data-type="摘要">\n(.*?)</span>',
        re.DOTALL)

    patentList = []
    for item in items:
        element = str(item)
        # NOTE(review): '(.*)' is greedy, so if one line of markup holds
        # several '</span>' tags the capture runs up to the last one;
        # confirm against the real page markup.
        patent = [
            list2str(re.findall('<span data-property="%s">(.*)</span>' % name, element))
            for name in simple_fields
        ]
        patent.append(list2str(re.findall('<span class="ipc" data-property="ipc">(.*)</span>', element)))

        # Use the plain summary markup first; fall back to the translation
        # variant only when the plain one yields nothing.
        summary = summary_re.findall(element) or translated_summary_re.findall(element)
        # NOTE(review): str.split() turns the summary into a list of tokens;
        # confirm whether .strip() was intended instead.
        patent.append(list2str(summary).split())
        patentList.append(patent)

        documentNumber = patent[1]
        claim = 'https://www.patenthub.cn/patent/' + documentNumber + '.html?ds=cn#/claims'
        print(claim)

    # Persisting via save2csv(patentList) is currently disabled; the rows are
    # returned so the caller can decide what to do with them.
    return patentList

def get_claim(documentNumber):
    """Download a fixed PDF-viewer URL and save the response as ``code3.pdf``.

    The response body is written, unmodified, to ``code3.pdf`` in the current
    working directory. The request is made with ``requests.get`` and the
    module-level ``headers``, not through the session ``s``.

    Args:
        documentNumber: currently unused; the URL of one specific document
            is hard-coded below.
    """
    # An earlier attempt, now disabled, requested
    # 'https://www.patenthub.cn/patent/<documentNumber>/pdf', pulled the
    # viewer address out of that page with the pattern
    # 'src=(.*) allowfullscreen', and wrote the result to
    # '<documentNumber>.pdf'.
    # NOTE(review): the HTTP status is not checked, and the address points at
    # viewer.html, so the saved bytes may be the viewer page rather than the
    # PDF itself - confirm.
    viewer_url = 'https://www.patenthub.cn/pdf_browse/web/viewer.html?file=/CN/docs/2016/01/06/1/CN103537956B-%E4%B8%80%E7%A7%8D%E5%88%80%E5%89%AA%E7%A3%A8%E5%BA%8A.pdf'
    response = requests.get(viewer_url, headers=headers)
    with open("code3.pdf", "wb") as out_file:
        out_file.write(response.content)

def save2csv(list, path=None):
    """Append patent rows to a CSV file.

    Rows are appended (``mode='a'``) without a header line, encoded as UTF-8.
    The DataFrame's row index is written as the first column of every line
    (``to_csv`` default), so it restarts at 0 on each call.

    Args:
        list: rows to write, one sequence per patent. The expected column
            order is title, documentNumber, documentDate, applicationNumber,
            applicationDate, applicant, inventor, ipcs, summary. The name
            shadows the builtin but is kept so keyword callers keep working.
        path: destination CSV file. When omitted, the module-level
            ``patent_csv_path`` is used.
            NOTE(review): ``patent_csv_path`` is not defined in the visible
            part of this file; confirm it exists or always pass ``path``.
    """
    if path is None:
        path = patent_csv_path
    dataFrame = pd.DataFrame(data=list)
    dataFrame.to_csv(path, encoding='UTF-8', mode='a', header=False)


def list2str(list):
    """Join the given strings into one comma-separated string.

    An empty input yields an empty string.
    """
    separator = ","
    joined = separator.join(list)
    return joined


class Patent(object):
    """Plain record holding the fields of one patent.

    Every constructor argument is stored unchanged on the attribute of the
    same name.
    """

    def __init__(
        self,
        title,
        documentNumber,
        documentDate,
        applicationNumber,
        applicationDate,
        applicant,
        inventor,
        ipcs,
        summary
    ):
        # Title and publication data.
        self.title = title
        self.documentNumber = documentNumber
        self.documentDate = documentDate

        # Application data.
        self.applicationNumber = applicationNumber
        self.applicationDate = applicationDate

        # Parties involved.
        self.applicant = applicant
        self.inventor = inventor

        # Classification and abstract.
        self.ipcs = ipcs
        self.summary = summary

if __name__ == '__main__':
    # Script entry point: only the claim/PDF download is run here; login()
    # and batch_spider() are not invoked.
    get_claim('CN103537956B')