master

分支 (1)

管理

管理

master

python_acquisition
/
xly.py

import pymysql
from selenium import webdriver
import re
import hashlib
import datetime
from multiprocessing.dummy import Pool
import time
from random import randint


class XLY(object):
    def __init__(self):
        self.host = '127.0.0.1'
        self.db = 'app_mark'
        self.user = 'root'
        self.passwd = '123456'
        self.charset = 'utf8mb4'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }

    def get_links(self):
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'select link from gly where tag="0" and sitename = "国家食品药品监督管理总局"'
        try:
            cur.execute(sql)
            results = cur.fetchall()
        except Exception as e:
            con.rollback()
            results = None
            print('error~ ', e)
        else:
            con.commit()
        cur.close()
        con.close()

        return results

    def parse_data(self, link):
        # 有的driver不能及时关闭，设置随机延迟降低内存消耗速度
        # 使用driver.quit内存占用 < 2G
        time.sleep(randint(1, 5))
        # 直接拿源码拿不到，用selenium
        opt = webdriver.ChromeOptions()
        opt.add_argument('--headless')
        opt.add_argument('--no-sandbox')
        opt.add_argument('--disable-gpu')
        opt.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(chrome_options=opt)
        driver.get(link)
        text = driver.page_source
        # driver.close()
        driver.quit()  # 使用quit可以很好地降低内存占用率
        # print(text)
        # 下载文件链接
        # http: // samr.cfda.gov.cn / directory / web / WS01 / images / localgov / gov_1553202201616.xlsx
        # href = "/directory/web/WS01/images/localgov/gov_1553202201616.xlsx"
        # 这里最坑的就是拿下来的源码和从浏览器里看的源码不一样，总是多空格，导致正则匹配不到！！！！
        file_urls = re.findall('href="(/directory/web/WS01/.*?)"', text, re.S)

        for file_url in file_urls:
            # 对所有附件的链接格式化
            file_url = 'http://samr.cfda.gov.cn{}'.format(file_url)
            print(file_url)
            self.save_file_url(file_url)

    def save_file_url(self, url):
        hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
        lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        list_data = [url, hkey, lasttime]
        con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
        cur = con.cursor()
        sql = 'insert into xly (IR_URLNAME, IR_HKEY, IR_LASTTIME) values (%s, %s, %s)'
        try:
            cur.execute(sql, list_data)
            print('insert success')
        except Exception as e:
            con.rollback()
            print('error~', e)
        else:
            con.commit()
        cur.close()
        con.close()


if __name__ == '__main__':
    crawler = XLY()
    rows = crawler.get_links()
    if rows:
        # Keep only the first column (the article link) of every row.
        # One link is one article, and an article may carry several
        # attachments; parse_data handles all of them for a given link.
        article_links = [row[0] for row in rows]
        # multiprocessing.dummy provides a thread-backed pool: up to ten
        # article pages are processed concurrently.
        workers = Pool(10)
        workers.map(crawler.parse_data, article_links)
        workers.close()
        workers.join()