# NOTE: removed stray page banner ("代码拉取完成,页面将自动刷新") — a Gitee UI artifact, not part of the source code.
import os
import requests
from bs4 import BeautifulSoup
from pip._vendor import requests
import public_params
import popup_util
import utils
# Accumulated result rows (list of dicts); rebuilt at the start of each crawl in get_data().
table_list_result = list()
# Maximum number of pages to fetch; -1 means unlimited. Set from user input in get_data().
limit_page = -1
def get_data(sheet_title: str, input_value: int):
    """
    Crawl acceptance-result announcements page by page and export them to Excel.

    :param sheet_title: title of the Excel sheet the results are exported to
    :param input_value: maximum number of pages to fetch (-1 fetches all pages)
    :return: None (results are written to Excel via public_params.export_to_excel)
    """
    global table_list_result, limit_page
    url = 'https://www.ccgp-hubei.gov.cn:9040/quSer/search'
    params = {
        'queryInfo.type': 'syjg',
        'queryInfo.cgdw': '',
        'queryInfo.xmmc': '',
        'queryInfo.key': '',
        'queryInfo.city': '宜昌市',
        'queryInfo.qybm': 420582,
        'queryInfo.district': '当阳市',
        'queryInfo.begin': '',
        'queryInfo.end': '',
        'queryInfo.begin1': '',
        'queryInfo.end1': '',
        'queryInfo.pageNo': 0,
        'queryInfo.pageSize': 15,
        'queryInfo.pageTotle': 1
    }
    table_list_result = list()
    # Caller-imposed page limit (default -1 fetches everything).
    limit_page = input_value
    headers = public_params.get_headers(public_params.get_session())
    # BUGFIX: the loop previously compared pageNo against 'queryInfo.pageSize'
    # (a fixed 15), which capped every crawl at 15 pages and over-fetched small
    # result sets. 'queryInfo.pageTotle' is the total page count echoed back by
    # the server's hidden form fields and refreshed by get_https_data() on each
    # request, so it is the correct upper bound.
    while int(params['queryInfo.pageNo']) < int(params['queryInfo.pageTotle']) and (
            limit_page < 0 or int(params['queryInfo.pageNo']) < limit_page):
        params['queryInfo.pageNo'] += 1
        table_list = get_https_data(url, params, headers)
        table_list_result.extend(table_list)
    # Prepend the header row; the "(N)" suffixes appear to encode column widths
    # for the exporter — TODO confirm against public_params.export_to_excel.
    table_dic_first = dict()
    table_dic_first['index'] = '序号(100)'
    table_dic_first['title'] = '公告标题(1000)'
    table_dic_first['url'] = '验收结果公告详情地址(1200)'
    table_dic_first['time'] = '时间(260)'
    table_dic_first['cgxm'] = '采购项目(480)'
    table_dic_first['cgr'] = '采购人(320)'
    table_dic_first['ysrq'] = '验收日期(260)'
    table_dic_first['qy'] = '区域(260)'
    table_dic_first['introduction'] = '项目简介(2000)'
    table_list_result.insert(0, table_dic_first)
    # Export everything (header + rows) to Excel.
    public_params.export_to_excel(table_list_result, sheet_title)
def get_https_data(url: str, params: dict, headers: dict):
    """
    POST the search form once and parse a single page of announcement results.

    :param url: request URL
    :param params: form parameters; mutated in place — the server's hidden
                   paging fields (pageNo / pageSize / pageTotle) are copied back
                   into this dict so the caller can advance to the next page
    :param headers: request headers
    :return: list of dicts, one per announcement on this page
    """
    # NOTE(review): verify=False disables TLS certificate validation even
    # though a CA bundle is configured on the line above — the two settings
    # contradict each other; confirm whether verification can be re-enabled.
    os.environ['REQUESTS_CA_BUNDLE'] = "certifi/cacert.pem"
    response = requests.post(url, data=params, headers=headers, verify=False)
    response.encoding = 'utf-8'
    html_content = response.text
    # Parse with html5lib: more tolerant of malformed markup than html.parser.
    soup = BeautifulSoup(html_content, 'html5lib')
    # Total hit count: the number sits between CJK quotes (“…”) inside a
    # colored <font> tag in the results-state banner.
    div_tag = soup.find("div", class_="serach-page-state")
    font_tag = div_tag.find('font', attrs={"color": "#106AB2"})
    total_num = font_tag.get_text()
    total_num = int(utils.get_between_strings_regex(total_num, "“", "”"))
    # Copy the hidden paging fields back into params so pagination advances.
    form_tag = soup.find('form', id='noticeForm')
    # NOTE(review): recursive=False only matches <input> elements that are
    # DIRECT children of the form — confirm the page never nests them deeper.
    child_array = form_tag.findChildren('input', type='hidden', recursive=False)
    for child in child_array:
        if child.has_attr('name'):
            input_name = child['name']
            input_value = child['value']
            # The paging counters come back as strings; coerce them to int.
            if 'queryInfo.pageNo'.__eq__(input_name):
                input_value = int(input_value)
            elif 'queryInfo.pageSize'.__eq__(input_name):
                input_value = int(input_value)
            elif 'queryInfo.pageTotle'.__eq__(input_name):
                input_value = int(input_value)
            params[input_name] = input_value
    # The result list: one <li> per announcement.
    ul_tag = soup.find('ul', class_='serach-page-results list-unstyled')
    li_item_list = ul_tag.find_all('li', class_='serach-page-results-item')
    table_list = list()
    current_page_num = params['queryInfo.pageNo']
    page_size = params['queryInfo.pageSize']
    total_page = params['queryInfo.pageTotle']
    for item_index in range(len(li_item_list)):
        # One list entry.
        item = li_item_list[item_index]
        # Row dict collected for the Excel export.
        table_dict = dict()
        # Global 1-based row number across all pages.
        table_dict['index'] = (item_index + 1 + (current_page_num - 1) * page_size)
        rows = item.find_all('div', class_='row')
        for row in rows:
            # Announcement title and detail-page link.
            title_tag = row.find('div', class_='title ellipsis')
            if title_tag is not None:
                title = title_tag.find('a').get_text()
                url = title_tag.find('a').get('href')
                table_dict['title'] = title
                table_dict['url'] = url
            # Publication time.
            time_tag = row.find('div', class_='time')
            if time_tag is not None:
                time_text = time_tag.get_text()
                table_dict['time'] = time_text
            # Labelled columns (采购项目 / 采购人 / 验收日期 / 区域).
            cols_tag = row.find_all('div', class_='type-col')
            if cols_tag is not None:  # find_all returns a list, so this check is always true
                for col in cols_tag:
                    text = col.get_text()
                    # Strip newlines, tabs, and both ASCII and full-width spaces.
                    text = text.replace('\n', '').replace('\t', '').replace(' ', '').replace(' ', '')
                    split_text = ''
                    # The numeric offsets below are the label-prefix lengths
                    # (label characters plus the colon).
                    if '采购项目:' in text:
                        split_text = text[text.index('采购项目:') + 5:len(text)]
                        table_dict['cgxm'] = split_text
                    if '采购人:' in text:
                        split_text = text[text.index('采购人:') + 4:len(text)]
                        table_dict['cgr'] = split_text
                    if '验收日期:' in text:
                        split_text = text[text.index('验收日期:') + 5:len(text)]
                        table_dict['ysrq'] = split_text
                    if '区域:' in text:
                        split_text = text[text.index('区域:') + 3:len(text)]
                        table_dict['qy'] = split_text
            # Free-text project introduction.
            p_tag = row.find('p')
            if p_tag is not None:
                introduction = p_tag.get_text()
                table_dict['introduction'] = introduction
        table_list.append(table_dict)
        # Progress reporting: fraction of expected total rows processed so far.
        # NOTE(review): indentation was lost in this copy of the file — this
        # block is assumed to run once per parsed item (it reads the current
        # item's table_dict['index']); confirm against the original file.
        if limit_page > 0:
            if limit_page < total_page:
                # Capped crawl: progress relative to limit_page full pages.
                popup_util.update_progress(round(table_dict['index'] / (limit_page * page_size), 2))
            else:
                popup_util.update_progress(round(table_dict['index'] / total_num, 2))
        else:
            # Uncapped crawl: progress relative to the server-reported hit count.
            popup_util.update_progress(round(table_dict['index'] / total_num, 2))
    return table_list
# NOTE: removed trailing Gitee review-page boilerplate (content-moderation notice) — not part of the source code.