# NOTE: removed stray page banner ("代码拉取完成,页面将自动刷新") — a Gitee UI artifact, not part of the source code.
import os
import requests
from bs4 import BeautifulSoup
from pip._vendor import requests
import public_params
import popup_util
import utils
# Accumulated result rows (list of dicts); rebuilt at the start of each crawl in get_data().
table_list_result = list()
# Maximum number of pages to fetch; -1 means unlimited. Set from user input in get_data().
limit_page = -1
def get_data(sheet_title: str, input_value: int):
    """
    Crawl acceptance-result announcements page by page and export them to Excel.

    :param sheet_title: title of the Excel sheet the results are exported to
    :param input_value: maximum number of pages to fetch (-1 fetches all pages)
    :return: None (results are written to Excel via public_params.export_to_excel)
    """
    global table_list_result, limit_page
    url = 'https://www.ccgp-hubei.gov.cn:9040/quSer/search'
    params = {
        'queryInfo.type': 'syjg',
        'queryInfo.cgdw': '',
        'queryInfo.xmmc': '',
        'queryInfo.key': '',
        'queryInfo.city': '宜昌市',
        'queryInfo.qybm': 420582,
        'queryInfo.district': '当阳市',
        'queryInfo.begin': '',
        'queryInfo.end': '',
        'queryInfo.begin1': '',
        'queryInfo.end1': '',
        'queryInfo.pageNo': 0,
        'queryInfo.pageSize': 15,
        'queryInfo.pageTotle': 1
    }
    table_list_result = list()
    # Caller-imposed page limit (default -1 fetches everything).
    limit_page = input_value
    headers = public_params.get_headers(public_params.get_session())
    # BUGFIX: the loop previously compared pageNo against 'queryInfo.pageSize'
    # (a fixed 15), which capped every crawl at 15 pages and over-fetched small
    # result sets. 'queryInfo.pageTotle' is the total page count echoed back by
    # the server's hidden form fields and refreshed by get_https_data() on each
    # request, so it is the correct upper bound.
    while int(params['queryInfo.pageNo']) < int(params['queryInfo.pageTotle']) and (
            limit_page < 0 or int(params['queryInfo.pageNo']) < limit_page):
        params['queryInfo.pageNo'] += 1
        table_list = get_https_data(url, params, headers)
        table_list_result.extend(table_list)
    # Prepend the header row; the "(N)" suffixes appear to encode column widths
    # for the exporter — TODO confirm against public_params.export_to_excel.
    table_dic_first = dict()
    table_dic_first['index'] = '序号(100)'
    table_dic_first['title'] = '公告标题(1000)'
    table_dic_first['url'] = '验收结果公告详情地址(1200)'
    table_dic_first['time'] = '时间(260)'
    table_dic_first['cgxm'] = '采购项目(480)'
    table_dic_first['cgr'] = '采购人(320)'
    table_dic_first['ysrq'] = '验收日期(260)'
    table_dic_first['qy'] = '区域(260)'
    table_dic_first['introduction'] = '项目简介(2000)'
    table_list_result.insert(0, table_dic_first)
    # Export everything (header + rows) to Excel.
    public_params.export_to_excel(table_list_result, sheet_title)
def get_https_data(url: str, params: dict, headers: dict):
    """
    POST the search form once and parse a single page of announcement results.

    :param url: request URL
    :param params: form parameters; mutated in place — the server's hidden
                   paging fields (pageNo / pageSize / pageTotle) are copied back
                   into this dict so the caller can advance to the next page
    :param headers: request headers
    :return: list of dicts, one per announcement on this page
    """
    # NOTE(review): verify=False disables TLS certificate validation even
    # though a CA bundle is configured on the line above — the two settings
    # contradict each other; confirm whether verification can be re-enabled.
    os.environ['REQUESTS_CA_BUNDLE'] = "certifi/cacert.pem"
    response = requests.post(url, data=params, headers=headers, verify=False)
    response.encoding = 'utf-8'
    html_content = response.text
    # Parse with html5lib: more tolerant of malformed markup than html.parser.
    soup = BeautifulSoup(html_content, 'html5lib')
    # Total hit count: the number sits between CJK quotes (“…”) inside a
    # colored <font> tag in the results-state banner.
    div_tag = soup.find("div", class_="serach-page-state")
    font_tag = div_tag.find('font', attrs={"color": "#106AB2"})
    total_num = font_tag.get_text()
    total_num = int(utils.get_between_strings_regex(total_num, "“", "”"))
    # Copy the hidden paging fields back into params so pagination advances.
    form_tag = soup.find('form', id='noticeForm')
    # NOTE(review): recursive=False only matches <input> elements that are
    # DIRECT children of the form — confirm the page never nests them deeper.
    child_array = form_tag.findChildren('input', type='hidden', recursive=False)
    for child in child_array:
        if child.has_attr('name'):
            input_name = child['name']
            input_value = child['value']
            # The paging counters come back as strings; coerce them to int.
            if 'queryInfo.pageNo'.__eq__(input_name):
                input_value = int(input_value)
            elif 'queryInfo.pageSize'.__eq__(input_name):
                input_value = int(input_value)
            elif 'queryInfo.pageTotle'.__eq__(input_name):
                input_value = int(input_value)
            params[input_name] = input_value
    # The result list: one <li> per announcement.
    ul_tag = soup.find('ul', class_='serach-page-results list-unstyled')
    li_item_list = ul_tag.find_all('li', class_='serach-page-results-item')
    table_list = list()
    current_page_num = params['queryInfo.pageNo']
    page_size = params['queryInfo.pageSize']
    total_page = params['queryInfo.pageTotle']
    for item_index in range(len(li_item_list)):
        # One list entry.
        item = li_item_list[item_index]
        # Row dict collected for the Excel export.
        table_dict = dict()
        # Global 1-based row number across all pages.
        table_dict['index'] = (item_index + 1 + (current_page_num - 1) * page_size)
        rows = item.find_all('div', class_='row')
        for row in rows:
            # Announcement title and detail-page link.
            title_tag = row.find('div', class_='title ellipsis')
            if title_tag is not None:
                title = title_tag.find('a').get_text()
                url = title_tag.find('a').get('href')
                table_dict['title'] = title
                table_dict['url'] = url
            # Publication time.
            time_tag = row.find('div', class_='time')
            if time_tag is not None:
                time_text = time_tag.get_text()
                table_dict['time'] = time_text
            # Labelled columns (采购项目 / 采购人 / 验收日期 / 区域).
            cols_tag = row.find_all('div', class_='type-col')
            if cols_tag is not None:  # find_all returns a list, so this check is always true
                for col in cols_tag:
                    text = col.get_text()
                    # Strip newlines, tabs, and both ASCII and full-width spaces.
                    text = text.replace('\n', '').replace('\t', '').replace(' ', '').replace(' ', '')
                    split_text = ''
                    # The numeric offsets below are the label-prefix lengths
                    # (label characters plus the colon).
                    if '采购项目:' in text:
                        split_text = text[text.index('采购项目:') + 5:len(text)]
                        table_dict['cgxm'] = split_text
                    if '采购人:' in text:
                        split_text = text[text.index('采购人:') + 4:len(text)]
                        table_dict['cgr'] = split_text
                    if '验收日期:' in text:
                        split_text = text[text.index('验收日期:') + 5:len(text)]
                        table_dict['ysrq'] = split_text
                    if '区域:' in text:
                        split_text = text[text.index('区域:') + 3:len(text)]
                        table_dict['qy'] = split_text
            # Free-text project introduction.
            p_tag = row.find('p')
            if p_tag is not None:
                introduction = p_tag.get_text()
                table_dict['introduction'] = introduction
        table_list.append(table_dict)
        # Progress reporting: fraction of expected total rows processed so far.
        # NOTE(review): indentation was lost in this copy of the file — this
        # block is assumed to run once per parsed item (it reads the current
        # item's table_dict['index']); confirm against the original file.
        if limit_page > 0:
            if limit_page < total_page:
                # Capped crawl: progress relative to limit_page full pages.
                popup_util.update_progress(round(table_dict['index'] / (limit_page * page_size), 2))
            else:
                popup_util.update_progress(round(table_dict['index'] / total_num, 2))
        else:
            # Uncapped crawl: progress relative to the server-reported hit count.
            popup_util.update_progress(round(table_dict['index'] / total_num, 2))
    return table_list
# NOTE: removed trailing Gitee review-page boilerplate (content-moderation notice) — not part of the source code.