5 Star 83 Fork 69

简笔轩客/Cov

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
spider.py 6.62 KB
一键复制 编辑 原始数据 按行查看 历史
简笔轩客 提交于 2020-04-04 17:22 . 第一次提交,功能基本完善
import sys
import time
import pymysql
import json
import traceback
import requests
from selenium.webdriver import Chrome,ChromeOptions
def get_tencent_data():
url1 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
r1 = requests.get(url1, headers)
r2 = requests.get(url2, headers)
res1 = json.loads(r1.text)
res2 = json.loads(r2.text)
data_all1 = json.loads(res1["data"])
data_all2 = json.loads(res2["data"])
history = {}
for i in data_all2["chinaDayList"]:
ds = "2020." + i["date"]
tup = time.strptime(ds, "%Y.%m.%d") # 匹配时间
ds = time.strftime("%Y-%m-%d", tup) # 改变时间格式
confirm = i["confirm"]
suspect = i["suspect"]
heal = i["heal"]
dead = i["dead"]
history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}
for i in data_all2["chinaDayAddList"]:
ds = "2020." + i["date"]
tup = time.strptime(ds, "%Y.%m.%d") # 匹配时间
ds = time.strftime("%Y-%m-%d", tup) # 改变时间格式
confirm = i["confirm"]
suspect = i["suspect"]
heal = i["heal"]
dead = i["dead"]
history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead})
details = []
update_time = data_all1["lastUpdateTime"]
data_country = data_all1["areaTree"]
data_province = data_country[0]["children"]
for pro_infos in data_province:
province = pro_infos["name"]
for city_infos in pro_infos["children"]:
city = city_infos["name"]
confirm = city_infos["total"]["confirm"]
confirm_add = city_infos["today"]["confirm"]
heal = city_infos["total"]["heal"]
dead = city_infos["total"]["dead"]
details.append([update_time, province, city, confirm, confirm_add, heal, dead])
return history, details
def get_conn():
# 建立连接
conn = pymysql.connect(host="###", user="###", password="###", db="cov", charset="utf8")
# c创建游标
cursor = conn.cursor()
return conn, cursor
def close_conn(conn, cursor):
if cursor:
cursor.close()
if conn:
conn.close()
#定义更新细节函数
def update_details():
cursor = None
conn = None
try:
li = get_tencent_data()[1]#1代表最新数据
conn,cursor = get_conn()
sql = "insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)"
sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
#对比当前最大时间戳
cursor.execute(sql_query,li[0][0])
if not cursor.fetchone()[0]:
print(f"{time.asctime()}开始更新数据")
for item in li:
cursor.execute(sql,item)
conn.commit()
print(f"{time.asctime()}更新到最新数据")
else:
print(f"{time.asctime()}已是最新数据!")
except:
traceback.print_exc()
finally:
close_conn(conn,cursor)
#插入历史数据
def insert_history():
cursor = None
conn = None
try:
dic = get_tencent_data()[0]#0代表历史数据字典
print(f"{time.asctime()}开始插入历史数据")
conn,cursor = get_conn()
sql = "insert into history values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
for k,v in dic.items():
cursor.execute(sql,[k, v.get("confirm"),v.get("confirm_add"),v.get("suspect"),
v.get("suspect_add"),v.get("heal"),v.get("heal_add"),
v.get("dead"),v.get("dead_add")])
conn.commit()
print(f"{time.asctime()}插入历史数据完毕")
except:
traceback.print_exc()
finally:
close_conn(conn,cursor)
#更新历史数据
def update_history():
cursor = None
conn = None
try:
dic = get_tencent_data()[0]#0代表历史数据字典
print(f"{time.asctime()}开始更新历史数据")
conn,cursor = get_conn()
sql = "insert into history values (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
sql_query = "select confirm from history where ds=%s"
for k,v in dic.items():
if not cursor.execute(sql_query,k):
cursor.execute(sql,[k, v.get("confirm"),v.get("confirm_add"),v.get("suspect"),
v.get("suspect_add"),v.get("heal"),v.get("heal_add"),
v.get("dead"),v.get("dead_add")])
conn.commit()
print(f"{time.asctime()}历史数据更新完毕")
except:
traceback.print_exc()
finally:
close_conn(conn,cursor)
#爬取百度热搜数据
def get_baidu_hot():
option = ChromeOptions()
option.add_argument("--headless")#隐藏游览器
option.add_argument("--no--sandbox")
browser = Chrome(options = option,executable_path="chromedriver-dev.exe")
url = "https://voice.baidu.com/act/virussearch/virussearch?from=osari_map&tab=0&infomore=1"
browser.get(url)
#print(brower.page_source)
but = browser.find_element_by_css_selector('#ptab-0 > div > div.VirusHot_1-5-3_32AY4F.VirusHot_1-5-3_2RnRvg > section > div')
#点击加载更多
but.click()
time.sleep(1)
#爬虫与反爬,模拟人等待1秒
c = browser.find_elements_by_xpath('//*[@id="ptab-0"]/div/div[2]/section/a/div/span[2]')
context = [i.text for i in c]
browser.close()
return context
def update_hotsearch():
cursor = None
conn = None
try:
context = get_baidu_hot()
print(f"{time.asctime()}开始更新数据")
conn,cursor = get_conn()
sql = "insert into hotsearch(dt,content) values(%s,%s)"
ts = time.strftime("%Y-%m-%d %X")
for i in context:
cursor.execute(sql,(ts,i))
conn.commit()
print(f"{time.asctime()}数据更新完毕")
except:
traceback.print_exc()
finally:
close_conn(conn,cursor)
if __name__ == "__main__":
l = len(sys.argv)
if l == 1:
s = """
请输入参数
参数说明,
up_his 更新历史记录表
up_hot 更新实时热搜
up_det 更新详细表
"""
print(s)
else:
order = sys.argv[1]
if order == "up_his":
update_history()
elif order == "up_det":
update_details()
elif order == "up_hot":
update_hotsearch()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/yzcy/Cov.git
git@gitee.com:yzcy/Cov.git
yzcy
Cov
Cov
master

搜索帮助

D67c1975 1850385 1daf7b77 1850385