1 Star 0 Fork 0

吴超/huamei

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
not_jk_fk.py 3.30 KB
一键复制 编辑 原始数据 按行查看 历史
wmc 提交于 2019-10-20 22:27 . first commit
from MongoDbHandler import MongoDbHandler
from lxml import etree
import requests
import time
# Shared MongoDB handle; Baidu.save_mongodb writes results through it and
# get_kw reads the keyword list through it.
# NOTE(review): the connection arguments are hard-coded — presumably
# (host, user, password); confirm against MongoDbHandler.
mongoSession = MongoDbHandler('127.0.0.1', 'admin', 'admin')
class Baidu:
    """Track where the brand ranks in Baidu desktop search results.

    For each keyword the first ``PAGES`` result pages are fetched, the
    ``<h3>`` headings are scanned for the brand name, and the resulting
    position is written to MongoDB through the module-level ``mongoSession``.
    """

    # Rank reported when the brand is not found or the lookup fails.
    NOT_FOUND = '50+'
    # Number of result pages scanned per keyword, and the ``pn`` offset step
    # between consecutive pages.
    PAGES = 3
    PAGE_SIZE = 10
    # Seconds to wait for a response before treating the request as failed,
    # so one stalled connection cannot hang the whole run.
    TIMEOUT = 10

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
        }
        self.url = "https://www.baidu.com/s"
        # Brand names looked for in result headings (currently identical).
        self.hm = "华美顾问"
        self.hm_gg = "华美顾问"

    @staticmethod
    def _heading_text(fragments):
        """Join a heading's text fragments, dropping any that contain a newline.

        A new string is built instead of removing items from the list while
        iterating over it, which skipped adjacent newline fragments.
        """
        return ''.join(part for part in fragments if '\n' not in part)

    @staticmethod
    def _day_period(hour):
        """Map an hour of day (0-23) to '早' (<12), '中' (<18) or '晚'."""
        if hour < 12:
            return '早'
        if hour < 18:
            return '中'
        return '晚'

    def _is_brand(self, text):
        """Return True when *text* contains either configured brand name."""
        return self.hm in text or self.hm_gg in text

    def page(self, kw="酒店顾问"):
        """Return the brand's rank for keyword *kw* as a string.

        The rank is ``str(pn + i + 1)`` for the first matching heading, where
        ``pn`` is the page offset sent to Baidu and ``i`` the index of the
        ``<h3>`` on that page. Returns ``'50+'`` when no heading matches or
        when fetching/parsing fails; the failure is printed, not raised.
        """
        try:
            for j in range(self.PAGES):
                pn = j * self.PAGE_SIZE
                data = {"wd": kw, "pn": pn}
                r = requests.get(
                    self.url,
                    params=data,
                    headers=self.headers,
                    timeout=self.TIMEOUT,
                )
                html = etree.HTML(r.text)
                wb_title = html.xpath('//title/text()')[0]
                print("百度电脑端", j + 1, "-", wb_title)
                for i, h3 in enumerate(html.xpath("//h3")):
                    h3_text = self._heading_text(h3.xpath('.//text()'))
                    if self._is_brand(h3_text):
                        # The first hit is the rank; scanning further would
                        # overwrite it with a worse position.
                        return str(pn + i + 1)
        except Exception as e:
            # Deliberate best-effort boundary: a failed lookup is reported and
            # recorded as "not found" so one bad keyword does not abort run().
            # No ``return`` inside ``finally`` — that also swallowed
            # KeyboardInterrupt/SystemExit.
            print("百度电脑端请求url失败--", kw)
            print(e)
        return self.NOT_FOUND

    def save_mongodb(self, kw_dict):
        """Insert one result document per ``keyword -> rank`` pair of *kw_dict*.

        Every document of a call shares the same timestamp, formatted date and
        早/中/晚 label, and is inserted into database ``jk``, collection
        ``result``.
        """
        t = time.time()
        # Derive both the date string and the day period from one instant so
        # they cannot disagree across an hour boundary.
        now = time.localtime(t)
        ymd = time.strftime('%Y-%m-%d/%H:%M', now)
        zzw = self._day_period(now.tm_hour)
        for key, value in kw_dict.items():
            h_dict = {
                "平台": '百度电脑端',
                "公司": '华美',
                'time': int(t),
                '年月日': ymd,
                '早中晚': zzw,
                '关键词': key,
                '排名': value,
            }
            mongoSession.insert_one('jk', 'result', h_dict)

    def run(self, kw_li):
        """Look up the rank of every keyword in *kw_li* and store the results."""
        kw_dict = {kw: self.page(kw) for kw in kw_li}
        self.save_mongodb(kw_dict)
        print("百度电脑端完成!")
def get_kw():
    """Load the keyword document from database ``jk``, collection ``keyword``.

    Returns:
        The first document returned by ``mongoSession.find_all``, with its
        ``_id`` field removed.

    Raises:
        IndexError: if the query returns no documents.
    """
    # First argument is the database name, second the collection name.
    result = mongoSession.find_all("jk", "keyword")
    count = len(result)
    if count == 0:
        # Fail with a clear message instead of announcing success and then
        # crashing on result[0].
        raise IndexError('no keyword document found in jk.keyword')
    print('查询成功有数据,共{}条'.format(count))
    kw_dict = result[0]
    # Tolerate a document that has no '_id' field.
    kw_dict.pop('_id', None)
    return kw_dict
def main():
    """Fetch the configured keywords and record their Baidu desktop rankings."""
    kw_dict = get_kw()
    print(kw_dict)
    # Entries ending in '类' are skipped — presumably category labels rather
    # than search terms; confirm against the keyword collection.
    kw_li = [kw for kw in kw_dict['关键词'] if not kw.endswith('类')]
    # NOTE(review): only the first two keywords are processed. This looks
    # like a leftover debugging limit — confirm before removing.
    kw_li = kw_li[:2]
    baidu = Baidu()
    baidu.run(kw_li)


if __name__ == "__main__":
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/wumuchen/huamei.git
git@gitee.com:wumuchen/huamei.git
wumuchen
huamei
huamei
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385