1 Star 0 Fork 0

吴超/huamei

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
jk_use.py 56.44 KB
一键复制 编辑 原始数据 按行查看 历史
wmc 提交于 2019-10-20 22:27 . first commit
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322
from MongoDbHandler import MongoDbHandler
from lxml import etree
import pandas as pd
import requests
import json
import time,os
import hashlib
import random
import copy
import multiprocessing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
# --- Run-wide configuration -------------------------------------------------
# Epoch second at which this run started.
t_start = int(time.time())
path = r'C:/搜索排名/'
requests.packages.urllib3.disable_warnings()

# Today's date as YYYYMMDD; t0 / t1 are the epoch seconds (local time) of
# 00:00:00 and 23:59:59 of that day. The find_key() methods use them to test
# whether a keyword already has a record for today.
t = time.strftime('%Y%m%d')
t0 = int(time.mktime(time.strptime(t + '000000', "%Y%m%d%H%M%S")))
t1 = int(time.mktime(time.strptime(t + '235959', "%Y%m%d%H%M%S")))

# --- Keyword / site configuration stored in MongoDB -------------------------
mongoSession = MongoDbHandler()
db = 'jk'
collection = 'rank11'
# Arguments are (database name, collection name).
result = mongoSession.find_all("jk", "keyword")
print('keyword查询关键词成功有数据,共{}条'.format(len(result)))

# Only the first configuration document is used.
kw_dict = result[0]
kw_li_all = kw_dict['kw']
kw_li = []
user = kw_dict['user']
sites = kw_dict['sites']
sites_li = list(sites.keys())

# Entries ending in '类' are left out of the list of keywords to search.
for kw in kw_li_all:
    if kw.endswith('类'):
        continue
    kw_li.append(kw)
class Baidu():
    """Find keyword rankings on Baidu desktop search using headless Chrome.

    Reads the module-level ``kw_dict`` (site markers), ``user``, ``db``,
    ``collection``, ``mongoSession``, ``t0`` and ``t1``.
    A rank is the 1-based position as a string, or ``'50+'`` when not found.
    """

    def __init__(self):
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')
        self.chrome_options.add_argument('log-level=3')
        self.url = "https://www.baidu.com"
        # Semicolon-separated markers to look for in a result's display URL.
        # Blank entries are dropped: an empty string is a substring of every
        # result and would make each keyword report rank 1.
        self.hm_url_li = [
            s for s in kw_dict['sites']['百度电脑端'].split(';') if s.strip()]

    def hand_page(self, div, pm, n):
        """Scan one result ``div`` and update the running rank counter.

        Args:
            div: element exposing ``find_elements_by_xpath``.
            pm: current rank string (``'50+'`` while nothing matched).
            n: number of results counted so far.

        Returns:
            ``(pm, n)`` after counting this div. A div without any URL
            element, or one that raises while being read, counts as one
            result.
        """
        try:
            url_r_li = div.find_elements_by_xpath(
                './/*[@class="c-tools"]/preceding-sibling::*')
            if url_r_li:
                for url_r in url_r_li:
                    url_t = url_r.text
                    n += 1
                    if any(hm_url in url_t for hm_url in self.hm_url_li):
                        pm = str(n)
                        break
            else:
                n += 1
        except Exception:
            # Best effort: an unreadable block still occupies one position.
            n += 1
        return pm, n

    def page(self, kw):
        """Search ``kw`` and walk up to three result pages.

        Returns:
            The rank string, ``'50+'`` if no marker was found or the pages
            could not be read.
        """
        self.driver.get(self.url)
        search_box = self.driver.find_element_by_id("kw")
        search_box.clear()
        search_box.send_keys(kw)
        self.driver.find_element_by_id("su").click()
        time.sleep(1)
        pm = '50+'
        n = 0
        try:
            for page_no in range(1, 4):
                # Raises if no search result is present on the page.
                self.driver.find_element_by_class_name('result')
                title = self.driver.title
                print("百度电脑端", ':', kw, page_no, "-", title)
                cont_left = self.driver.find_element_by_id('content_left')
                for div in cont_left.find_elements_by_xpath('./div'):
                    pm, n = self.hand_page(div, pm, n)
                    if pm != '50+':
                        return pm
                self.driver.find_element_by_xpath(
                    '//a[@class="n"][last()]').click()
                # Give the next page time to load after the click.
                time.sleep(2)
        except Exception as e:
            # Still best effort, but no longer silent.
            print("百度电脑端url失败--", kw)
            print(e)
        return pm

    def save_mongodb(self, kw_dict):
        """Insert one ranking record into the results collection."""
        mongoSession.insert_one(db, collection, kw_dict)

    def find_key(self, kw):
        """Return today's stored records for ``kw`` on this platform."""
        f_di = {'平台': '百度电脑端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today.

        The browser is always shut down, even when a keyword raises, so no
        Chrome process is left behind.
        """
        self.driver = webdriver.Chrome(chrome_options=self.chrome_options)
        try:
            self.driver.implicitly_wait(5)
            self.driver.maximize_window()
            for kw in kw_li:
                if self.find_key(kw):
                    continue
                pm = self.page(kw)
                record = {
                    "平台": '百度电脑端',
                    'user': user,
                    'time': int(time.time()),
                    '关键词': kw,
                    '排名': pm,
                }
                self.save_mongodb(record)
            print("百度电脑端完成!")
        finally:
            self.driver.quit()
class Baidu_APP():
    """Find keyword rankings on Baidu mobile search via plain HTTP requests.

    Reads the module-level ``kw_dict``, ``user``, ``db``, ``collection``,
    ``mongoSession``, ``t0`` and ``t1``.
    """

    def __init__(self):
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,en-US;q=0.6',
            'referer': 'https://m.baidu.com/from=844b/s',
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
        }
        # Original author's notes: garbled output may mean a wrong URL, and
        # the URL without the "from" segment returns results without ads.
        self.url = "https://m.baidu.com/from=844b/s"
        # Blank entries are dropped: '' is contained in every string and
        # would make every keyword report rank 1.
        self.hm_url_li = [
            s for s in kw_dict['sites']['百度手机端'].split(';') if s.strip()]

    def hang_page(self, div_li, pm, n):
        """Count result containers and stop at the first one showing a marker.

        Args:
            div_li: nodes exposing ``xpath``; each is one search result.
            pm: current rank string.
            n: number of results counted so far.

        Returns:
            ``(pm, n)``; ``pm`` becomes ``str(n)`` at the first match.
        """
        for div in div_li:
            n += 1
            h3_li = div.xpath(
                './/*[@class="c-showurl" or @class="c-color-gray" or @class="c-foot-source" or @class="c-showurl c-footer-showurl"]//text()')
            if not h3_li:
                continue
            # The last text node is the one compared against the markers.
            h3 = h3_li[-1]
            if any(hm_url in h3 for hm_url in self.hm_url_li):
                pm = str(n)
                break
        return pm, n

    def _dump_page(self, kw, page_no, con):
        """Write the fetched HTML to the working directory.

        A write failure (for instance a keyword that is not a legal file
        name) is reported but must not abort the ranking itself.
        """
        try:
            with open("百度手机_{}_{}.html".format(kw, page_no), "w", encoding="utf8") as f:
                f.write(con)
        except OSError as e:
            print("百度手机端页面保存失败--", kw)
            print(e)

    def page(self, kw):
        """Fetch up to three result pages for ``kw`` and return the rank.

        Returns:
            Rank string, or ``'50+'`` if not found or a request failed.
        """
        pm = '50+'
        n = 0
        try:
            for j in range(3):
                data = {"word": kw, "pn": j * 10}
                # The timeout keeps one stalled connection from hanging the run.
                r = requests.get(self.url, params=data,
                                 headers=self.headers, timeout=15)
                con = r.text
                self._dump_page(kw, j + 1, con)
                html = etree.HTML(con)
                wb_title = html.xpath('//title/text()')[0]
                print("百度手机端:", kw, j + 1, "-", wb_title)
                div_li = html.xpath('//*[contains(@class,"c-container")]')
                pm, n = self.hang_page(div_li, pm, n)
                if pm != '50+':
                    break
        except Exception as e:
            print("百度手机端url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return pm

    def save_mongodb(self, kw_dict):
        """Insert one ranking record into the results collection."""
        mongoSession.insert_one(db, collection, kw_dict)

    def find_key(self, kw):
        """Return today's stored records for ``kw`` on this platform."""
        f_di = {'平台': '百度手机端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '百度手机端',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            self.save_mongodb(record)
        print("百度手机端完成!")
class Google:
    """Find keyword rankings on Google through a local proxy.

    One request asks for 50 results; the same rank is stored for the desktop
    and the mobile platform. Reads the module-level ``kw_dict``, ``user``,
    ``db``, ``collection``, ``mongoSession``, ``t0`` and ``t1``.
    """

    def __init__(self):
        # A random User-Agent is picked per request to reduce the chance of
        # the IP being blocked.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
        ]
        # Blank entries are dropped: '' is contained in every href and would
        # make every keyword report rank 1.
        self.hm_url_li = [
            s for s in kw_dict['sites']['谷歌电脑端'].split(';') if s.strip()]
        self.url = "https://www.google.com/search"
        # NOTE(review): the "https" entry uses an https:// proxy URL; if the
        # local proxy only speaks plain HTTP this may need to be http:// -
        # confirm against the deployed proxy and requests/urllib3 version.
        self.proxies = {
            "http": "http://127.0.0.1:10801",
            "https": "https://127.0.0.1:10801",
        }

    def _find_rank(self, h3url_li):
        """Return the 1-based position of the first href containing a marker.

        Returns ``'50+'`` when no href matches.
        """
        for position, h3url in enumerate(h3url_li, 1):
            if any(hm_url in h3url for hm_url in self.hm_url_li):
                return str(position)
        return '50+'

    def page(self, kw):
        """Request 50 results for ``kw`` and return the rank string.

        Returns:
            Rank string, or ``'50+'`` if not found, the page title does not
            contain the keyword, or the request failed.
        """
        headers = {
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
            # Original author's note: without a referer a utf8 decoding
            # error was observed.
            "referer": "https://www.google.com/",
            "User-Agent": random.choice(self.user_agent_list),
            'Connection': 'close',
        }
        data = {"q": kw, "num": 50, "hl": "zh-CN"}
        pm = '50+'
        try:
            # The timeout keeps a dead proxy from hanging the whole run.
            r = requests.get(self.url, params=data, headers=headers,
                             proxies=self.proxies, verify=False, timeout=30)
            html = etree.HTML(r.text)
            title = html.xpath("//title/text()")[0]
            print('谷歌', 1, kw, r.status_code, "-后面汉字为成功-", title)
            # Results are only parsed when the title echoes the keyword.
            if kw in title:
                pm = self._find_rank(html.xpath("//h3/../@href"))
        except Exception as e:
            print("谷歌url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return pm

    def save_mongodb(self, kw_dict, kw_dict_app):
        """Insert the desktop and the mobile ranking record."""
        mongoSession.insert_one(db, collection, kw_dict)
        mongoSession.insert_one(db, collection, kw_dict_app)

    def find_key(self, kw):
        """Return today's stored desktop records for ``kw``."""
        f_di = {'平台': '谷歌电脑端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '谷歌电脑端',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            record_app = copy.deepcopy(record)
            record_app["平台"] = '谷歌手机端'
            self.save_mongodb(record, record_app)
            # Pause between requests (the original author suggests > 20 s)
            # to avoid anti-crawler blocking.
            time.sleep(25)
        print("谷歌完成!")
class Weibo:
    """Find keyword rankings in Weibo's mobile search JSON API.

    The same rank is stored for the desktop and the mobile platform.
    Reads the module-level ``kw_dict``, ``user``, ``db``, ``collection``,
    ``mongoSession``, ``t0`` and ``t1``.
    """

    def __init__(self):
        self.url = "https://m.weibo.cn/api/container/getIndex"
        # Only the first configured entry is used; it is compared with each
        # result's ``screen_name``.
        self.hm = kw_dict['sites']['微博手机端'].split(';')[0]
        self.headers = {
            "Accept": "application/json,text/plain,*/*",
            "MWeibo-Pwa": "1",
            "Referer": "https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E9%85%92%E5%BA%97%E9%A1%BE%E9%97%AE",
            "Sec-Fetch-Mode": "cors",
            "User-Agent": "Mozilla/5.0(iPhone;CPUiPhoneOS11_0likeMacOSX)AppleWebKit/604.1.38(KHTML,likeGecko)Version/11.0Mobile/15A372Safari/604.1",
            "X-Requested-With": "XMLHttpRequest",
            "X-XSRF-TOKEN": "caffd8",
        }

    def card_11(self, card_group_li, pm, n):
        """Scan the ``card_group`` of a ``card_type == 11`` card.

        Cards of type 9 (name under ``mblog.user``) and type 10 (name under
        ``user``) each count as one result; other types are ignored.

        Returns:
            ``(pm, n)``; ``pm`` becomes ``str(n)`` at the first match.
        """
        for card in card_group_li:
            card_type = card['card_type']
            if card_type == 9:
                user_name = card['mblog']['user']['screen_name']
            elif card_type == 10:
                user_name = card['user']['screen_name']
            else:
                continue
            n += 1
            if self.hm == user_name:
                pm = str(n)
                break
        return pm, n

    def page(self, kw):
        """Query the first two result pages for ``kw`` and return the rank.

        An empty ``data`` payload is not treated specially (the original
        author notes that this anti-crawler case is unhandled).

        Returns:
            Rank string, or ``'50+'`` if not found or a request failed.
        """
        pm = '50+'
        try:
            n = 0
            # Weibo page numbers start at 1.
            for j in range(1, 3):
                data = {
                    "containerid": "100103type=1&q={}".format(kw),
                    "page_type": "searchall",
                    "page": j,
                }
                # The timeout keeps one stalled connection from hanging the run.
                r = requests.get(self.url, params=data,
                                 headers=self.headers, timeout=15)
                print("微博:{}-{}".format(j, kw))
                con = r.json()
                for m in con["data"]["cards"] or []:
                    if m['card_type'] == 9:
                        n += 1
                        if self.hm == m['mblog']['user']['screen_name']:
                            pm = str(n)
                    elif m['card_type'] == 11:
                        pm, n = self.card_11(m['card_group'], pm, n)
                    if pm != '50+':
                        break
                if pm != '50+':
                    break
        except Exception as e:
            print("微博url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return pm

    def save_mongodb(self, kw_dict, kw_dict_app):
        """Insert the desktop and the mobile ranking record."""
        mongoSession.insert_one(db, collection, kw_dict)
        mongoSession.insert_one(db, collection, kw_dict_app)

    def find_key(self, kw):
        """Return today's stored desktop records for ``kw``."""
        f_di = {'平台': '微博电脑端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '微博电脑端',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            record_app = copy.deepcopy(record)
            record_app["平台"] = '微博手机端'
            self.save_mongodb(record, record_app)
            # Pause between keywords to avoid anti-crawler blocking.
            time.sleep(10)
        print("微博完成!")
class Zhihu:
    """Find keyword rankings in Zhihu's search JSON API.

    The same rank is stored for the desktop and the mobile platform.
    Reads the module-level ``kw_dict``, ``user``, ``db``, ``collection``,
    ``mongoSession``, ``t0`` and ``t1``.
    """

    def __init__(self):
        self.url = "https://www.zhihu.com/api/v4/search_v3"
        # Configured author ``url_token`` values (e.g. "hua-mei-gu-wen-24").
        # The token is used rather than the display name because the name
        # may contain <em> highlighting. Blank entries are dropped.
        self.hm_url_li = [
            s for s in kw_dict['sites']['知乎手机端'].split(';') if s.strip()]
        self.headers = {
            "Accept": "application/json,text/plain,*/*",
            # Mobile user agent.
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
        }

    def _rank_in_results(self, res_li, page_index):
        """Return the rank of the first result written by a configured author.

        Args:
            res_li: the ``data`` list of one API response.
            page_index: zero-based page number; each page holds 20 results.

        Returns:
            Rank string, or ``'50+'`` when no author matches.
        """
        for i, m in enumerate(res_li):
            try:
                url_token = m['object']['author']['url_token']
            except (KeyError, TypeError):
                # Some entries carry no object/author; they still occupy a
                # position but can never match.
                continue
            # The original compared against ``self.hm``, which is never
            # assigned, so the lookup always failed and every rank was '50+'.
            if url_token in self.hm_url_li:
                return str(page_index * 20 + i + 1)
        return '50+'

    def page(self, kw="酒店顾问"):
        """Query the first two result pages (20 each) for ``kw``.

        Returns:
            Rank string, or ``'50+'`` if not found or a request failed.
        """
        pm = '50+'
        try:
            for j in range(2):
                datas = {
                    "q": kw,
                    "t": "general",
                    "offset": j * 20,
                    "limit": 20,
                }
                # The timeout keeps one stalled connection from hanging the run.
                r = requests.get(self.url, params=datas,
                                 headers=self.headers, timeout=15)
                print("知乎:{}-{}".format(j+1, kw))
                con = r.json()
                pm = self._rank_in_results(con["data"] or [], j)
                if pm != '50+':
                    break
        except Exception as e:
            print("知乎url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return str(pm)

    def save_mongodb(self, kw_dict, kw_dict_app):
        """Insert the desktop and the mobile ranking record."""
        mongoSession.insert_one(db, collection, kw_dict)
        mongoSession.insert_one(db, collection, kw_dict_app)

    def find_key(self, kw):
        """Return today's stored desktop records for ``kw``."""
        f_di = {'平台': '知乎电脑端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '知乎电脑端',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            record_app = copy.deepcopy(record)
            record_app["平台"] = '知乎手机端'
            self.save_mongodb(record, record_app)
            # Pause between keywords (> 10 s suggested) to avoid
            # anti-crawler blocking.
            time.sleep(10)
        print("知乎完成!")
class Toutiao_PC:
    """Find keyword rankings in Toutiao's desktop search JSON API.

    Reads the module-level ``kw_dict``, ``user``, ``db``, ``collection``,
    ``mongoSession``, ``t0`` and ``t1``.
    """

    def __init__(self):
        # Despite its name this holds the account *name* (first configured
        # entry) and is compared with each item's ``source`` field. The
        # original author notes user_id would be more stable, but customers
        # do not know how to find it.
        self.hm_uid = kw_dict['sites']['今日头条电脑端'].split(';')[0]
        self.url = 'https://www.toutiao.com/api/search/content/'
        self.headers = {
            'accept': 'application/json, text/javascript',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,en-US;q=0.6',
            'cache-control': 'no-cache',
            'content-type': 'application/x-www-form-urlencoded',
            # Original author's note: with a cookie the API returns 20 items
            # matching the browser; without it only 10 differing items.
            'cookie': 'tt_webid=6730868230266684932; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6730868230266684932; csrftoken=0ee1c29623fbaf16be4f2d846aab784a; _ga=GA1.2.1829696456.1567411449; WIN_WH=1536_832; s_v_web_id=7d68b0820ff2e83ce5bf1877bc8ee6e3; RT="z=1&dm=toutiao.com&si=iyu7jgr4hrq&ss=k10e84u9&sl=1&tt=3qy&ld=58kmw&r=32aeb9eb56facd8304cfc1b363566bd2&ul=58kn1&hd=58knm"; __tasessionId=y749tador1569503233797',
            'pragma': 'no-cache',
            'referer': 'https://www.toutiao.com/search/',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
        }

    def _scan_items(self, data_li, pm, n):
        """Count displayable items and stop at the first one from our account.

        An item only counts when ``abstract``, ``title``, ``source`` and
        ``user_id`` are all present and truthy (per the original author,
        only such items are shown as a result on the page).

        Returns:
            ``(pm, n)``; ``pm`` becomes ``str(n)`` at the first match.
        """
        for data in data_li:
            auther = data.get('source')
            if not (data.get('abstract') and data.get('title')
                    and auther and data.get('user_id')):
                continue
            n += 1
            if self.hm_uid == str(auther):
                pm = str(n)
                break
        return pm, n

    def page(self, kw):
        """Query three pages of 20 items for ``kw`` and return the rank.

        Returns:
            Rank string, or ``'50+'`` if not found or a request failed.
        """
        pm = '50+'
        n = 0
        try:
            for i in range(3):
                params = {
                    'aid': '24',
                    'app_name': 'web_search',
                    'offset': i*20,
                    'format': 'json',
                    'keyword': kw,
                    'autoload': 'true',
                    'count': 20,
                    'en_qc': '1',
                    'cur_tab': 1,
                    'from': 'search_tab',
                    'pd': 'synthesis',
                }
                # The timeout keeps one stalled connection from hanging the run.
                r = requests.get(self.url, params=params,
                                 headers=self.headers, timeout=15)
                print("今日头条电脑端:{}-{}".format(i+1, kw))
                con = r.json()
                pm, n = self._scan_items(con['data'] or [], pm, n)
                if pm != '50+':
                    break
        except Exception as e:
            print("今日头条电脑端url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return str(pm)

    def save_mongodb(self, kw_dict):
        """Insert one ranking record into the results collection."""
        mongoSession.insert_one(db, collection, kw_dict)

    def find_key(self, kw):
        """Return today's stored records for ``kw`` on this platform."""
        f_di = {'平台': '今日头条电脑端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '今日头条电脑端',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            self.save_mongodb(record)
            # Pause between keywords (> 10 s suggested) to avoid
            # anti-crawler blocking.
            time.sleep(10)
        print("今日头条电脑端完成!")
class Toutiao_APP:
    """Find keyword rankings on Toutiao mobile search (HTML responses).

    Two tabs are covered: "synthesis" (``page`` / ``run``) and
    "information" (``page_info`` / ``run_info``). Reads the module-level
    ``kw_dict``, ``user``, ``db``, ``collection``, ``mongoSession``, ``t0``
    and ``t1``.
    """

    def __init__(self):
        # Markers may be a site URL or an account name; the mobile site does
        # not expose a user id. Blank entries are dropped: '' is contained
        # in every string and would make every keyword report rank 1.
        self.hm_url_li = [
            s for s in kw_dict['sites']['今日头条手机端综合'].split(';')
            if s.strip()]
        self.url = 'https://m.toutiao.com/search/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
            'Host': 'm.toutiao.com',
            'Referer': 'https://m.toutiao.com/search/',
        }

    def _scan_texts(self, texts, pm, n):
        """Count one result per text and stop at the first containing a marker.

        Args:
            texts: iterable of source strings, one per result; an empty
                string stands for a result without source text.
            pm: current rank string.
            n: number of results counted so far.

        Returns:
            ``(pm, n)``; ``pm`` becomes ``str(n)`` at the first match.
        """
        for text in texts:
            n += 1
            if any(hm_url in text for hm_url in self.hm_url_li):
                return str(n), n
        return pm, n

    def page(self, kw):
        """Rank ``kw`` in the "synthesis" tab over five pages of 10 results.

        Returns:
            Rank string, or ``'50+'`` if not found or a request failed.
        """
        pm = '50+'
        n = 0
        try:
            for j in range(5):
                pn = j * 10
                # offset and start_index both carry the zero-based start
                # position of the page, in steps of 10.
                params = {"keyword": kw, 'count': 10,
                          "offset": pn, 'start_index': pn, 'pd': 'synthesis'}
                # The timeout keeps one stalled connection from hanging the run.
                r = requests.get(self.url, params=params,
                                 headers=self.headers, timeout=15)
                html = etree.HTML(r.text)
                wb_title = html.xpath('//title/text()')[0]
                print("今日头条手机端综合", j + 1, "-", wb_title)
                res_li = html.xpath('//div[@class="result-content"]')
                # The source may be split over several text nodes (e.g.
                # ['华美', '酒店顾问']), so the pieces are joined first.
                texts = (
                    ''.join(res.xpath(
                        './/div[contains(@class,"ts-size14")]/span[1]//text()'))
                    for res in res_li)
                pm, n = self._scan_texts(texts, pm, n)
                if pm != '50+':
                    break
        except Exception as e:
            print("今日头条手机端综合url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return str(pm)

    def page_info(self, kw):
        """Rank ``kw`` in the "information" tab over four pages of 10 results.

        Returns:
            Rank string, or ``'50+'`` if not found or a request failed.
        """
        pm = '50+'
        n = 0
        try:
            for j in range(4):
                pn = j * 10
                params = {"keyword": kw, 'count': 10,
                          "offset": pn, 'start_index': pn, 'pd': 'information'}
                r = requests.get(self.url, params=params,
                                 headers=self.headers, timeout=15)
                html = etree.HTML(r.text)
                wb_title = html.xpath('//title/text()')[0]
                print("今日头条手机端资讯:", kw, j + 1, "-", wb_title)
                res_li = html.xpath(
                    '//div[contains(@class,"tt-word3")]/span[1]')
                texts = (''.join(res.xpath('.//text()')) for res in res_li)
                pm, n = self._scan_texts(texts, pm, n)
                if pm != '50+':
                    break
        except Exception as e:
            print("今日头条手机端资讯url失败--", kw)
            print(e)
        return str(pm)

    def save_mongodb(self, kw_dict):
        """Insert one ranking record into the results collection."""
        mongoSession.insert_one(db, collection, kw_dict)

    def find_key(self, kw):
        """Return today's stored "synthesis" records for ``kw``."""
        f_di = {'平台': '今日头条手机端综合', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def find_key_info(self, kw):
        """Return today's stored "information" records for ``kw``."""
        f_di = {'平台': '今日头条手机端资讯', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank keywords without a "synthesis" record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '今日头条手机端综合',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            self.save_mongodb(record)
            # Pause between keywords (> 10 s suggested) to avoid
            # anti-crawler blocking.
            time.sleep(10)
        print("今日头条手机端综合完成!")

    def run_info(self, kw_li):
        """Rank keywords without an "information" record for today."""
        for kw in kw_li:
            if self.find_key_info(kw):
                continue
            pm = self.page_info(kw)
            record = {
                "平台": '今日头条手机端资讯',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            self.save_mongodb(record)
            time.sleep(10)
        print("今日头条手机端资讯完成!")
class Linkedin_PC():
    """Find keyword rankings in LinkedIn's blended people search API.

    The same rank is stored for the desktop and the mobile platform.
    Reads the module-level ``kw_dict``, ``user``, ``db``, ``collection``,
    ``mongoSession``, ``t0`` and ``t1``.
    """

    def __init__(self):
        self.url = 'https://www.linkedin.com/voyager/api/search/blended'
        # Blank entries are dropped: '' is contained in every string and
        # would make every keyword report rank 1.
        self.hm_url_li = [
            s for s in kw_dict['sites']['领英手机端'].split(';') if s.strip()]
        self.headers = {
            # NOTE(review): a logged-in session cookie and csrf token are
            # hard-coded here; they are credentials and will expire. The
            # current time is formatted into the trailing lidc field (the
            # original author's anti-crawler measure).
            'cookie': 'bcookie="v=2&93fdaca3-b373-4e7a-8681-3d6f71c35d28"; bscookie="v=1&2019092811214926184922-eec0-4d4c-8364-622029971d75AQEFrEhauWeiBAoagoi-RRw3wjAg2MNl"; _ga=GA1.2.377214128.1569669708; _gat=1; AMCVS_14215E3D5995C57C0A495C55%40AdobeOrg=1; AMCV_14215E3D5995C57C0A495C55%40AdobeOrg=-1303530583%7CMCIDTS%7C18168%7CMCMID%7C53491006891487876871671437302217651683%7CMCAAMLH-1570274508%7C11%7CMCAAMB-1570274508%7C6G1ynYcLPuiQxYZrsz_pkqfLG9yMXBpb2zX5dvJdYQJzPXImdj0y%7CMCOPTOUT-1569676908s%7CNONE%7CvVersion%7C3.3.0; aam_uuid=54012205828187279801728551448569431592; lil-lang=zh_CN; utag_main=v_id:016d779d389500161be06a0184c200087005d07f004bb$_sn:1$_se:1$_ss:1$_st:1569671655384$ses_id:1569669855384%3Bexp-session$_pn:1%3Bexp-session$vapi_domain:linkedin.com; pushPermState=default; appUpsellCoolOff=1569669921942; visit=v=1&M; JSESSIONID="ajax:6098893895897455442"; lissc1=1; lissc2=1; RT=s=1569669941239&r=https%3A%2F%2Fwww.linkedin.com%2Fstart%2Fjoin%3Ftrk%3Dguest_homepage-basic_nav-header-join; li_at=AQEDAS0342kDDObDAAABbXee5d4AAAFtm6tp3lEARHaJdL8xsjPZ9K-sIBb8PxqIDuJ7OyXdlCmASre2UlKcdBRgLn_EBASs-eAHKqvLW_MVnWg0l6z3XYIVWB0BjKXrmCCjG3Te89rQdrApJB3E3LvH; liap=true; sl=v=1&XobLR; li_cc=AQF9ajyFvnWGnAAAAW13nudpUr8PKwkufx8JiE2uuAa_U8u9i6QvGgSbS-m1SeNC7tY3wk7L8W_X; lang=v=2&lang=zh-cn; lidc="b=OGST09:g=1293:u=1:i={0:.0f}:t={0:.0f}:s=AQE3IpLZ9xmu0Uk8PEtrsZZKzfQlOQce"'.format(time.time()),
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,en-US;q=0.6',
            'csrf-token': 'ajax:6098893895897455442',
            'referer': 'https://www.linkedin.com/mwlite/search/results/people',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
        }

    @staticmethod
    def _profile_text(el):
        """Return a PROFILE hit's headline text, or '' when absent."""
        try:
            return el['headline']['text']
        except (KeyError, TypeError):
            return ''

    @staticmethod
    def _jymbii_text(el):
        """Return a JYMBII hit's company name, or '' when absent."""
        try:
            return el['jymbii']['jymbiiUpdate']['company'][
                'com.linkedin.voyager.feed.shared.ExternalCompany']['companyName']
        except (KeyError, TypeError):
            return ''

    def hand_page(self, ele, pm, n):
        """Scan one top-level ``SEARCH_HITS`` element of the response.

        Every entry of ``elements`` and then ``extendedElements`` counts as
        one result, exactly once regardless of how many markers are
        configured. (The original advanced the counter once per marker, so
        extra markers inflated the reported rank.) Only ``PROFILE`` and
        ``JYMBII`` entries respectively are compared with the markers.

        Returns:
            ``(pm, n)``; on the first match ``pm`` is ``str(n)``.
        """
        if ele.get('type') != 'SEARCH_HITS':
            return pm, n
        sources = (
            (ele.get('elements') or [], 'PROFILE', self._profile_text),
            (ele.get('extendedElements') or [], 'JYMBII', self._jymbii_text),
        )
        for items, wanted_type, extract in sources:
            for el in items:
                n += 1
                if el.get('type') != wanted_type:
                    continue
                text = extract(el)
                if any(hm_url in text for hm_url in self.hm_url_li):
                    return str(n), n
        return pm, n

    def page(self, kw):
        """Request one page of up to 40 people results and return the rank.

        Per the original author, ``count`` may be at most 40 (50 returns no
        data), so a single request suffices.

        Returns:
            Rank string, or ``'50+'`` if not found or the request failed.
        """
        pm = '50+'
        n = 0
        try:
            for i in range(1):
                params = {
                    'count': 40,
                    'q': 'all',
                    'keywords': kw,
                    'start': i*10,
                    'filters': 'List(resultType->PEOPLE)',
                    # Marked as a required parameter by the original author.
                    'origin': 'HISTORY',
                }
                r = requests.get(self.url, params=params,
                                 headers=self.headers, timeout=15)
                con = r.json()
                print("领英:", kw, i + 1, '-', r.status_code)
                for ele in con['elements']:
                    pm, n = self.hand_page(ele, pm, n)
                    if pm != '50+':
                        break
                if pm != '50+':
                    break
        except Exception as e:
            print("领英url失败--", kw)
            print(e)
        # Returned after the try block rather than from ``finally`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return str(pm)

    def save_mongodb(self, kw_dict, kw_dict_app):
        """Insert the desktop and the mobile ranking record."""
        mongoSession.insert_one(db, collection, kw_dict)
        mongoSession.insert_one(db, collection, kw_dict_app)

    def find_key(self, kw):
        """Return today's stored desktop records for ``kw``."""
        f_di = {'平台': '领英电脑端', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Rank every keyword in ``kw_li`` that has no record for today."""
        for kw in kw_li:
            if self.find_key(kw):
                continue
            pm = self.page(kw)
            record = {
                "平台": '领英电脑端',
                'user': user,
                'time': int(time.time()),
                '关键词': kw,
                '排名': pm,
            }
            record_app = copy.deepcopy(record)
            record_app["平台"] = '领英手机端'
            self.save_mongodb(record, record_app)
            # Pause between keywords (> 20 s suggested) to avoid
            # anti-crawler blocking.
            time.sleep(20)
        print("领英电脑端完成!")
        print("领英手机端完成!")
class Twitter():
    """Rank lookup and bookkeeping for Twitter search ("top" and "latest").

    ``page`` / ``page_latest`` scrape the search page through a local proxy
    and report the 1-based position of the first listed user name that
    contains ``self.hm``. ``run`` / ``run_latest`` store those ranks, and
    ``run_no`` stores the ``'50+'`` placeholder for Twitter and Facebook
    platforms without issuing any request.
    """

    def __init__(self):
        self.headers = {
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,en-US;q=0.6',
            'Referer': 'https://twitter.com/search',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Connection': 'close',
        }
        # Twitter user name searched for in the result list. It must be
        # set: ``_search_rank`` reads it, and without it every lookup that
        # found any user raised AttributeError and fell back to '50+'.
        self.hm = "huamei2001"
        self.url = "https://twitter.com/search"
        self.proxies = {
            "http": "http://127.0.0.1:10801",
            "https": "https://127.0.0.1:10801",
        }

    def _search_rank(self, kw, tab, extra_params=None):
        """Return the rank of ``self.hm`` for ``kw``; ``tab`` only labels the log output."""
        pm = '50+'
        try:
            params = {"q": kw, 'src': 'typd'}
            if extra_params:
                params.update(extra_params)
            # NOTE(review): verify=False disables TLS certificate checks.
            r = requests.get(
                self.url, params=params, headers=self.headers,
                proxies=self.proxies, verify=False, timeout=15)
            html = etree.HTML(r.text)
            title = html.xpath("//title/text()")[0]
            print('twitter的' + tab, kw, 1, "-", title, r.status_code)
            user_li = html.xpath(
                '//span[contains(@class,"username")]/b/text()')
            for position, name in enumerate(user_li, start=1):
                if self.hm in name:
                    # Stop at the first hit so the best position is kept.
                    pm = str(position)
                    break
        except Exception as e:
            print("twitter的{}请求url失败--".format(tab), kw)
            print(e)
        # Returned outside any ``finally`` so KeyboardInterrupt / SystemExit
        # are not silently discarded.
        return pm

    def _store_rank(self, kw, pm, desktop_platform, mobile_platform):
        """Save one rank under both the desktop and the mobile platform label."""
        record = {
            "平台": desktop_platform,
            'user': user,
            'time': int(time.time()),
            '关键词': kw,
            '排名': pm,
        }
        # Copy before saving so both documents start from the same clean
        # state. NOTE(review): pymongo's insert_one adds an ``_id`` to the
        # dict it is given; presumably mongoSession.insert_one does too -
        # confirm.
        mobile_record = copy.deepcopy(record)
        mobile_record["平台"] = mobile_platform
        self.save_mongodb(record)
        self.save_mongodb(mobile_record)

    def page(self, kw):
        """Return the rank of ``self.hm`` in the default ("top") search for ``kw``.

        Returns:
            The 1-based rank as a string, or ``'50+'`` when there is no
            match or the request / parsing raised an ``Exception``.
        """
        return self._search_rank(kw, 'top')

    def page_latest(self, kw):
        """Return the rank of ``self.hm`` in the "latest" (``f=live``) search for ``kw``.

        Returns:
            The 1-based rank as a string, or ``'50+'`` when there is no
            match or the request / parsing raised an ``Exception``.
        """
        return self._search_rank(kw, 'latest', {'f': 'live'})

    def save_mongodb(self, kw_dict):
        """Insert one rank record into the configured collection."""
        mongoSession.insert_one(db, collection, kw_dict)

    def find_key(self, kw):
        """Fetch stored 'twitter电脑端top' records for ``kw`` with ``t0 < time < t1``."""
        f_di = {'平台': 'twitter电脑端top', '关键词': kw,
                'time': {'$gt': t0, '$lt': t1}}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Look up and store the "top" rank for every keyword in ``kw_li``."""
        for kw in kw_li:
            self._store_rank(kw, self.page(kw),
                             'twitter电脑端top', 'twitter手机端top')
        print("twitter的top完成!")

    def run_latest(self, kw_li):
        """Look up and store the "latest" rank for every keyword in ``kw_li``."""
        for kw in kw_li:
            self._store_rank(kw, self.page_latest(kw),
                             'twitter电脑端latest', 'twitter手机端latest')
        print("twitter的latest完成!")

    def run_no(self, kw_li):
        """Store the ``'50+'`` placeholder for all Twitter/Facebook platforms.

        Per the original author's note, the historical rank on these
        platforms was always 50+, so no request is made. A keyword is
        skipped when ``find_key`` already returns records for it.
        """
        pt_li = ['twitter电脑端latest', 'twitter手机端latest',
                 'twitter电脑端top', 'twitter手机端top',
                 'facebook电脑端', 'facebook手机端']
        for kw in kw_li:
            if self.find_key(kw):
                continue
            for pt in pt_li:
                self.save_mongodb({
                    "平台": pt,
                    'user': user,
                    'time': int(time.time()),
                    '关键词': kw,
                    '排名': '50+',
                })
        print("twitter完成!")
        print("facebook完成!")
class Weixin():
    """Rank bookkeeping for WeChat and DingTalk via a remote RPA service.

    ``page`` asks the service (``run_url`` / ``check_url``) to search for a
    keyword and polls for the resulting index. ``run`` currently stores
    the ``'50+'`` placeholder for every configured platform without
    calling ``page``.
    """

    # Platform label -> (service "platform", service "module",
    # name of the attribute holding the text to verify).
    _TARGETS = {
        '微信手机端公众号': ('微信', '公众号', 'hm_wx'),
        '微信手机端文章': ('微信', '文章', 'hm_wx'),
        '钉钉手机端': ('钉钉', '企业广场/找企业', 'hm_dd'),
    }

    def __init__(self):
        self.run_url = 'http://sh.encootech.com:28888/huamei/run?timestamp={}&sign={}'
        self.check_url = 'http://sh.encootech.com:28888/huamei/case_result?timestamp={}&sign={}'
        site_cfg = kw_dict['sites']
        # Use .get so that configuring only some of the three platforms
        # does not raise KeyError here; a missing entry yields ''.
        self.hm_wx = site_cfg.get('微信手机端公众号', '').split(';')[0]
        self.hm_dd = site_cfg.get('钉钉手机端', '').split(';')[0]

    def md5_encrypt(self, string):
        """Return the hexadecimal MD5 digest of ``string`` encoded as UTF-8."""
        return hashlib.md5(string.encode(encoding='utf-8')).hexdigest()

    def _check_result(self, check_url, data):
        """POST ``data`` to ``check_url`` and return its ``data.result_index`` (may be falsy)."""
        r = requests.post(check_url, json=data)
        return r.json()['data'].get('result_index')

    def page(self, kw, wxdd, poll_interval=30, max_polls=None):
        """Obtain the rank for ``kw`` on the platform ``wxdd`` from the RPA service.

        Args:
            kw: Search keyword.
            wxdd: One of '微信手机端公众号', '微信手机端文章', '钉钉手机端'.
            poll_interval: Seconds to sleep before each poll.
            max_polls: Maximum number of polls after starting the job;
                ``None`` (default) polls until a result appears.

        Returns:
            The service's ``result_index`` once it is truthy, or ``'50+'``
            if ``max_polls`` polls passed without a result.

        Raises:
            ValueError: ``wxdd`` is not a known platform label.
        """
        try:
            site, kind, hm_attr = self._TARGETS[wxdd]
        except KeyError:
            raise ValueError("unknown platform: {!r}".format(wxdd)) from None
        hm = getattr(self, hm_attr)

        t = int(time.time())
        sign_md5 = self.md5_encrypt(str(t) + '云扩RPA连接一切YUNKUORPA8888QAZWSX')
        data = {
            "search_text": kw,
            "verify_text": hm,
            "platform": site,
            "module": kind,
        }
        check_url = self.check_url.format(t, sign_md5)
        run_url = self.run_url.format(t, sign_md5)

        # A result may already be available; hand it back instead of
        # returning None.
        res = self._check_result(check_url, data)
        if res:
            return res

        requests.post(run_url, json=data)
        polls = 0
        while max_polls is None or polls < max_polls:
            time.sleep(poll_interval)
            res = self._check_result(check_url, data)
            if res:
                return res
            polls += 1
        return '50+'

    def save_mongodb(self, kw_dict):
        """Insert one rank record into the configured collection."""
        mongoSession.insert_one(db, collection, kw_dict)

    def find_key(self, kw, site):
        """Fetch stored records for keyword ``kw`` on platform ``site`` (no time filter)."""
        f_di = {'平台': site, '关键词': kw}
        return mongoSession.find_all(db, collection, f_di)

    def run(self, kw_li):
        """Store the ``'50+'`` placeholder for each configured platform and keyword.

        A platform is processed only when it is a key of the module-level
        ``sites``; a keyword is skipped when ``find_key`` already returns
        records for it.
        """
        for wxdd in ('微信手机端公众号', '微信手机端文章', '钉钉手机端'):
            if wxdd not in sites.keys():
                continue
            for kw in kw_li:
                if self.find_key(kw, wxdd):
                    continue
                # The live lookup through self.page is disabled in the
                # original; the placeholder rank is stored instead.
                self.save_mongodb({
                    "平台": wxdd,
                    'user': user,
                    'time': int(time.time()),
                    '关键词': kw,
                    '排名': '50+',
                })
            # NOTE(review): indentation was lost in the source; this message
            # is assumed to be printed once per configured platform.
            print("{}完成!".format(wxdd))
def _export_rank_table(records):
    """Write rank records to the Excel result file and return its path.

    Builds a table indexed by '关键词' followed by every keyword in
    ``kw_li_all``, with one column per entry of ``sites_li``. The '关键词'
    row is filled with the current month-day, then each record's rank is
    placed at (keyword, platform). An existing file at the target path is
    removed before writing.
    """
    # Materialise once so len() works whatever iterable find_all returns.
    records = list(records)
    print('查询成功有数据,共{}条'.format(len(records)))

    now = time.localtime()
    t_day = time.strftime('%m%d', now)
    t_hour = time.strftime('%H%M', now)

    row_labels = ['关键词'] + list(kw_li_all)
    df = pd.DataFrame(index=row_labels, columns=sites_li)
    df.loc['关键词'] = t_day
    for one in records:
        df.loc[one['关键词'], one['平台']] = one['排名']

    file_path = path + '单天结果/result{}.xlsx'.format(t_day)
    print(file_path)
    if os.path.exists(file_path):
        os.remove(file_path)
    df.to_excel(file_path, sheet_name=t_hour)
    return file_path


def get_one():
    """Export the records whose ``time`` lies between ``t0`` and ``t1``.

    Returns:
        Path of the written Excel file.
    """
    fd = {'time': {'$gt': t0, '$lt': t1}}
    return _export_rank_table(mongoSession.find_all(db, collection, fd))


def get_all():
    """Export every record in the collection (no time filter).

    Returns:
        Path of the written Excel file.
    """
    return _export_rank_table(mongoSession.find_all(db, collection))
# data_li = []
# a1 = "2019-10-09 00:00:00"
# a2 = "2019-10-09 23:59:59"
# # 先转换为时间数组
# t0 = time.strptime(a1, "%Y-%m-%d %H:%M:%S")
# t1 = time.strptime(a2, "%Y-%m-%d %H:%M:%S")
# # 转换为时间戳
# t0 = int(time.mktime(t0))
# t1 = int(time.mktime(t1))
# for r in result:
# if r['time'] >= t0 and r['time'] <= t1:
# r.pop('_id')
# data_li.append(r)
# data_dict = result[0]
# data_dict.pop('_id')
# return data_li
# return result
def send_mail(file_all, t_all):
    """E-mail the result spreadsheet as an attachment.

    Args:
        file_all: Path of the file to attach; its base name is used as the
            attachment file name.
        t_all: Total elapsed time in seconds, reported in the mail body.

    Sending failures are printed, not raised. Failing to read
    ``file_all`` still raises, as before.
    """
    # Third-party SMTP service.
    # NOTE(review): the account and password are hard-coded in source;
    # consider moving them to configuration or the environment.
    mail_host = "smtp.qq.com"
    mail_user = "1216887433@qq.com"
    mail_pass = "rqfruulyrbenhhab"

    sender = '1216887433@qq.com'
    receivers = kw_dict['email']

    # Multipart message so that an attachment can be added.
    message = MIMEMultipart()
    message['From'] = Header("rpa机器人", 'utf-8')
    message['To'] = Header("华美", 'utf-8')
    message['Subject'] = Header('搜索监控', 'utf-8')
    message.attach(MIMEText(
        '搜索监控结果表格,总计耗时{:.0f}秒'.format(t_all), 'plain', 'utf-8'))

    # Read the attachment through a context manager so the handle is closed.
    with open(file_all, 'rb') as fh:
        payload = fh.read()
    att1 = MIMEText(payload, 'base64', 'utf-8')
    att1["Content-Type"] = 'application/octet-stream'
    # The filename given here is the name shown in the received mail.
    att1["Content-Disposition"] = 'attachment; filename="{}"'.format(
        os.path.basename(file_all))
    message.attach(att1)

    try:
        # The context manager closes the SMTP session even when login or
        # sending fails.
        with smtplib.SMTP_SSL(mail_host, 465) as smtp_obj:
            smtp_obj.login(mail_user, mail_pass)
            smtp_obj.sendmail(sender, receivers, message.as_string())
            print("邮件发送成功")
    except Exception as e:
        print("Error: 无法发送邮件")
        print(e)
def main():
    """Run every configured crawler in its own process, then mail the report.

    One worker process is started per crawler whose platform label(s)
    appear in the module-level ``sites``. After all workers finish, the
    result spreadsheet is built (full history when
    ``kw_dict['history'] == '是'``, otherwise the ``t0``-``t1`` window)
    and sent by e-mail together with the elapsed time.
    """
    global kw_li
    t_start = time.time()
    # NOTE(review): only the first two keywords are processed; this looks
    # like a debugging leftover - confirm before production use.
    kw_li = kw_li[:2]

    site_li = list(sites.keys())
    # (platform labels that enable the crawler, factory for its entry point).
    # Factories are lambdas so a crawler is only instantiated when needed.
    crawlers = [
        (('百度电脑端',), lambda: Baidu().run),
        (('百度手机端',), lambda: Baidu_APP().run),
        (('谷歌手机端', '谷歌电脑端'), lambda: Google().run),
        (('微博手机端', '微博电脑端'), lambda: Weibo().run),
        (('知乎手机端', '知乎电脑端'), lambda: Zhihu().run),
        (('今日头条电脑端',), lambda: Toutiao_PC().run),
        (('今日头条手机端综合',), lambda: Toutiao_APP().run),
        (('今日头条手机端资讯',), lambda: Toutiao_APP().run_info),
        (('领英手机端', '领英电脑端'), lambda: Linkedin_PC().run),
        (('twitter电脑端latest', 'twitter手机端latest',
          'twitter电脑端top', 'twitter手机端top',
          'facebook电脑端', 'facebook手机端'), lambda: Twitter().run_no),
        # One Weixin worker handles all three labels, so start it at most
        # once no matter how many of them are configured.
        (('微信手机端公众号', '微信手机端文章', '钉钉手机端'),
         lambda: Weixin().run),
    ]

    proce_li = []
    for labels, make_target in crawlers:
        if any(label in site_li for label in labels):
            p = multiprocessing.Process(target=make_target(), args=(kw_li,))
            p.start()
            proce_li.append(p)
    for p in proce_li:
        p.join()

    # Any value other than '是' falls back to the single-day report so
    # that file_path is always bound.
    if kw_dict['history'] == '是':
        file_path = get_all()
    else:
        file_path = get_one()

    t_all = int(time.time()) - t_start
    send_mail(file_path, t_all)


if __name__ == "__main__":
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/wumuchen/huamei.git
git@gitee.com:wumuchen/huamei.git
wumuchen
huamei
huamei
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385