代码拉取完成,页面将自动刷新
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import requests
import datetime
import json
import time
import sys
def getCode(driver):
    """Open the Qzone login page and save a screenshot of it.

    Prints a timestamped start message, loads ``http://qzone.qq.com``, sleeps
    two seconds, and writes a screenshot to ``./登录页.png`` (the success
    message asks the user to scan the code shown on that page).

    If loading or the screenshot fails, the browser is closed with
    ``driver.quit()``, the error and a failure message are printed, and the
    function returns normally. The error is not re-raised.
    ``KeyboardInterrupt`` / ``SystemExit`` also close the browser but are
    re-raised so that Ctrl-C really stops the program.

    Args:
        driver: Selenium WebDriver-like object providing ``get``,
            ``save_screenshot`` and ``quit``.

    Returns:
        None, on success and on handled failure alike.
    """
    # time.strftime() without a time tuple formats the current local time.
    print("打开浏览器访问页面" + time.strftime('%Y-%m-%d %H:%M'))
    try:
        driver.get('http://qzone.qq.com')
        time.sleep(2)
        driver.save_screenshot('./登录页.png')
    except Exception as exc:
        # Best-effort start-up: report the problem and shut the browser down
        # instead of crashing the caller.
        driver.quit()
        print(exc)
        print("一开始就出错,浏览器已退出")
    except BaseException:
        # Interrupts / interpreter exit: still release the browser, but do
        # not pretend it was a page-load error.
        driver.quit()
        raise
    else:
        print("浏览器启动并访问登录页成功,请速速扫码")
def getBlogs(driver, attr):
    """Collect the raw HTML of feed items from each configured Qzone page.

    For every entry of ``attr['qzoneURLs']`` the page ``entry[0]`` is opened,
    the ``app_canvas_frame`` frame is entered and the ``<li>`` children of the
    ``mod_feed_lst js_error_display`` list are read. At most the first 10
    items of each page are kept.

    A page that raises while loading is skipped. Failures are counted over
    the whole run (not per page): the third failure is taken to mean the
    session is not logged in, so a screenshot is saved to ``./ym.png`` and
    ``False`` is returned, discarding anything collected so far.
    ``KeyboardInterrupt`` / ``SystemExit`` are not treated as page failures
    and propagate to the caller.

    Args:
        driver: Selenium WebDriver used to load the pages.
        attr: Mapping with a ``'qzoneURLs'`` sequence. Each entry is indexed
            as ``[0]`` (page URL), ``[1]`` and ``[2]``; the latter two are
            passed through unchanged (presumably the owner's id and display
            name -- confirm against the caller).

    Returns:
        A list of ``[inner_html, entry[1], entry[2]]`` lists, or ``False``
        when the failure limit is hit.
    """
    lis = []
    failures = 0  # failed page visits so far; used to guess "QR code not scanned"
    for item in attr['qzoneURLs']:
        print(item[0])
        try:
            driver.get(item[0])
            time.sleep(2)
            small_frame = driver.find_element(By.CLASS_NAME, "app_canvas_frame")
            driver.switch_to.frame(small_frame)
            time.sleep(1)
            feed_items = driver.find_elements(
                By.XPATH, '//ol[@class="mod_feed_lst js_error_display"]/li'
            )
        except Exception:
            if failures >= 2:
                print("应该是没登录,截张图,返回False")
                driver.save_screenshot('./ym.png')
                return False
            failures += 1
            print("访问出错跳过")
            continue
        # Only the first 10 feed items of each page are kept.
        for li in feed_items[:10]:
            lis.append([li.get_attribute("innerHTML"), item[1], item[2]])
    return lis
def _compact_json(value):
    """Serialize *value* as JSON without spaces and without ASCII-escaping."""
    return json.dumps(value, ensure_ascii=False, separators=(",", ":"))


def _normalize_blog_time(raw, now):
    """Convert a feed timestamp into a ``YYYY-MM-DD HH:MM:SS`` string.

    A leading ``编`` drops the first four characters (presumably an
    "edited at" prefix -- confirm against real page data). After that, a
    leading ``前`` means two days before *now*, a leading ``昨`` one day
    before *now* (two characters are dropped in both cases); anything else is
    taken as a clock time of today.

    Raises:
        ValueError: if the remaining text is not an ``HH:MM`` clock time,
            for example when the page shows an absolute date.
    """
    if raw.startswith('编'):
        raw = raw[4:]
    if raw.startswith('前'):
        day, clock = now - datetime.timedelta(days=2), raw[2:]
    elif raw.startswith('昨'):
        day, clock = now - datetime.timedelta(days=1), raw[2:]
    else:
        day, clock = now, raw
    stamp = day.strftime('%Y年%m月%d日 ') + clock
    return str(datetime.datetime.strptime(stamp, '%Y年%m月%d日 %H:%M'))


def _build_blog(soup, userid, username, now):
    """Build one blog dict from a parsed feed item.

    Raises:
        LookupError: if the content block, the detail link or an ``href``
            attribute is missing from the item.
        ValueError: if the timestamp cannot be normalized.
    """
    content = soup.select("pre.content")[0].text
    detail = soup.select("a.c_tx.c_tx3.goDetail")[0]
    link = detail.attrs["href"]
    image_urls = [
        anchor.attrs["href"]
        for anchor in soup.select(".img-attachments-inner.clearfix a")
    ]
    # Key order is kept as-is because the dict is later serialized verbatim.
    return {
        "blogTittle": content.split("\n")[0][:45],
        "blogContent": content,
        "blogUserid": userid,
        "blogUsername": username,
        "blogTime": _normalize_blog_time(detail.attrs["title"], now),
        # Special label for user names containing "茉"; everyone else is "xy".
        "blogLabel": "yl" if "茉" in username else "xy",
        # A post without pictures has always been sent as [""] rather than
        # []; keep that so the receiving side sees the same payload.
        "blogData": _compact_json(image_urls or [""]),
        "blogType": "pc",
        # Links of 10 characters or fewer are treated as "no link".
        "blogInfo": _compact_json({"link": link}) if len(link) > 10 else "",
    }


def handleBlogs(lis):
    """Parse scraped feed items into blog dicts ready to be posted.

    Items containing ``div.box.bgr3 div.rt_content`` (forwarded posts) are
    dropped. Every remaining item becomes a dict with the keys
    ``blogTittle`` (first content line, at most 45 characters),
    ``blogContent``, ``blogUserid``, ``blogUsername``, ``blogTime``
    (``YYYY-MM-DD HH:MM:SS``), ``blogLabel``, ``blogData`` (JSON array string
    of image links), ``blogType`` and ``blogInfo`` (JSON object string with
    the post link, or ``""``).

    An item that lacks an expected element or attribute, or whose timestamp
    is not a today / yesterday / day-before-yesterday clock time, is reported
    on stdout and skipped; it does not abort the batch.

    Args:
        lis: Iterable of ``[inner_html, userid, username]`` sequences.

    Returns:
        List of blog dicts, in input order.
    """
    bloglist = []
    for li in lis:
        soup = BeautifulSoup(li[0], 'lxml')
        # Filter out forwarded posts.
        if soup.select("div.box.bgr3 div.rt_content"):
            continue
        userid, username = li[1], li[2]
        try:
            blog = _build_blog(soup, userid, username, datetime.datetime.now())
        except (LookupError, ValueError) as e:
            print(e)
            print("出错跳过----")
            continue
        bloglist.append(blog)
    return bloglist
def sendBlogs(bloglist, url="https://sicau.xyz:8080/release_pc", timeout=10):
    """POST every blog that passes the advert filter to the release endpoint.

    A blog is filtered out when its content contains ``饿了么``, or contains
    ``领`` together with any of ``红包``, ``现金`` or ``支付宝``. Filtered
    blogs are reported with a ``-1`` prefix and are not sent.

    Each remaining blog is sent as the form field ``blogJson`` (the dict
    serialized with ``json.dumps(..., ensure_ascii=False)``), and the
    response text is printed. A request that fails or times out is reported
    and the loop moves on to the next blog.

    Args:
        bloglist: Iterable of blog dicts with at least ``blogContent``,
            ``blogTittle`` and ``blogUsername``.
        url: Endpoint to post to.
        timeout: Seconds to wait for the server before giving up on one post.

    Returns:
        None.
    """
    coupon_words = ('红包', '现金', '支付宝')

    def filterBlogs(blogContent):
        # True means "looks legitimate, send it".
        if '饿了么' in blogContent:
            return False
        if '领' in blogContent and any(word in blogContent for word in coupon_words):
            return False
        return True

    for blog in bloglist:
        summary = blog["blogTittle"][:10] + " <= " + blog["blogUsername"]
        if not filterBlogs(blog['blogContent']):
            print("-1 >>> " + summary)
            continue
        try:
            # Without a timeout, requests waits indefinitely on a stalled server.
            res = requests.post(
                url=url,
                data={"blogJson": json.dumps(blog, ensure_ascii=False)},
                timeout=timeout,
            )
        except requests.RequestException as exc:
            print("请求失败 " + str(exc) + " >>> " + summary)
            continue
        print(res.text + " >>> " + summary)
    print("请求发送完毕")
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。