
钱途 / 监控爬虫 (monitoring crawler)

tieba.py 2.99 KB
钱途 committed on 2020-08-13 14:52: update tieba.py
# -*- coding: utf-8 -*-
import re
import time
from time import strftime, localtime

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter


def baojing(wenben):
    """Send an alert message to the DingTalk webhook."""
    dingding = '钉钉报警接口'  # placeholder: the DingTalk alert webhook URL
    headers = {'Content-Type': 'application/json'}
    payload = {
        'msgtype': 'text',
        'text': {'content': wenben},
    }
    resp = requests.post(url=dingding, headers=headers, json=payload)
    print(resp)


def get_bs(link):
    """Fetch the Tieba search page and alert on result titles that hit risk keywords."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=3))   # retry up to 3 times
    s.mount('https://', HTTPAdapter(max_retries=3))
    response = s.get(link, headers=headers, timeout=150)
    soup = BeautifulSoup(response.text, 'lxml')
    # Log of alerts already sent ('已报警内容' = "alerted content"); the file must exist.
    f1 = open('已报警内容.txt', 'r+', encoding='utf-8')
    list1 = f1.readlines()  # readlines() already returns a list
    # Risk keywords: scam, refund, insider, fraud, 12315 (consumer hotline), complaint,
    # drugs, rip-off, tuition, fraud, insult, exposé (two spellings), ethnicity.
    keywords = ['骗人', '退款', '内幕', '欺', '12315', '投诉', '药', '坑',
                '学费', '欺诈', '辱', '曝光', '暴光', '民族']
    for page in soup:
        page = str(page)
        if 'content' not in page:
            continue
        # Cut the search-result block out of the page ('大家都在搜' = "everyone is searching").
        dat = page.split('page": "search')[1].split('大家都在搜')[0]
        for line in dat.split('span class="p_title'):
            if '石力派' in line and 'data-tid' in line:
                line1 = line.split('target="_blank">')[1].split('</div>')[0]
                line2 = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />', '', line1)  # strip HTML tags
                print(line2)
                if any(k in line2 for k in keywords):
                    # '贴吧舆论报警' = "Tieba public-opinion alert"
                    neir = '贴吧舆论报警:' + '\t' + line2 + '\n'
                    print(neir)
                    if neir in list1:  # note: logged entries keep the trailing newline
                        print('■■■■■■■■■■■■■已经报警过了')  # already alerted
                    else:
                        baojing('贴吧链接:' + '\t' + str(link))  # Tieba link
                        baojing(neir)
                        time.sleep(10)
                        f1.write(neir)  # appends: pointer is at EOF after readlines()
    f1.close()


# f = open('已报警内容.txt', 'w', encoding='utf-8')  # one-off: reset the alert log

# Search Tieba for the keyword 石力派 (URL-encoded as qw=%E7%9F%B3%E5%8A%9B%E6%B4%BE),
# rescan every 30 minutes; on any error, back off for 2 hours and retry.
while 1:
    url = 'https://tieba.baidu.com/f/search/res?ie=utf-8&qw=%E7%9F%B3%E5%8A%9B%E6%B4%BE'
    try:
        get_bs(url)
        print('贴吧舆情监控完毕...30分钟后进行下一轮扫描' + '\n')  # scan finished; next round in 30 min
        print(strftime('%Y-%m-%d %H:%M:%S', localtime()))
        time.sleep(1800)
    except Exception as e:
        print(e)
        time.sleep(7200)
        continue
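
The webhook URL in `baojing` is left as a placeholder ('钉钉报警接口'). Below is a minimal sketch of how the two external pieces could be wired up, assuming a standard DingTalk custom-robot webhook; `YOUR_ACCESS_TOKEN`, `webhook`, `keyword`, and `search_url` are hypothetical names introduced here, not part of the original file.

# --- Hedged configuration sketch (not part of the original script) ---
# Assumes a standard DingTalk custom-robot webhook; YOUR_ACCESS_TOKEN is a
# placeholder for the token shown when the robot is created.
from urllib.parse import quote

import requests

webhook = 'https://oapi.dingtalk.com/robot/send?access_token=YOUR_ACCESS_TOKEN'

# quote() reproduces the hard-coded percent-encoding above:
# quote('石力派') == '%E7%9F%B3%E5%8A%9B%E6%B4%BE', so the search URL
# generalizes to any keyword.
keyword = '石力派'
search_url = 'https://tieba.baidu.com/f/search/res?ie=utf-8&qw=' + quote(keyword)

# One test post to verify the webhook before starting the monitor loop.
# Depending on the robot's security settings, the text may need to contain
# a keyword configured on the robot, or a signed timestamp.
resp = requests.post(
    webhook,
    headers={'Content-Type': 'application/json'},
    json={'msgtype': 'text', 'text': {'content': '贴吧舆论报警: webhook test'}},
)
print(resp.status_code, resp.text)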
https://gitee.com/buau654/monitoring_crawlers.git
git@gitee.com:buau654/monitoring_crawlers.git