
钱途 / 监控爬虫 (monitoring crawler)

tieba.py 2.99 KB
钱途 committed on 2020-08-13 14:52: update tieba.py
# -*- coding: utf-8 -*-
import re
import time
from time import strftime, localtime

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter


def baojing(wenben):
    """Send an alert message to the DingTalk webhook."""
    dingding = '钉钉报警接口'  # placeholder: the DingTalk alert webhook URL
    headers = {'Content-Type': 'application/json'}
    payload = {
        'msgtype': 'text',
        'text': {'content': wenben},
    }
    resp = requests.post(url=dingding, headers=headers, json=payload)
    print(resp)


def get_bs(link):
    """Fetch the Tieba search page and alert on result titles that hit risk keywords."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
    }
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=3))   # retry up to 3 times
    s.mount('https://', HTTPAdapter(max_retries=3))
    response = s.get(link, headers=headers, timeout=150)
    soup = BeautifulSoup(response.text, 'lxml')
    # Log of alerts already sent ('已报警内容' = "alerted content"); the file must exist.
    f1 = open('已报警内容.txt', 'r+', encoding='utf-8')
    list1 = f1.readlines()  # readlines() already returns a list
    # Risk keywords: scam, refund, insider, fraud, 12315 (consumer hotline), complaint,
    # drugs, rip-off, tuition, fraud, insult, exposé (two spellings), ethnicity.
    keywords = ['骗人', '退款', '内幕', '欺', '12315', '投诉', '药', '坑',
                '学费', '欺诈', '辱', '曝光', '暴光', '民族']
    for page in soup:
        page = str(page)
        if 'content' not in page:
            continue
        # Cut the search-result block out of the page ('大家都在搜' = "everyone is searching").
        dat = page.split('page": "search')[1].split('大家都在搜')[0]
        for line in dat.split('span class="p_title'):
            if '石力派' in line and 'data-tid' in line:
                line1 = line.split('target="_blank">')[1].split('</div>')[0]
                line2 = re.sub(r'<(\S*?)[^>]*>.*?|<.*? />', '', line1)  # strip HTML tags
                print(line2)
                if any(k in line2 for k in keywords):
                    # '贴吧舆论报警' = "Tieba public-opinion alert"
                    neir = '贴吧舆论报警:' + '\t' + line2 + '\n'
                    print(neir)
                    if neir in list1:  # note: logged entries keep the trailing newline
                        print('■■■■■■■■■■■■■已经报警过了')  # already alerted
                    else:
                        baojing('贴吧链接:' + '\t' + str(link))  # Tieba link
                        baojing(neir)
                        time.sleep(10)
                        f1.write(neir)  # appends: pointer is at EOF after readlines()
    f1.close()


# f = open('已报警内容.txt', 'w', encoding='utf-8')  # one-off: reset the alert log

# Search Tieba for the keyword 石力派 (URL-encoded as qw=%E7%9F%B3%E5%8A%9B%E6%B4%BE),
# rescan every 30 minutes; on any error, back off for 2 hours and retry.
while 1:
    url = 'https://tieba.baidu.com/f/search/res?ie=utf-8&qw=%E7%9F%B3%E5%8A%9B%E6%B4%BE'
    try:
        get_bs(url)
        print('贴吧舆情监控完毕...30分钟后进行下一轮扫描' + '\n')  # scan finished; next round in 30 min
        print(strftime('%Y-%m-%d %H:%M:%S', localtime()))
        time.sleep(1800)
    except Exception as e:
        print(e)
        time.sleep(7200)
        continue
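
The webhook URL in `baojing` is left as a placeholder ('钉钉报警接口'). Below is a minimal sketch of how the two external pieces could be wired up, assuming a standard DingTalk custom-robot webhook; `YOUR_ACCESS_TOKEN`, `webhook`, `keyword`, and `search_url` are hypothetical names introduced here, not part of the original file.

# --- Hedged configuration sketch (not part of the original script) ---
# Assumes a standard DingTalk custom-robot webhook; YOUR_ACCESS_TOKEN is a
# placeholder for the token shown when the robot is created.
from urllib.parse import quote

import requests

webhook = 'https://oapi.dingtalk.com/robot/send?access_token=YOUR_ACCESS_TOKEN'

# quote() reproduces the hard-coded percent-encoding above:
# quote('石力派') == '%E7%9F%B3%E5%8A%9B%E6%B4%BE', so the search URL
# generalizes to any keyword.
keyword = '石力派'
search_url = 'https://tieba.baidu.com/f/search/res?ie=utf-8&qw=' + quote(keyword)

# One test post to verify the webhook before starting the monitor loop.
# Depending on the robot's security settings, the text may need to contain
# a keyword configured on the robot, or a signed timestamp.
resp = requests.post(
    webhook,
    headers={'Content-Type': 'application/json'},
    json={'msgtype': 'text', 'text': {'content': '贴吧舆论报警: webhook test'}},
)
print(resp.status_code, resp.text)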
https://gitee.com/buau654/monitoring_crawlers.git
git@gitee.com:buau654/monitoring_crawlers.git