# Source: python_papapa/tieba.py (branch: master)

import requests
import time
from bs4 import BeautifulSoup


def get_content(url):
    """Download one Tieba thread-list page and extract its threads.

    Args:
        url: Address of the thread-list page to fetch.

    Returns:
        A list of dicts, one per thread that parsed completely, with the
        keys ``title``, ``link``, ``name``, ``time`` and ``replyNum``
        (all stripped strings; ``link`` is an absolute URL). Threads that
        lack any of the expected elements are reported and skipped.

    Raises:
        requests.RequestException: If the page cannot be downloaded,
            including when the server does not respond within the timeout.
    """
    from urllib.parse import urljoin

    comments = []
    # A timeout keeps a stalled connection from blocking the crawl forever.
    html = requests.get(url, timeout=10)

    soup = BeautifulSoup(html.text, 'lxml')

    # Every thread is rendered as an <li>; select them by CSS class.
    liTags = soup.find_all('li', attrs={"class": ['j_thread_list', 'clearfix']})

    for li in liTags:
        try:
            # Look the title anchor up once; it supplies both title and link.
            title_tag = li.find('a', attrs={"class": ['j_th_tit']})
            comment = {
                'title': title_tag.text.strip(),
                # urljoin copes with both root-relative and absolute hrefs
                # and yields a complete URL including the scheme.
                'link': urljoin('https://tieba.baidu.com/', title_tag['href']),
                'name': li.find('span', attrs={"class": ['tb_icon_author']}).text.strip(),
                'time': li.find('span', attrs={"class": ['pull-right is_show_create_time']}).text.strip(),
                'replyNum': li.find('span', attrs={"class": ['threadlist_rep_num center_text']}).text.strip(),
            }
        except (AttributeError, KeyError, TypeError):
            # find() returned None or the anchor has no href: this entry is
            # not a regular thread, so skip it and keep going.
            print('出了点小问题')
            continue
        comments.append(comment)

    return comments


def Out2File(comments):
    """Append the scraped threads to ``TTBT.txt`` in the current directory.

    Args:
        comments: Iterable of dicts holding the keys ``title``, ``link``,
            ``name``, ``time`` and ``replyNum``; each one becomes a single
            tab-separated line in the output file.
    """
    line_template = '标题：{} \t 链接：{} \t 发帖人：{} \t 发帖时间：{} \t 回复数量：{} \n'
    field_order = ('title', 'link', 'name', 'time', 'replyNum')

    with open('TTBT.txt', 'a+', encoding='utf-8') as out_file:
        for entry in comments:
            values = [entry[key] for key in field_order]
            out_file.write(line_template.format(*values))
        # Reported once per call, after every entry has been written.
        print('当前页面爬取完成')


def main(base_url, deep):
    """Crawl ``deep`` consecutive listing pages and save their threads.

    Args:
        base_url: Listing URL to which the ``&pn=`` paging parameter is
            appended.
        deep: Number of pages to crawl; page ``i`` uses the offset ``50 * i``.
    """
    # Build every page URL up front, then process them one at a time.
    page_urls = [base_url + '&pn=' + str(50 * page) for page in range(deep)]

    for page_url in page_urls:
        print(f"开始爬取：{page_url}")
        threads = get_content(page_url)
        print(threads)
        Out2File(threads)
        # Pause between requests.
        time.sleep(5)

    print('所有的信息都已经保存完毕！')


# Listing URL of the forum to crawl; main() appends the paging parameter.
base_url = "https://tieba.baidu.com/f?ie=utf-8&kw=亚运会"
# Number of listing pages to crawl.
deep = 3

if __name__ == "__main__":
    main(base_url=base_url, deep=deep)