# weibo/微博爬取.py

import csv
import os
import requests
from jsonpath import jsonpath
import re


# Matches any HTML tag; compiled once here instead of once per card.
_TAG_RE = re.compile('<[^>]+>', re.S)


def _first(node, expr, default=None):
    '''
    Return the first JSONPath match of ``expr`` inside ``node``.

    The ``jsonpath`` helper yields a falsy result (not an indexable list)
    when nothing matches, so indexing it directly raises ``TypeError``.
    This wrapper returns ``default`` in that case.
    '''
    found = jsonpath(node, expr)
    return found[0] if found else default


def req_url(url, timeout=10, stop_marker='03-'):
    '''
    Fetch one page of the timeline and extract its posts.

    :param url: full request URL for the page
    :param timeout: seconds to wait for the server before giving up;
        without a timeout ``requests`` may block indefinitely
    :param stop_marker: substring of ``created_at`` that marks the first
        post the crawl should NOT include; a falsy value disables the check
    :return: ``(content_list, since_id)`` where
        content_list: list of dicts, one per post found on this page
        since_id: paging parameter for the next request, or ``None`` when
        the crawl should stop (stop marker hit, or the response carries
        no ``since_id``)
    :raises requests.HTTPError: when the server answers with an error status
    '''
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error body.
    response.raise_for_status()
    data = response.json()

    # Either field may be absent from the response (e.g. no further page).
    since_id = _first(data, '$..since_id')
    cards = _first(data, '$..cards', default=[])

    content_list = []
    for card in cards:
        # Only cards that carry an mblog entry are collected.
        if not jsonpath(card, '$..mblog'):
            continue

        created_at = _first(card, '$..created_at', default='')
        print('当前时间：' + str(created_at))
        if stop_marker and stop_marker in str(created_at):
            # Stop marker reached: return what was gathered so far and
            # signal the caller to stop paging.
            return content_list, None

        # Strip HTML tags and non-breaking spaces from the post body.
        raw_text = _first(card, '$..text', default='')
        clean_text = _TAG_RE.sub('', raw_text).replace('\xa0', '')

        content_list.append({
            'text': clean_text,
            'scheme': _first(card, '$..scheme'),                    # link to the post
            'time': created_at,                                     # time the post was sent
            'attitudes_count': _first(card, '$..attitudes_count'),  # likes
            'comments_count': _first(card, '$..comments_count'),    # comments
            'reposts_count': _first(card, '$..reposts_count'),      # reposts
            'source': _first(card, '$..source'),                    # client the post was sent from
            'screen_name': _first(card, '$..screen_name'),          # author name
            'follow_count': _first(card, '$..follow_count'),        # accounts followed
            'followers_count': _first(card, '$..followers_count'),  # followers
        })

    return content_list, since_id


def downloads_all():
    '''
    Walk the timeline page by page and gather every post.

    Each page is fetched with ``req_url``; the ``since_id`` it returns is
    appended to the base URL for the following request. Paging ends as soon
    as ``req_url`` returns a falsy ``since_id``.

    :return: list of post dicts collected from all pages
    '''
    base_url = 'https://m.weibo.cn/api/container/getIndex?uid=2183473425&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E8%81%94%E6%83%B3%E4%B8%AD%E5%9B%BD&type=uid&value=2183473425&containerid=1076032183473425'
    collected = []
    cursor = ''
    while True:
        # The first request carries no since_id; later ones append it.
        if cursor == '':
            page_url = base_url
        else:
            page_url = base_url + '&since_id=' + str(cursor)

        page_items, next_cursor = req_url(page_url)
        collected.extend(page_items)
        if not next_cursor:
            break
        cursor = next_cursor

    return collected


def save(item, csv_path='./微博1.0.csv', encoding='gbk'):
    '''
    Append one record to the CSV file, writing the header row first when
    the file does not exist yet or is empty.

    Characters that the target encoding cannot represent (emoji, for
    example, are outside GBK) are written as ``?`` instead of raising, so a
    single unencodable character no longer causes the whole row to be
    dropped.

    Failures are reported on stdout and not re-raised, so one bad record
    does not abort the caller's loop.

    :param item: dict whose keys are a subset of the CSV header names
    :param csv_path: destination file
    :param encoding: text encoding of the destination file
    :return: None
    '''
    csv_headers = ['text', 'scheme', 'time', 'attitudes_count', 'comments_count',
                   'reposts_count', 'source', 'screen_name', 'follow_count',
                   'followers_count']  # header row
    try:
        # Decide before opening: append mode creates the file if missing.
        need_header = (not os.path.exists(csv_path)
                       or os.path.getsize(csv_path) == 0)
        # newline='' lets the csv module control line endings (no blank rows).
        with open(csv_path, 'a', newline='', encoding=encoding,
                  errors='replace') as f:
            writer = csv.DictWriter(f, fieldnames=csv_headers)
            if need_header:
                writer.writeheader()
            writer.writerow(item)
        print("^_^ 写入成功！！")
    except Exception as e:
        # Deliberate best-effort: report and carry on with the next record.
        print(e)
        print('^~^写入失败!!!')

if __name__ == '__main__':
    # Crawl every page first, then write the records out one row at a time.
    for record in downloads_all():
        save(record)