# -*- coding: utf-8 -*-
# Reference endpoints:
#   desktop: https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4403396993644448&is_show_bulletin=2&is_mix=0&max_id=139524556292235&count=20&uid=2656274875
#   mobile:  https://m.weibo.cn/comments/hotflow?id=4404093797907178&mid=4404093797907178&max_id_type=0
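#
# The mobile hotflow endpoint returns JSON shaped roughly like the sketch
# below. This shape is inferred from the parsing code in this file, not from
# any official documentation, so treat the exact field list as an assumption:
#
#   {
#     "ok": 1,
#     "data": {
#       "data": [                     # list of comment objects
#         {"id": ..., "text": "...", "created_at": "...",
#          "like_count": ..., "total_number": ...,
#          "user": {"screen_name": "...", ...}}
#       ],
#       "max_id": ...,                # cursor for the next page; 0 = last page
#       "max_id_type": ...
#     }
#   }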
# ---------------------------------------------------------------------------
# Version 1 (kept for reference, commented out): pages through a post's
# comments via the desktop buildComments endpoint and saves them with pandas.
# ---------------------------------------------------------------------------
# import os
# import pandas as pd
# import requests
# from bs4 import BeautifulSoup
#
#
# def fetchUrl(pid, uid, max_id):
#     # Desktop buildComments endpoint; the query parameters mirror the
#     # reference URL at the top of this file
#     url = "https://weibo.com/ajax/statuses/buildComments"
#     # Request headers
#     headers = {
#         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47"
#     }
#     # Query parameters
#     params = {
#         "is_reload": 1,
#         "id": pid,
#         "is_show_bulletin": 2,
#         "is_mix": 0,
#         "max_id": max_id,
#         "count": 20,
#         "uid": uid,
#     }
#
#     r = requests.get(url, headers=headers, params=params)
#
#     return r.json()
#
#
# def parseJson(jsonObj):
#     data = jsonObj["data"]
#     max_id = jsonObj["max_id"]
#
#     commentData = []
#     for item in data:
#         # comment id
#         comment_Id = item["id"]
#         # comment text, with HTML tags stripped
#         content = BeautifulSoup(item["text"], "html.parser").text
#         # time the comment was posted
#         created_at = item["created_at"]
#         # number of likes
#         like_counts = item["like_counts"]
#         # number of replies
#         total_number = item["total_number"]
#
#         # commenter's id, name, and city
#         user = item["user"]
#         userID = user["id"]
#         userName = user["name"]
#         userCity = user["location"]
#
#         dataItem = [comment_Id, created_at, userID, userName, userCity, like_counts, total_number, content]
#         print(dataItem)
#         commentData.append(dataItem)
#
#     return commentData, max_id
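#
# # Note (inferred from this code, not from official API docs): the desktop
# # buildComments response carries "data" (the comment list) and "max_id" at
# # the top level, which is why parseJson indexes them directly; the mobile
# # hotflow endpoint used by the two versions further down nests both under
# # an outer "data" object.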
#
#
# def save_data(data, path, filename):
#     if not os.path.exists(path):
#         os.makedirs(path)
#
#     dataframe = pd.DataFrame(data)
#     dataframe.to_csv(path + filename, encoding='utf_8_sig', mode='a', index=False, sep=',', header=False)
#
#
# if __name__ == "__main__":
#
#     pid = 4717939545342043        # weibo post id (fixed)
#     uid = 6512991534              # post author's user id (fixed)
#     max_id = 0
#     path = "G:/py/"               # output directory
#     filename = "comments.csv"     # output file name
#
#     csvHeader = [["comment_id", "created_at", "user_id", "user_name", "user_city", "like_counts", "reply_counts", "content"]]
#     save_data(csvHeader, path, filename)
#
#     while True:
#         res = fetchUrl(pid, uid, max_id)
#         comments, max_id = parseJson(res)
#         save_data(comments, path, filename)
#         # max_id == 0 marks the last page of comments
#         if max_id == 0:
#             break
# ---------------------------------------------------------------------------
# Version 2 (kept for reference, commented out): crawls first-level comments
# and their second-level replies via the mobile hotflow / hotFlowChild
# endpoints.
# ---------------------------------------------------------------------------
# @time: 2021/5/11 19:00
# @Author: 韩国麦当劳
# @Environment: Python 3.7
# @file: 微博评论.py
# import json
# import csv
# import requests
# import time
#
#
# # Fetch the raw response body of a URL (shared by both comment levels)
# def get_html(url):
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
#         "Referer": "https://m.weibo.cn"
#     }
#     cookies = {
#         "cookie": "SUB=_2A25PhhIADeRhGeBO61IT9CnEyD2IHXVsiL5IrDV6PUJbktANLRbYkW1NSkBchgauJa4UR3g6budAM0kdkvEaRfTk; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh.OcAnJyRs55-Kox512Vig5NHD95Qceh57eoBN1hepWs4DqcjZCJ8_dPLz9g4EwBtt; _T_WM=57365897740; WEIBOCN_FROM=1110006030; MLOGIN=1; XSRF-TOKEN=329971; mweibo_short_token=c12fc604a4; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D38%2526q%253D%25E5%258F%25B0%25E9%25A3%258E%25E5%2588%25A9%25E5%25A5%2587%25E9%25A9%25AC%2526t%253D0%26uicode%3D10000011%26fid%3D231522type%253D1%2526t%253D10%2526q%253D%2523%25E5%258F%25B0%25E9%25A3%258E%25E5%2588%25A9%25E5%25A5%2587%25E9%25A9%25AC%2523%26oid%3D4403396993644448"
#     }
#     response = requests.get(url, headers=headers, cookies=cookies)
#     response.encoding = response.apparent_encoding
#     time.sleep(3)  # 3-second delay between requests to avoid anti-crawling measures
#     print(response.text)
#     return response.text
#
#
# # Strip HTML tags from a comment string, keeping only the text between them
# def get_string(text):
#     t = ''
#     flag = 1
#     for i in text:
#         if i == '<':
#             flag = 0
#         elif i == '>':
#             flag = 1
#         elif flag == 1:
#             t += i
#     return t
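#
# # Note: this hand-rolled tag stripper does the same job as the
# # BeautifulSoup(text, "html.parser").text call used by the other two
# # versions in this file, e.g.:
# #   get_string('<a href="/n/u">@u</a> nice') -> '@u nice'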
#
#
# # Strip tags and append one comment per row to data.csv
# def save_text_data(text_data):
#     text_data = get_string(text_data)
#     with open("data.csv", "a", encoding="utf-8", newline="") as fi:
#         writer = csv.writer(fi)
#         writer.writerow([text_data])
#
#
# # Fetch all second-level (reply) comments under the comment with id `cid`
# def get_second_comments(cid):
#     max_id = 0
#     max_id_type = 0
#     url = 'https://m.weibo.cn/comments/hotFlowChild?cid={}&max_id={}&max_id_type={}'
#     while True:
#         response = get_html(url.format(cid, max_id, max_id_type))
#         content = json.loads(response)
#         comments = content['data']
#         for i in comments:
#             text_data = i['text']
#             save_text_data(text_data)
#         max_id = content['max_id']
#         max_id_type = content['max_id_type']
#         if max_id == 0:  # max_id == 0 means the last page has been fetched
#             break
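#
# # Both endpoints page with a max_id cursor: start at max_id=0, read the
# # next cursor from each response, and stop when it comes back as 0.
# # A minimal sketch of that contract, assuming hypothetical fetch()/handle()
# # helpers for one page of parsed JSON:
# #
# #   max_id = 0
# #   while True:
# #       page = fetch(max_id)
# #       handle(page['data'])      # process one page of comments
# #       max_id = page['max_id']
# #       if max_id == 0:           # cursor exhausted: last page reached
# #           break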
#
#
# # Fetch all first-level comments of the post with id `mid`
# def get_first_comments(mid):
#     max_id = 0
#     max_id_type = 0
#     url = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}'
#     while True:
#         response = get_html(url.format(mid, mid, max_id, max_id_type))
#         print(response)
#         content = json.loads(response)
#         max_id = content['data']['max_id']
#         max_id_type = content['data']['max_id_type']
#         text_list = content['data']['data']
#         for text in text_list:
#             text_data = text['text']
#             total_number = text['total_number']
#             if int(total_number) != 0:  # the comment has replies, so fetch them too
#                 get_second_comments(text['id'])
#             save_text_data(text_data)
#         if int(max_id) == 0:  # max_id == 0 means the last page has been fetched
#             break
#
#
# if __name__ == '__main__':
#     mids = ["4404093797907178"]   # post ids to crawl
#     for mid in mids:
#         get_first_comments(mid)   # crawl first-level comments (and replies)
# ---------------------------------------------------------------------------
# Version 3 (active): crawls the first-level comments of one post page by
# page and writes them to weiboComments1.csv.
# ---------------------------------------------------------------------------
import requests
import time
import os
import csv
from bs4 import BeautifulSoup
# Base URL of the hot-comment feed to crawl (max_id and max_id_type are
# supplied per request in get_page)
url = 'https://m.weibo.cn/comments/hotflow?id=4404093797907178&mid=4404093797907178'
headers = {
    'Cookie': 'SUB=_2A25PhhIADeRhGeBO61IT9CnEyD2IHXVsiL5IrDV6PUJbktANLRbYkW1NSkBchgauJa4UR3g6budAM0kdkvEaRfTk; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh.OcAnJyRs55-Kox512Vig5NHD95Qceh57eoBN1hepWs4DqcjZCJ8_dPLz9g4EwBtt; _T_WM=57365897740; WEIBOCN_FROM=1110006030; MLOGIN=1; M_WEIBOCN_PARAMS=oid%3D4403396993644448%26luicode%3D10000011%26lfid%3D231522type%253D1%2526t%253D10%2526q%253D%2523%25E5%258F%25B0%25E9%25A3%258E%25E5%2588%25A9%25E5%25A5%2587%25E9%25A9%25AC%2523; XSRF-TOKEN=7c791f',
    'Referer': 'https://m.weibo.cn/detail/4404093797907178',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47',
    'X-Requested-With': 'XMLHttpRequest'
}
# Fetch one page of comments; returns the parsed JSON, or None on failure
def get_page(max_id, id_type):
    params = {
        'max_id': max_id,
        'max_id_type': id_type
    }
    try:
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            return r.json()
    except requests.ConnectionError as e:
        print('error', e.args)
    return None
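
# The first page is requested with both cursors at zero (the same values the
# loop at the bottom of this file starts from), e.g.:
#
#   first_page = get_page(0, 0)
#   cursors = parse_page(first_page)   # -> {'max_id': ..., 'max_id_type': ...}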
# Pull the pagination cursors for the next request out of a response
def parse_page(jsondata):
    if jsondata:
        items = jsondata.get('data')
        item_max_id = {}
        item_max_id['max_id'] = items['max_id']
        item_max_id['max_id_type'] = items['max_id_type']
        return item_max_id
    return None
# Extract the interesting fields of every comment on a page and append
# them to the CSV
def write_csv(jsondata):
    datas = jsondata.get('data').get('data')
    for data in datas:
        created_at = data.get("created_at")
        like_count = data.get("like_count")
        source = data.get("source")
        floor_number = data.get("floor_number")
        username = data.get("user").get("screen_name")
        # strip HTML tags from the comment body ('lxml' requires the lxml
        # package; "html.parser" is a dependency-free alternative)
        comment = BeautifulSoup(data.get("text"), 'lxml').get_text()
        writer.writerow([username, created_at, like_count, floor_number, source, comment])
# Save as CSV (utf-8-sig so Excel detects the encoding; newline='' is
# required by the csv module to avoid blank rows on Windows)
path = os.getcwd() + "/weiboComments1.csv"
csvfile = open(path, 'w', encoding='utf-8-sig', newline='')
writer = csv.writer(csvfile)
writer.writerow(['Username', 'Time', 'Like_count', 'Floor_number', 'Source', 'Comments'])
maxpage = 50  # maximum number of pages to crawl
m_id = 0
id_type = 0
for page in range(0, maxpage):
    print(page)
    jsondata = get_page(m_id, id_type)
    if not jsondata or not jsondata.get('data'):
        break  # request failed or the response carries no data
    write_csv(jsondata)
    results = parse_page(jsondata)
    time.sleep(1)  # short delay between pages to avoid anti-crawling measures
    m_id = results['max_id']
    id_type = results['max_id_type']
    if m_id == 0:  # cursor exhausted: the last page has been fetched
        break
csvfile.close()
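
# A quick way to sanity-check the output afterwards; a sketch assuming pandas
# is installed (only the commented-out version 1 above uses it, so it is an
# extra dependency here):
#
#   import pandas as pd
#   df = pd.read_csv("weiboComments1.csv")
#   print(df.shape)    # one row per comment, 6 columns
#   print(df.head())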