# 代码拉取完成,页面将自动刷新
import csv
import re
import time

import requests
from lxml import etree
# Scrape the feedback pages of one Meituan shop and save the reviews whose
# rating check fails (see third_star_is_full) to a CSV file.

# --- Configuration ----------------------------------------------------------

# Shop detail page URL.
url = "https://meishi.meituan.com/i/poi/160270566"
# Rating image URL used to decide whether a review counts as full marks
# (compared against a single rating image of the review).
npg = "https://p0.meituan.net/travelcube/da7f909243236acd7452e12f844702de626.png"

OUTPUT_PATH = "美团外卖.csv"
NO_COMMENT_TEXT = "用户未评价"  # placeholder stored when a review has no text
REQUEST_TIMEOUT = 10  # seconds; requests waits indefinitely without a timeout
PAGE_DELAY = 0.8  # seconds to pause between two feedback pages

# The original script requested exactly one feedback page (page 8) because its
# loop ended with an unconditional ``break``. Those defaults are preserved.
# Set MAX_PAGES to None to keep following pages until the pager reports the
# last one.
START_PAGE = 8
MAX_PAGES = 1

# NOTE(security): the Cookie below is a hard-coded logged-in session (it
# carries token/userId values). It should be loaded from the environment or
# an uncommitted config file, and the exposed session should be invalidated.
# In the original the value was broken across two physical lines inside one
# quoted string, which is a syntax error; it is rebuilt here from adjacent
# literals and its content is unchanged.
headers = {
    "Cookie": (
        "__mta=49033881.1678793876094.1678793940876.1678795200204.6; "
        "_lxsdk_cuid=1864fac1719c8-0ca920cec855c5-26021151-1fa400-1864fac171ac8; "
        "WEBDFPID=z92674uu33zy52y5ywvv2633z3zz7x6u8135uzx8z3497958x656x508-1991733924674-1676373924101QMUUGOCfd79fef3d01d5e9aadc18ccd4d0c95071107; "
        "iuuid=DDE8CB8A12280513CEA4330E821956CD14C585CC83A9326AA805BFE6CE2D5C7B; "
        "_lxsdk=DDE8CB8A12280513CEA4330E821956CD14C585CC83A9326AA805BFE6CE2D5C7B; "
        "rvct=99%2C1; "
        "_hc.v=fd01bec7-e788-6fb0-70a4-ac7482d4d9f2.1676375129; "
        "_ga=GA1.1.1420699912.1676457603; "
        "_ga_95GX0SH5GM=GS1.1.1676457603.1.0.1676457606.0.0.0; "
        "webp=1; "
        "token=AgHGH4hJ0g9AXYLsxUquPGZ7Le_Bd7sHRIdbQqQEX50ImSgAAPHWIRk4EmsbLat2ionOXS0ECvTYugAAAAC9FgAAkGOSSwlDxxujA0ZYMoisjPCARg_Ga3S4jnji0ylaxUENRq_lzyrIcXJdPHA267LE; "
        "mt_c_token=AgHGH4hJ0g9AXYLsxUquPGZ7Le_Bd7sHRIdbQqQEX50ImSgAAPHWIRk4EmsbLat2ionOXS0ECvTYugAAAAC9FgAAkGOSSwlDxxujA0ZYMoisjPCARg_Ga3S4jnji0ylaxUENRq_lzyrIcXJdPHA267LE; "
        "userId=3498043032; "
        "isid=AgHGH4hJ0g9AXYLsxUquPGZ7Le_Bd7sHRIdbQqQEX50ImSgAAPHWIRk4EmsbLat2ionOXS0ECvTYugAAAAC9FgAAkGOSSwlDxxujA0ZYMoisjPCARg_Ga3S4jnji0ylaxUENRq_lzyrIcXJdPHA267LE; "
        "__utmz=74597006.1676523827.4.4.utmcsr=meishi.meituan.com|utmccn=(referral)|utmcmd=referral|utmcct=/; "
        "wm_order_channel=mtib; "
        "utm_source=60030; "
        "_lx_utm=utm_source%3Dmeishi.meituan.com%26utm_medium%3Dreferral%26utm_content%3D%252F; "
        "ci3=93; "
        "JSESSIONID=node0ltlq1ahvgwmp1tw73hb8lwqxi32503055.node0; "
        "IJSESSIONID=node0ltlq1ahvgwmp1tw73hb8lwqxi32503055; "
        "oops=AgHGH4hJ0g9AXYLsxUquPGZ7Le_Bd7sHRIdbQqQEX50ImSgAAPHWIRk4EmsbLat2ionOXS0ECvTYugAAAAC9FgAAkGOSSwlDxxujA0ZYMoisjPCARg_Ga3S4jnji0ylaxUENRq_lzyrIcXJdPHA267LE; "
        "u=3498043032; "
        "idau=1; "
        "__utma=74597006.1841706476.1676374789.1676523827.1678793651.5; "
        "__utmc=74597006; "
        "latlng=25.274207,110.299111,1678793658318; "
        "ci=99; "
        "cityname=%E5%8D%97%E5%AE%81; "
        "uuid=45c324ef-9445-4a28-a44c-d751efc766f8; "
        "p_token=AgHGH4hJ0g9AXYLsxUquPGZ7Le_Bd7sHRIdbQqQEX50ImSgAAPHWIRk4EmsbLat2ionOXS0ECvTYugAAAAC9FgAAkGOSSwlDxxujA0ZYMoisjPCARg_Ga3S4jnji0ylaxUENRq_lzyrIcXJdPHA267LE; "
        "i_extend=C_b0E004459250661043473771723482620800189196_e8554952675407443022_v6542421858852281651_a%e6%b1%9f%e5%8d%97%e4%b8%87%e8%be%be+kfcGimthomepagesearchH__a100173__b5; "
        "_lxsdk_s=186dfe623ba-649-c2c-1e4%7C%7C31; "
        "__utmb=74597006.20.9.1678795213279"
    ),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    # "br" is intentionally not advertised: Requests can only decode Brotli
    # when an optional brotli package is installed, and otherwise ``.text``
    # would contain undecoded bytes.
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://i.meituan.com/poi/160270566/feedbacks",
}

# --- Patterns ---------------------------------------------------------------

# Shop name, address and opening hours on the shop detail page.
res = re.compile(
    r'.*?<div class="name" data-reactid="44"><p class="poi-brand" data-reactid="45">(?P<ShopName>.*?)</p>'
    r'.*?<div class="poi-address" data-reactid="69">(?P<Addres>.*?)</div>'
    r'.*?<span class="col-right" data-reactid="126">(?P<DateTime>.*?)</span>',
    re.S,
)
# One review card (the <dd> box holding a single user's feedback).
comment_compile = re.compile(
    r'.*?<dd class="dd-padding"><div class="feedbackCard">(?P<commentDate>.*?)</dd>',
    re.S,
)
# Reviewer name inside a review card.
comment_compile_username = re.compile(
    r'.*?<weak class="username">(?P<username>.*?)</weak>', re.S
)
# Review text inside a review card.
comment_compile_comment = re.compile(
    r'.*?<div class="comment">.*?<p>(?P<comment>.*?)</p>', re.S
)


# --- Parsing helpers --------------------------------------------------------

def feedback_page_url(page):
    """Return the URL of feedback page number *page* of the shop."""
    return "https://i.meituan.com/poi/160270566/feedbacks/page_" + str(page)


def extract_username(card_html):
    """Return the reviewer name found in *card_html*, or "" when there is none.

    The original indexed the first regex hit unconditionally and raised
    IndexError on a card without a username element.
    """
    match = comment_compile_username.search(card_html)
    return match.group("username") if match else ""


def extract_comment(card_html):
    """Return the stripped review text of *card_html*.

    When the card has no comment paragraph, NO_COMMENT_TEXT is returned. The
    original tested ``len(matches) == []``, which is never true, so the
    placeholder was never applied and indexing the empty result raised
    IndexError.
    """
    match = comment_compile_comment.search(card_html)
    if match is None:
        return NO_COMMENT_TEXT
    return match.group("comment").strip()


def third_star_is_full(card_html):
    """Return True when the card's third rating image is the ``npg`` image.

    The script uses this as its "full marks" test: matching cards are skipped
    and the others are recorded. A card with no third rating image counts as
    not matching (the original raised IndexError in that case).
    """
    card = etree.HTML(card_html)
    third_star = card.xpath(
        '//div[@class="feedbackCard"]/div[@class="user-wrapper"]'
        '/div[@class="user-info-text"]/div[@class="score"]/span/img[3]/@src'
    )
    return bool(third_star) and third_star[0] == npg


def pager_has_disabled_button(page_html):
    """Return True when the page contains a disabled pager button."""
    tree = etree.HTML(page_html)
    return bool(tree.xpath('//a[@class="btn btn-weak btn-disabled"]/text()'))


# --- Network helpers --------------------------------------------------------

def fetch_shop_details():
    """Download the shop detail page and return the ``res`` matches.

    Each match is a (ShopName, Addres, DateTime) tuple.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    return res.findall(response.text)


def fetch_feedback_page(page):
    """Download feedback page *page* and return its HTML text."""
    response = requests.get(
        feedback_page_url(page), headers=headers, timeout=REQUEST_TIMEOUT
    )
    return response.text


# --- Entry point ------------------------------------------------------------

def main():
    """Crawl the configured feedback pages and write the recorded reviews.

    Every review card whose third rating image is not ``npg`` is written to
    OUTPUT_PATH as a ``username, comment`` CSV row and echoed to stdout.
    """
    # NOTE(review): the detail-page matches are fetched but not written
    # anywhere; the code that consumed them was already disabled in the
    # original script. The request itself is kept.
    shop_details = fetch_shop_details()  # noqa: F841

    # newline="" lets the csv module control line endings; csv.writer quotes
    # fields, so commas or line breaks inside a review cannot corrupt a row.
    with open(OUTPUT_PATH, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        page = START_PAGE
        pages_fetched = 0
        while MAX_PAGES is None or pages_fetched < MAX_PAGES:
            page_html = fetch_feedback_page(page)
            pages_fetched += 1

            cards = [match.group() for match in comment_compile.finditer(page_html)]
            for card_html in cards:
                if third_star_is_full(card_html):
                    print("ok")
                    continue
                username = extract_username(card_html)
                comment_text = extract_comment(card_html)
                writer.writerow([username, comment_text])
                print(username, comment_text)

            # Raw page dump kept from the original script (debug output).
            print(page_html)

            # A page without any review card means there is nothing left to
            # read (or the request was rejected); stop instead of looping.
            if not cards:
                break
            # A disabled pager button on any page after the first marks the
            # last page (same stop condition as the original).
            if page > 1 and pager_has_disabled_button(page_html):
                break

            page += 1
            time.sleep(PAGE_DELAY)

    print("完成")


if __name__ == "__main__":
    main()
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。