1 Star 0 Fork 1

水月萧/third

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
jd.py 5.51 KB
一键复制 编辑 原始数据 按行查看 历史
水月萧 提交于 2018-01-05 18:03 . 更新 jd.py
# -*- coding: utf-8 -*-
'''
爬取京东某一频道的商品的商品名、商品价格、商品出售方、商品评论数等信息
存储到一个文件中
附加要求:
把对应商品的评论情况爬下来,要求爬2页以上评论(如果足2页)
Created on 2018-01-05
@author: Zuolong
'''
import io
import os
import random
import re
import sys
import time
import urllib.parse
import urllib.request

from lxml import etree
# Make sure the folder that receives the per-product comment files exists
# before any scraping starts; announce it only when it was just created.
if not os.path.exists('./comment/'):
    os.mkdir('comment')
    print('创建文件夹comment成功')
# Search keyword, percent-encoded as UTF-8 so it can be embedded in the
# search URL's query string. urllib.parse.quote is the documented home of
# this function (urllib.request only re-exports it as an implementation detail).
key = urllib.parse.quote('零食')
# Value sent as the `enc` query parameter of every search request.
enc = 'utf-8'
def write_error_log(error_str):
    """Append a timestamped entry to ./error_log.txt.

    Each entry is two lines: a header made of 30 dashes, the local time as
    ``YYYY-MM-DD HH:MM:SS`` and 30 more dashes, followed by ``str(error_str)``.

    error_str: the exception (or any object) to record.

    Logging is best-effort: if the entry cannot be written, the failure is
    printed instead of raised so that the caller's error handling keeps going.
    """
    try:
        header = "-" * 30 + time.strftime("%Y-%m-%d %H:%M:%S") + "-" * 30
        # The context manager closes the file even when a write fails; UTF-8
        # matches the other files this script writes.
        with open("./error_log.txt", "a", encoding="utf-8") as fh:
            fh.write(header + "\n")
            fh.write(str(error_str) + "\n")
    except Exception as e:
        print(e)
# Pool of browser User-Agent strings; ua() picks one at random per request.
uapools = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
]
def ua(uapools):
    """Install a global urllib opener that sends a randomly chosen User-Agent.

    uapools: sequence of User-Agent strings to choose from.

    The chosen string is printed. Nothing is returned; the effect is that
    later urllib.request.urlopen() calls go through the installed opener.
    """
    chosen_agent = random.choice(uapools)
    print(chosen_agent)
    agent_opener = urllib.request.build_opener()
    agent_opener.addheaders = [('User-Agent', chosen_agent)]
    # Register the opener as the process-wide default.
    urllib.request.install_opener(agent_opener)
# Patterns applied to the serialized HTML of a single search-result <li>.
PAT_PID = re.compile('<li class="gl-item" data-sku=".*?" data-spu=".*?" data-pid="(.*?)">', re.S)
PAT_NAME = re.compile('<div class="p-name p-name-type-2">.*?<em>(.*?)</em>.*?</div>', re.S)
PAT_PRICE = re.compile('<div class="p-price">.*?<i>(.*?)</i>.*?</div>', re.S)
# Seller patterns, tried in order until one of them matches.
PAT_SHOPS = (
    re.compile('<div class="p-shop".*?>.*?<span class="J_im_icon"><a .*?>(.*?)</a>', re.S),
    re.compile('<div class="p-icons" .*?>.*?<i .*?>(.*?)</i>.*?</div>', re.S),
    re.compile('<span class="p-promo-flag">(.*?)</span>', re.S),
)
PAT_COMMIT = re.compile('<div class="p-commit">.*?<strong><a .*?>(.*?)</strong>', re.S)
# Pattern applied to the raw text of one comment-page response.
PAT_COMMENT = re.compile('"content":"(.*?)"', re.S)

# Markup fragments stripped from the captured product name, in this order.
NAME_NOISE = (
    '<span class="p-tag" style="background-color:#c81623">京东超市</span>',
    '<font class="skcolor_ljg">',
    '</font>',
    '<img class="p-tag3" src="//img14.360buyimg.com/uba/jfs/t6919/268/501386350/1257/92e5fb39/5976fcf9Nd915775f.png"/>',
)

# Line written after the comments of each fetched comment page.
COMMENT_SEPARATOR = '------------------------------------\n'


def first_match(pattern, item, field):
    """Return the first capture of `pattern` in `item`.

    Raises ValueError naming `field` when the pattern does not match, so the
    error log says which piece of the listing was missing.
    """
    found = pattern.findall(item)
    if not found:
        raise ValueError('search result item has no ' + field)
    return found[0]


def parse_item(item):
    """Build the result record for one search-result item.

    item: HTML text of one ``<li class="gl-item">`` element.

    Returns ``(pid, record)`` where ``record`` is the complete line destined
    for result.txt, or ``None`` when the item carries no product id (such
    items are skipped). Raises ValueError when the id is present but the
    name, price, seller or comment count cannot be found; nothing partial is
    ever returned.
    """
    pids = PAT_PID.findall(item)
    if not pids:
        return None
    pid = pids[0]

    p_name = first_match(PAT_NAME, item, 'name (pid ' + pid + ')')
    for fragment in NAME_NOISE:
        p_name = p_name.replace(fragment, '')

    p_price = first_match(PAT_PRICE, item, 'price (pid ' + pid + ')')

    p_shop = None
    for pattern in PAT_SHOPS:
        shops = pattern.findall(item)
        if shops:
            p_shop = shops[0]
            break
    if p_shop is None:
        raise ValueError('search result item has no seller (pid ' + pid + ')')

    p_commit = first_match(PAT_COMMIT, item, 'comment count (pid ' + pid + ')')
    p_commit = p_commit.replace('</a>', '')

    record = ('{"商品ID":' + pid + ', '
              + '"商品名":' + p_name + ', '
              + '"商品价格":' + p_price + ', '
              + '"商品出售方":' + p_shop + ', '
              + '"商品评论数":' + p_commit + '}\n')
    return pid, record


def extract_comments(comment_data):
    """Return the comment texts found in one comment-page response.

    Every ``"content":"..."`` value becomes one line; the block ends with
    COMMENT_SEPARATOR, also when the page holds no comments.
    """
    lines = [text + '\n' for text in PAT_COMMENT.findall(comment_data)]
    return ''.join(lines) + COMMENT_SEPARATOR


def fetch_comments(pid, pages=5):
    """Download `pages` comment pages for product `pid` and return their text."""
    chunks = []
    for page in range(pages):
        commit_link = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv0&productId='+pid+'&score=0&sortType=6&page='+str(page)+'&pageSize=10'
        # Undecodable bytes are dropped, as for the search pages, so a single
        # bad byte does not cost the product all of its comments.
        commit_data = urllib.request.urlopen(commit_link).read().decode('gbk', 'ignore')
        chunks.append(extract_comments(commit_data))
    return ''.join(chunks)


def main():
    """Crawl 30 search pages, saving product records and per-product comments.

    Records go to ./result.txt, comments to ./comment/<page>-<index>-<pid>.txt.
    Failures are written to the error log. A failing search page is skipped;
    a failing item no longer discards the remaining items of its page.
    """
    records = []
    try:
        for y in range(1, 31):
            try:
                url = 'https://search.jd.com/Search?keyword='+key+'&enc='+enc+'&&page='+str(y*2-1)
                ua(uapools)
                data = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
                print('正在爬取第'+str(y)+'页的数据。。。')
                shop_list = etree.HTML(data).xpath('//li[@class="gl-item"]')
            except Exception as err:
                write_error_log(err)
                continue

            for i, node in enumerate(shop_list):
                try:
                    item = etree.tostring(node, encoding="utf-8").decode('utf-8', 'ignore')
                    parsed = parse_item(item)
                    if parsed is None:
                        continue
                    pid, record = parsed
                    # The record is complete at this point, so it is kept even
                    # if fetching the comments fails below.
                    records.append(record)
                    comments = fetch_comments(pid)
                    with open('./comment/'+str(y)+'-'+str(i)+'-'+pid+'.txt', 'w', encoding="utf-8") as fm:
                        fm.write(comments)
                except Exception as err:
                    write_error_log(err)
    finally:
        # Save whatever was collected, even if the crawl is interrupted.
        with open('./result.txt', 'w', encoding='utf-8') as fh:
            fh.write(''.join(records))


if __name__ == '__main__':
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/ShuiYueXiao/third.git
git@gitee.com:ShuiYueXiao/third.git
ShuiYueXiao
third
third
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385