# practical_python_tools/spider_jingdong.py

"""
    Scrape book search results from JD.com (京东).
"""

import requests
from lxml import html

def spider(SN, book_list=None, timeout=10):
    """Search JD.com for ``SN`` and collect the listings that are found.

    Every listing parsed from the returned search page is printed to
    stdout and appended to ``book_list`` as a dict with the keys
    ``title``, ``price``, ``link`` and ``store``.

    Args:
        SN: Value sent as the ``keyword`` query parameter (the script's
            entry point passes an ISBN).
        book_list: Optional list to append the results to. A fresh list
            is created on every call when it is omitted, so results are
            never shared between calls.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list the results were appended to.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A literal ``[]`` default would be created once and reused by every
    # call, silently accumulating results across invocations.
    if book_list is None:
        book_list = []

    URL = 'https://search.jd.com/Search'

    # Fetch the search result page; the timeout keeps a stalled
    # connection from blocking the caller forever.
    resp = requests.get(URL, params={'keyword': SN}, timeout=timeout)
    print(resp.encoding)
    resp.encoding = 'utf-8'  # force UTF-8 before decoding resp.text

    # Parse the page and locate the list of product entries.
    selector = html.fromstring(resp.text)
    ul_list = selector.xpath('//div[@id="J_goodsList"]/ul/li')
    print(len(ul_list))

    for li in ul_list:
        title = li.xpath('div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@title')  # book title
        link = li.xpath('div[@class="gl-i-wrap"]/div[@class="p-img"]/a/@href')  # book link
        price = li.xpath('div[@class="gl-i-wrap"]/div[@class="p-price"]/strong/i/text()')
        store = li.xpath('div[@class="gl-i-wrap"]/div[@class="p-shopnum"]/a/text()')

        # Entries lacking a title, price or link cannot be reported;
        # skip them instead of failing with IndexError on ``[0]``.
        if not (title and price and link):
            continue

        print(title[0])
        print('京东网售价：¥', price[0])
        print(link[0])
        print(store[0] if store else '未知商家')
        print('------------------')

        book_list.append({
            'title': title[0],
            'price': price[0],
            'link': link[0],
            # Kept as the raw xpath result (a possibly empty list) so the
            # record shape stays the same for existing consumers.
            'store': store,
        })

    return book_list


if __name__ == '__main__':
    # Script entry point: run one search using a fixed sample ISBN.
    spider(SN='9787115428028')