# practical_python_tools/spider_dangdang.py

"""
    爬取当当网的内容
"""

import requests
from lxml import html

def spider(SN, book_list=None, timeout=10):
    """Search Dangdang for ``SN`` and collect the books in the result page.

    Every usable result is printed to stdout and appended to ``book_list``
    as a dict with the keys ``title``, ``price`` (currency sign removed),
    ``link`` and ``store``.  ``store`` is kept as the raw list returned by
    the XPath query (possibly empty) so the stored shape is unchanged for
    existing consumers; an empty list is printed as '当当自营'.

    Args:
        SN: Search keyword, sent as the ``key`` query parameter.
        book_list: Optional list to append the results to.  A fresh list is
            created when omitted, so results no longer leak between calls
            through a shared mutable default.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up instead of blocking forever.

    Returns:
        The list the results were appended to.
    """
    if book_list is None:
        book_list = []

    # Let requests assemble the query string so SN is URL-encoded properly.
    response = requests.get(
        'http://search.dangdang.com/',
        params={'key': SN, 'act': 'input'},
        timeout=timeout,
    )
    # Parse the page into an element tree that supports XPath queries.
    selector = html.fromstring(response.text)
    # One <li> per search result.
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print(len(ul_list))

    for li in ul_list:
        title = li.xpath('a/@title')  # book title
        link = li.xpath('a/@href')  # link to the book page
        price = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')
        store = li.xpath('p[@class="search_shangjia"]/a/text()')

        # Skip entries that lack any of the mandatory fields rather than
        # aborting the whole run with an IndexError.
        if not (title and link and price):
            continue

        print(title[0])
        print(price[0].replace('¥', '当当网售价：¥'))
        print(link[0])
        print(store[0] if store else '当当自营')
        print('------------------')

        book_list.append({
            'title': title[0],
            'price': price[0].replace('¥', ''),
            'link': link[0],
            'store': store,
        })

    return book_list


if __name__ == '__main__':
    # Demo run when executed as a script: search for one sample book number.
    SN = '9787115428028'
    spider(SN=SN)