CuteSpider
/
config.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-


# Module-level list of URLs; empty by default and not referenced by the
# functions in this file.
# NOTE(review): presumably the crawler's seed ("start") URLs -- confirm
# against the code that reads this setting.
STAR_URL = []

def urlFilter(url):
    """Tell whether *url* is a Zhihu question page rather than an answer page.

    A URL is accepted when it contains the question prefix
    'http://www.zhihu.com/question/' and does not contain the substring
    'answer' anywhere. Returns True for accepted URLs, False otherwise.
    """
    is_question = 'http://www.zhihu.com/question/' in url
    mentions_answer = 'answer' in url
    return is_question and not mentions_answer

def pickQuestion(html_tree, extra=None):
    """Extract the question record from a parsed question page.

    Args:
        html_tree: object exposing ``xpath(query)``; presumably an lxml
            element tree of a Zhihu question page (confirm with the caller).
        extra: unused here; accepted so the signature matches the other
            pick* callbacks.

    Yields:
        A single dict with the keys ``qTitle`` (stripped title text),
        ``qID`` (the page's ``data-urltoken`` attribute), ``qContent``
        (text of the detail block, ``''`` when it has none) and
        ``qCommentNum`` (comment count as a string).

    Raises:
        IndexError: if the title, page wrapper, detail block or the second
            text node of the comment link is missing from the page.
    """
    qitem = {}

    qtitle = html_tree.xpath("//div[@id='zh-question-title']/h2/text()")
    qitem["qTitle"] = qtitle[0].strip()

    page = html_tree.xpath('//div[@id="zh-single-question-page"]')[0]
    qitem['qID'] = page.get('data-urltoken')

    qcontent = html_tree.xpath('//div[@id="zh-question-detail"]/div')[0].text
    # An element without leading text reports None; normalise to ''.
    qitem['qContent'] = '' if qcontent is None else qcontent

    qcommenttree = html_tree.xpath(
        "//div[@id='zh-question-meta-wrap']/div[1]/a[1]/text()")
    # The comment label is the SECOND text node of the link. A label with
    # no space (a single token) carries no leading number and counts as
    # zero comments; otherwise the first token is the count.
    parts = qcommenttree[1].split(' ')
    qitem['qCommentNum'] = '0' if len(parts) == 1 else parts[0]

    yield qitem

def pickAnswer(html_tree, extra=None):
    """Yield one record per answer block found on a parsed question page.

    Args:
        html_tree: object exposing ``xpath(query)``; presumably an lxml
            element tree of a Zhihu question page (confirm with the caller).
        extra: mapping that must provide ``'qID'``; its value is copied
            into every answer record. Despite the ``None`` default, a
            subscriptable value is required as soon as one answer exists.

    Yields:
        dict with the keys ``aWriter`` (author name, or the placeholder
        below when no name text is found), ``qID``, ``aID`` (the block's
        ``data-aid`` attribute), ``aContent`` (stripped string value of
        the block's third div) and ``aVoteNum`` (first vote-count text
        node).

    Raises:
        TypeError: if an answer exists and *extra* is left as None.
        KeyError: if an answer block has no ``data-aid`` attribute or
            *extra* has no ``'qID'`` entry.
        IndexError: if an answer block has no vote-count text node.
    """
    for answer in html_tree.xpath('//div[@id="zh-question-answer-wrap"]/div'):
        item = {}
        writer = ''.join(answer.xpath('./div[2]/div[1]/h3/a[2]/text()'))
        # NOTE: the placeholder's spelling is kept exactly as it was;
        # previously stored records may rely on this literal value.
        item['aWriter'] = writer if writer else 'anoymous'
        item['qID'] = extra['qID']
        item['aID'] = answer.attrib['data-aid']
        # XPath string() already returns a single string, so no join needed.
        item['aContent'] = answer.xpath('string(./div[3])').strip()
        item['aVoteNum'] = answer.xpath('./div[1]/button[1]/span[2]/text()')[0]
        yield item