1 Star 0 Fork 0

李志强/CuteSpider

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
test.py 3.37 KB
一键复制 编辑 原始数据 按行查看 历史
李志强 提交于 2015-07-09 11:04 . 'cute'
# -*- coding:utf-8 -*-
import unittest
import crawler
from contrib.spider_html import *
from contrib.spider_http import *
from contrib.dosql import *
class MyHtmlTool(HtmlParserTool):
    """``HtmlParserTool`` subclass that adds URL canonicalisation.

    The only live customisation is :meth:`cusFixURL`. An older item
    extraction routine is kept below as a disabled string literal.
    """

    def __init__(self, data):
        """Forward *data* unchanged to ``HtmlParserTool.__init__``."""
        HtmlParserTool.__init__(self, data)

    # Disabled legacy implementation (Python 2 syntax), kept for reference
    # only. It is a bare string expression and is never executed.
    '''
    def extractItems(self, pipeline,extra = None):
        try:
            qitem = dict()
            qtitle = self.html_tree.xpath("//div[@id='zh-question-title']/h2/text()")
            qitem["qTitle"] = qtitle[0].strip()
            qid = self.html_tree.xpath('//div[@id="zh-single-question-page"]')[0].get('data-urltoken')
            qitem['qID'] = qid
            qcontent = self.html_tree.xpath('//div[@id="zh-question-detail"]/div')
            qitem['qContent'] = qcontent[0].text
            if qitem['qContent'] == None:
                qitem['qContent'] = ''
            qcommenttree = self.html_tree.xpath("//div[@id='zh-question-meta-wrap']/div[1]/a[1]/text()")
            text = qcommenttree[1]
            if len(text.split(' ')) == 1:
                qcommentnum = '0'
            else:
                qcommentnum = text.split(' ')[0]
            qitem['qCommentNum'] = qcommentnum
            qanswertree = self.html_tree.xpath('//h3[@id="zh-question-answer-num"]')
            if len(qanswertree) == 0:
                qitem['qAnswerNum'] = '0'
            else:
                qitem['qAnswerNum'] = qanswertree[0].get('data-num')
            answertrees = self.html_tree.xpath('//div[@id="zh-question-answer-wrap"]/div')
            answerItems = []
            for it in answertrees:
                item = dict()
                text = ''.join(it.xpath('./div[2]/div[1]/h3/a[2]/text()'))
                if len(text) == 0:
                    text = 'anoymous'
                item['aWriter'] = text
                item['qID'] = qid
                item['aID'] = it.attrib['data-aid']
                item['aContent'] = ''.join(it.xpath('string(./div[3])').strip())
                item['aVoteNum'] = it.xpath('./div[1]/button[1]/span[2]/text()')[0]
                answerItems.append(item)
            if insert('question', qitem):
                for item in answerItems:
                    insert('answer', item)
        except Exception, e:
            print 'extractItems', e
    '''

    def cusFixURL(self, url, prefix):
        """Fix *url* and strip its params, query string and fragment.

        The URL is first passed through ``self.fixURL(url, prefix)``. That
        method is not defined in this class; presumably it is inherited and
        resolves *url* against *prefix* -- TODO confirm in the base class.
        The result is then rebuilt as ``scheme://host[:port]path``.

        Args:
            url: The raw URL to clean up.
            prefix: Forwarded untouched to ``fixURL``.

        Returns:
            The rebuilt URL. If the fixed URL has no scheme or no host
            (e.g. ``mailto:`` / ``javascript:`` links or a still-relative
            path), the ``fixURL`` result is returned unchanged instead of
            failing on a ``None`` host.
        """
        # The module was renamed in Python 3; support both interpreters.
        try:
            from urlparse import urlparse
        except ImportError:
            from urllib.parse import urlparse

        url = self.fixURL(url, prefix)
        parts = urlparse(url)

        host = parts.hostname
        if not parts.scheme or host is None:
            # Nothing to rebuild from; concatenating None would raise.
            return url

        # ``hostname`` omits the port, so re-attach an explicit one to keep
        # the rebuilt URL pointing at the same server.
        try:
            port = parts.port
        except ValueError:
            # Malformed port in the input; fall back to the bare host.
            port = None
        if port is not None:
            host = '%s:%d' % (host, port)

        return parts.scheme + '://' + host + parts.path
#rint len(answertree)
class Test(unittest.TestCase):
    """Test case holder for the spider tooling; currently a placeholder."""

    def testName(self):
        """Placeholder test that performs no checks and always passes."""
        # Disabled manual smoke check (Python 2 syntax), kept for reference.
        # It appears to download a question page and run the extractor on it.
        #
        #     url1 = 'http://www.zhihu.com/question/20124337'
        #     url2 = "http://www.zhihu.com/question/20120168"
        #     urp = "http://www.zhihu.com/?next=%2Fquestion%2F20014415"
        #     url3 = "http://www.zhihu.com"
        #     import urlparse
        #     urlpar = urlparse.urlparse("http://www.zhihu.com/question/28523831?sort=created")
        #     print urlpar
        #     url = urlpar.scheme + '://'+urlpar.hostname + urlpar.path
        #     download = Downloader()
        #     data = download.downLoadFile(url2)
        #     htmltool = MyHtmlTool(data)
        #     print urp.find('http://www.zhihu.com/question/') == -1 or url.find('answer') != -1
        #     #if htmltool.urlFilter(url1):
        #     htmltool.extractItems()
        pass
if __name__ == "__main__":
    # Run every test in this module when executed as a script.
    # To run a single test instead, uncomment the next line:
    # import sys; sys.argv = ['', 'Test.testName']
    unittest.main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/gikieng/CuteSpider.git
git@gitee.com:gikieng/CuteSpider.git
gikieng
CuteSpider
CuteSpider
master

搜索帮助