wxyz/groupSpider

groupSpider.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import os
import time
import random
import urllib
import urllib2
import sqlite3
import traceback

# Constants
IMG_URL_REG = re.compile(r'src="(http://img[35]\.douban\.com/view/group_topic/large/public/p[\d]{7}\.jpg)"')
IMG_NAME_REG = re.compile(r'p[\d]{7}\.jpg')
TOPIC_URL_REG = re.compile(r'href="(http://www\.douban\.com/group/topic/[\d]{8}/)"')
I_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36'}

# Global variables
cx = None

# Initialize global state
def init():
    # Open the local SQLite database
    global cx
    cx = sqlite3.connect('imgDb.db')
    # Route HTTP requests through a local proxy (assumes one is listening on 127.0.0.1:8087)
    proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)

# Read the configuration file (simple key=value lines)
def readConfig(path='spider.properties'):
    try:
        propsFile = open(path, 'r')
        props = {}
        for line in propsFile:
            if line.find('=') > 0:
                strs = line.replace('\n', '').split('=')
                if len(strs) == 2:
                    props[strs[0]] = strs[1]
                else:
                    props[strs[0]] = None
    except Exception, e:
        raise e
    else:
        propsFile.close()
        return props
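
# Example of the key=value format readConfig() expects in spider.properties.
# The keys below are hypothetical: no sample config ships with this file and
# main() never actually calls readConfig().
#   rootDir=./imgs
#   proxy=127.0.0.1:8087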

# Fetch a URL and return the raw HTML
def getHtmlContent(url):
    req = urllib2.Request(url, headers=I_HEADERS)
    page = urllib2.urlopen(req)
    html = page.read()
    return html

# Decide whether an image still needs downloading: not on disk and not recorded in the database
def isDownloadImg(cu, imgPath, imgName):
    selectSql = "select name from imgs where name='%s'" % (imgName)
    cu.execute(selectSql)
    notExistImg = len(cu.fetchall()) == 0
    return not os.path.exists(imgPath) and notExistImg

# Download a file to the given path
def downloadFile(url, filePath):
    f = open(filePath, 'wb')
    f.write(getHtmlContent(url))
    f.close()
# test
# downloadFile('http://img3.douban.com/view/group_topic/large/public/p9565882.jpg', 'p9565882.jpg')

# Save image metadata to the local database
def saveImgInfo(cu, imgName, imgPath, imgUrl):
    insertSql = "insert into imgs values('%s', '%s', '%s')" % (imgName, imgPath, imgUrl)
    cu.execute(insertSql)
    cx.commit()

# Extract and download every new image on a topic page,
# e.g. http://img3.douban.com/view/group_topic/large/public/p9527810.jpg
def getGroupImgs(html, rootDir):
    imgs = re.findall(IMG_URL_REG, html)
    if not os.path.exists(rootDir):
        os.makedirs(rootDir)
    cu = cx.cursor()
    for imgUrl in imgs:
        imgName = re.findall(IMG_NAME_REG, imgUrl)[0]
        imgPath = rootDir + imgName
        if isDownloadImg(cu, imgPath, imgName):
            print imgUrl + ' => ' + imgPath
            try:
                downloadFile(imgUrl, imgPath)
                # Record the download in the local database
                saveImgInfo(cu, imgName, imgPath, imgUrl)
            except urllib2.HTTPError, data:
                # Back off for a random 10-50 seconds on HTTP errors
                inc = random.randint(1, 5) * 10
                print 'HTTPError : ', data, 'pause ', inc, 's ...'
                time.sleep(inc)
    cu.close()

# Collect topic links from a group's discussion page,
# e.g. http://www.douban.com/group/topic/49620798/
def getGroupTopics(url):
    topics = re.findall(TOPIC_URL_REG, getHtmlContent(url))
    return topics

def getGroupUrl(groupId, pageNum):
    return 'http://www.douban.com/group/' + groupId + '/discussion?start=' + str(pageNum)

# Crawl one discussion page of a group and download images from each topic
def download(groupId, pageNum, rootDir):
    print 'groupId=' + groupId + ', pageNum=' + str(pageNum)
    topics = getGroupTopics(getGroupUrl(groupId, pageNum))
    # Process topics in reverse order
    topics.reverse()
    for topic in topics:
        try:
            getGroupImgs(getHtmlContent(topic), rootDir + '/' + groupId + '/')
        except urllib2.HTTPError, data:
            inc = random.randint(1, 5) * 10
            print 'HTTPError : ', data, 'pause ', inc, 's ...'
            time.sleep(inc)

def main():
    init()
    inc = 10 * random.randint(1, 10)
    rootDir = '.'
    try:
        while True:
            # Poll the first discussion page of each group, then pause 10-100 seconds
            download('haixiuzu', 0, rootDir)
            download('kaopulove', 0, rootDir)
            download('407518', 0, rootDir)
            download('Xsz', 0, rootDir)
            download('441239', 0, rootDir)
            download('475406', 0, rootDir)
            print 'pause ' + str(inc) + 's'
            time.sleep(inc)
    except:
        # On any error (including Ctrl-C) dump the traceback and close the database
        traceback.print_exc()
        print 'close db'
        cx.close()

main()
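
Note that the script assumes imgDb.db already contains an imgs table; nothing in the file creates it, so the first "select name from imgs" fails on a fresh checkout. The one-off setup sketch below shows one way to prepare the database; the column names and order (name, path, url) are an assumption inferred from the select in isDownloadImg() and the insert in saveImgInfo(), not something the repository documents.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# One-off setup sketch for groupSpider.py (assumed schema, see note above).
import sqlite3

cx = sqlite3.connect('imgDb.db')
cx.execute("create table if not exists imgs (name text primary key, path text, url text)")
cx.commit()
cx.close()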