wxyz/groupSpider

groupSpider.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import os
import time
import random
import urllib
import urllib2
import sqlite3
import traceback

# Constants
IMG_URL_REG = re.compile(r'src="(http://img[35]\.douban\.com/view/group_topic/large/public/p[\d]{7}\.jpg)"')
IMG_NAME_REG = re.compile(r'p[\d]{7}\.jpg')
TOPIC_URL_REG = re.compile(r'href="(http://www\.douban\.com/group/topic/[\d]{8}/)"')
I_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari/537.36'}

# Global variables
cx = None

# Initialize global state
def init():
    # Open the local SQLite database
    global cx
    cx = sqlite3.connect('imgDb.db')
    # Route HTTP requests through a local proxy (assumes one is listening on 127.0.0.1:8087)
    proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)

# Read the configuration file (simple key=value lines)
def readConfig(path='spider.properties'):
    try:
        propsFile = open(path, 'r')
        props = {}
        for line in propsFile:
            if line.find('=') > 0:
                strs = line.replace('\n', '').split('=')
                if len(strs) == 2:
                    props[strs[0]] = strs[1]
                else:
                    props[strs[0]] = None
    except Exception, e:
        raise e
    else:
        propsFile.close()
        return props
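
# Example of the key=value format readConfig() expects in spider.properties.
# The keys below are hypothetical: no sample config ships with this file and
# main() never actually calls readConfig().
#   rootDir=./imgs
#   proxy=127.0.0.1:8087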

# Fetch a URL and return the raw HTML
def getHtmlContent(url):
    req = urllib2.Request(url, headers=I_HEADERS)
    page = urllib2.urlopen(req)
    html = page.read()
    return html

# Decide whether an image still needs downloading: not on disk and not recorded in the database
def isDownloadImg(cu, imgPath, imgName):
    selectSql = "select name from imgs where name='%s'" % (imgName)
    cu.execute(selectSql)
    notExistImg = len(cu.fetchall()) == 0
    return not os.path.exists(imgPath) and notExistImg

# Download a file to the given path
def downloadFile(url, filePath):
    f = open(filePath, 'wb')
    f.write(getHtmlContent(url))
    f.close()
# test
# downloadFile('http://img3.douban.com/view/group_topic/large/public/p9565882.jpg', 'p9565882.jpg')

# Save image metadata to the local database
def saveImgInfo(cu, imgName, imgPath, imgUrl):
    insertSql = "insert into imgs values('%s', '%s', '%s')" % (imgName, imgPath, imgUrl)
    cu.execute(insertSql)
    cx.commit()

# Extract and download every new image on a topic page,
# e.g. http://img3.douban.com/view/group_topic/large/public/p9527810.jpg
def getGroupImgs(html, rootDir):
    imgs = re.findall(IMG_URL_REG, html)
    if not os.path.exists(rootDir):
        os.makedirs(rootDir)
    cu = cx.cursor()
    for imgUrl in imgs:
        imgName = re.findall(IMG_NAME_REG, imgUrl)[0]
        imgPath = rootDir + imgName
        if isDownloadImg(cu, imgPath, imgName):
            print imgUrl + ' => ' + imgPath
            try:
                downloadFile(imgUrl, imgPath)
                # Record the download in the local database
                saveImgInfo(cu, imgName, imgPath, imgUrl)
            except urllib2.HTTPError, data:
                # Back off for a random 10-50 seconds on HTTP errors
                inc = random.randint(1, 5) * 10
                print 'HTTPError : ', data, 'pause ', inc, 's ...'
                time.sleep(inc)
    cu.close()

# Collect topic links from a group's discussion page,
# e.g. http://www.douban.com/group/topic/49620798/
def getGroupTopics(url):
    topics = re.findall(TOPIC_URL_REG, getHtmlContent(url))
    return topics

def getGroupUrl(groupId, pageNum):
    return 'http://www.douban.com/group/' + groupId + '/discussion?start=' + str(pageNum)

# Crawl one discussion page of a group and download images from each topic
def download(groupId, pageNum, rootDir):
    print 'groupId=' + groupId + ', pageNum=' + str(pageNum)
    topics = getGroupTopics(getGroupUrl(groupId, pageNum))
    # Process topics in reverse order
    topics.reverse()
    for topic in topics:
        try:
            getGroupImgs(getHtmlContent(topic), rootDir + '/' + groupId + '/')
        except urllib2.HTTPError, data:
            inc = random.randint(1, 5) * 10
            print 'HTTPError : ', data, 'pause ', inc, 's ...'
            time.sleep(inc)

def main():
    init()
    inc = 10 * random.randint(1, 10)
    rootDir = '.'
    try:
        while True:
            # Poll the first discussion page of each group, then pause 10-100 seconds
            download('haixiuzu', 0, rootDir)
            download('kaopulove', 0, rootDir)
            download('407518', 0, rootDir)
            download('Xsz', 0, rootDir)
            download('441239', 0, rootDir)
            download('475406', 0, rootDir)
            print 'pause ' + str(inc) + 's'
            time.sleep(inc)
    except:
        # On any error (including Ctrl-C) dump the traceback and close the database
        traceback.print_exc()
        print 'close db'
        cx.close()

main()
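
Note that the script assumes imgDb.db already contains an imgs table; nothing in the file creates it, so the first "select name from imgs" fails on a fresh checkout. The one-off setup sketch below shows one way to prepare the database; the column names and order (name, path, url) are an assumption inferred from the select in isDownloadImg() and the insert in saveImgInfo(), not something the repository documents.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# One-off setup sketch for groupSpider.py (assumed schema, see note above).
import sqlite3

cx = sqlite3.connect('imgDb.db')
cx.execute("create table if not exists imgs (name text primary key, path text, url text)")
cx.commit()
cx.close()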