1 Star 0 Fork 0

codelong/wxspider

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
updatemp.py 11.03 KB
一键复制 编辑 原始数据 按行查看 历史
codelong 提交于 2017-08-29 09:59 . 修改 upload
# -*- coding: utf-8 -*-
#查找公众号最新文章
# 导入包
from wechatsogou import upload
from wechatsogou.tools import *
from wechatsogou import *
from PIL import Image
import datetime
import time
import logging
import logging.config
import random
import requests
from bs4 import BeautifulSoup
import re
import os
import shortuuid
# --- Logging -----------------------------------------------------------
# Logger configuration is read from 'logging.conf' in the working directory.
logging.config.fileConfig('logging.conf')
logger = logging.getLogger()

# --- Sogou/WeChat search client ----------------------------------------
# Default: no external cookie.
wechats = WechatSogouApi()
# To run with a logged-in Sogou WeChat session, supply an external cookie:
#   1. Log in to Sogou WeChat with Chrome (EditThisCookie extension installed).
#   2. Copy the current cookie with EditThisCookie.
#   3. Paste it into cookies.txt next to this script.
#   4. Replace the line above with:
#        wechats = WechatSogouApi(cookies_file={'file_name':'cookies.txt'})

# --- Database ------------------------------------------------------------
# Handle bound to the 'wechat_wechat' table.
# NOTE: this rebinds the imported name `mysql` to the instance.
mysql = mysql('wechat_wechat')

# Load every official account stored in the database, newest id first.
mysql.order_sql = " order by id desc"
mp_list = mysql.find(0)

# Running count of new messages seen during this run.
succ_count = 0

# Local midnight of the current day.
now_time = datetime.datetime.combine(
    datetime.datetime.today().date(),
    datetime.time.min,
)
# ---------------------------------------------------------------------------
# Main crawl loop.
#
# For every official account in `mp_list`:
#   1. resolve the account's article-list URL (cached in the row, or looked
#      up through the Sogou API),
#   2. fetch its recent messages and skip those whose qunfa_id is not newer
#      than the stored `last_qunfa_id`,
#   3. for each new type-'49' message, download the article, rebuild a
#      simplified HTML body (paragraph text, images, iframes) and insert it
#      into `wechat_topic`,
#   4. store the newest qunfa_id / time back on the account row.
#
# Ctrl-C stops the loop; any other exception propagates and ends the script.
# ---------------------------------------------------------------------------

# Timestamp layout used for every date column written to the database.
_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
# Markup emitted for each image / iframe found inside a paragraph.
_IMG_HTML = '<span><img style="width: %s" src="%s"></img></span>'
_IFRAME_HTML = '<span><iframe allowfullscreen="" src="%s" frameborder="0" height="auto" scrolling="no" width="auto"></iframe></span>'


def _format_time(timestamp=None):
    """Return `timestamp` (seconds since the epoch; None means now) as a
    local-time string in _TIME_FORMAT."""
    return time.strftime(_TIME_FORMAT, time.localtime(timestamp))


def _player_src(src):
    """Return `src` with 'preview' replaced by 'player' and any numeric
    height=/width= query value replaced by 'auto'."""
    src = src.replace('preview', 'player')
    src = re.sub(r'(height=\d+[\.\d+]*)', 'height=auto', src)
    return re.sub(r'(width=\d+[\.\d+]*)', 'width=auto', src)


for item in mp_list:
    try:
        # Pause 1-2 seconds between accounts.
        time.sleep(random.randrange(1, 3))

        last_qunfa_id = item['last_qunfa_id']
        cur_qunfa_id = last_qunfa_id

        # Use the article-list URL stored on the row when present,
        # otherwise ask the search API for it.
        if 'wz_url' in item:
            wz_url = item['wz_url']
        else:
            wechat_info = wechats.get_gzh_info(item['wechatid'])
            if 'url' not in wechat_info:
                continue
            wz_url = wechat_info['url']
        print(item['name'])
        # Nothing to fetch for an empty or missing URL.
        if not wz_url:
            continue

        # Fetch the recent messages of this account.
        wz_list = wechats.get_gzh_message(url=wz_url)
        if u'链接已过期' in wz_list:
            # The API reported an expired link: look the account up again,
            # retry with the fresh URL and persist the refreshed details.
            wechat_info = wechats.get_gzh_info(item['wechatid'])
            if 'url' not in wechat_info:
                continue
            print('guo qi sz chong xin huo qu success')
            wz_url = wechat_info['url']
            wz_list = wechats.get_gzh_message(url=wz_url)
            mysql.where_sql = "id=%s" % (item['id'])
            mysql.table('wechat_wechat').where({'id': item['id']}).save({
                'wz_url': wechat_info['url'],
                'avatar': wechat_info['img'],
                'qrcode': wechat_info['qrcode'],
            })

        qunfa_time = ''
        for wz_item in wz_list:
            temp_qunfa_id = int(wz_item['qunfa_id'])
            if last_qunfa_id >= temp_qunfa_id:
                # Already seen in an earlier run.
                print(u"没有更新文章")
                print(u"")
                continue
            if cur_qunfa_id < temp_qunfa_id:
                # Track the newest message id and its publish time.
                cur_qunfa_id = temp_qunfa_id
                qunfa_time = _format_time(wz_item['datetime'])
            succ_count += 1

            # Only type '49' (image-and-text messages) are stored.
            if wz_item['type'] != '49':
                continue
            print(succ_count)
            print(wz_item['content_url'])
            if not wz_item['content_url']:
                continue

            time.sleep(0.5)
            article = requests.get(wz_item['content_url'])
            page = BeautifulSoup(article.content)
            content_div = page.find('div', {'id': 'js_content'})
            # Re-parse only the article body so the selectors below are
            # scoped to it.
            bs = BeautifulSoup(str(content_div))
            vote = bs.find('span', {'class': 'vote_area'})
            if vote:
                vote.replace_with('')

            # Rebuild the body paragraph by paragraph: text first, then the
            # paragraph's images, then its iframes.
            parts = []
            for p in bs.select('p'):
                if p.get_text():
                    parts.append('<p>%s</p>' % (str(p.get_text()), ))

                for img in p.select('img'):
                    data_src = img.get('data-src')
                    if not data_src:
                        continue
                    try:
                        width = img.get('data-w')
                        if width:
                            uploaded = upload.uploadImage(data_src, width)
                            print(uploaded)
                            # NOTE(review): the value returned by
                            # uploadImage is only inspected for its length;
                            # the emitted src is always derived from
                            # data-src. Kept as in the original - confirm
                            # this is intended.
                            if len(uploaded) <= 1:
                                image_url = str(data_src).replace('=', '.')
                            else:
                                image_url = data_src
                        else:
                            # No declared width: emit the image with its
                            # original source instead of an undefined or
                            # stale URL.
                            width = 'auto'
                            image_url = data_src
                        parts.append(_IMG_HTML % (width, image_url,))
                    except Exception:
                        # An upload failure skips this image only; Ctrl-C
                        # is left to the outer handler.
                        print(u"上传图片出错出错,继续")
                        continue
                    time.sleep(0.1)

                for iframe in p.select('iframe'):
                    # An iframe carrying both attributes is emitted twice,
                    # once per attribute.
                    for attr in ('src', 'data-src'):
                        if iframe.get(attr):
                            parts.append(_IFRAME_HTML % (_player_src(iframe.get(attr)), ))

            article_html = unicode(''.join(parts))
            print(article_html)

            # Skip the article when the API cannot process it.
            article_info = wechats.deal_article(url=wz_item['content_url'])
            if not article_info:
                continue

            # Bodies shorter than 20 characters are treated as failed
            # extractions and not stored.
            if len(str(article_html)) < 20:
                print(u"插入数据库出错")
                continue

            now_str = _format_time()
            uniqueid = shortuuid.ShortUUID().random(length=32) + str(time.time()).replace('.', '')
            mysql.table('wechat_topic').add({
                'uniqueid': uniqueid,
                'title': wz_item['title'],
                'content': '<html><body>' + str(article_html) + '</body></html>',
                'avatar': wz_item['cover'],
                'abstract': wz_item['digest'],
                'publish_time': _format_time(wz_item['datetime']),
                'wechat_id': item['id'],
                'qunfa_id': wz_item['qunfa_id'],
                'type': wz_item['type'],
                'create_time': now_str,
                'update_time': now_str,
            })

        # Remember the newest message id seen for this account.
        if last_qunfa_id < cur_qunfa_id:
            mysql.where_sql = "id=%s" % (item['id'])
            mysql.table('wechat_wechat').save({
                'last_qunfa_id': cur_qunfa_id,
                'last_qufa_time': qunfa_time,
                'update_time': _format_time(),
            })
    except KeyboardInterrupt:
        # Ctrl-C ends the crawl.
        break
    # NOTE: the author disabled the catch-all handler that would log and
    # move on to the next account, so other exceptions stop the script.
print('最新success')
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/studyLong/wxspider.git
git@gitee.com:studyLong/wxspider.git
studyLong
wxspider
wxspider
master

搜索帮助

0d507c66 1850385 C8b1a773 1850385