master

分支 (1)

管理

管理

master

wxspider
/
updatemp.py

# -*- coding: utf-8 -*-
#查找公众号最新文章

# 导入包
from wechatsogou import upload
from wechatsogou.tools import *
from wechatsogou import *
from PIL import Image
import datetime
import time
import logging
import logging.config
import random
import requests
from bs4 import BeautifulSoup
import re
import os
import shortuuid

# Logging: handlers and formatters are loaded from the relative path
# 'logging.conf'; the root logger is used for this script.
logging.config.fileConfig('logging.conf')
logger = logging.getLogger()

# Search API client, created without external cookies.
wechats = WechatSogouApi()

# To run with a logged-in Sogou WeChat session, supply external cookies:
#   1. Install the Chrome browser and add the EditThisCookie extension.
#   2. Log in to Sogou WeChat in Chrome.
#   3. Copy the current cookie information with EditThisCookie.
#   4. Paste it into cookies.txt in the code directory.
#   5. Enable the statement below (external-cookie client).
#wechats = WechatSogouApi(cookies_file={'file_name':'cookies.txt'})

# Counter incremented by the crawl loop for every new message it sees.
succ_count = 0

# Midnight at the start of the current local day.
now_time = datetime.datetime.combine(datetime.date.today(), datetime.time.min)

# Database handle. NOTE(review): this rebinds the imported name `mysql`
# to the instance, so the class is no longer reachable afterwards.
mysql = mysql('wechat_wechat')

# Load all official accounts stored in the database, highest id first.
mysql.order_sql = " order by id desc"
mp_list = mysql.find(0)

# Markup emitted for every iframe found in an article paragraph.
_IFRAME_TEMPLATE = '<span><iframe allowfullscreen=""  src="%s" frameborder="0" height="auto" scrolling="no" width="auto"></iframe></span>'


def _normalize_iframe_src(src):
    """Return *src* pointing at the player endpoint with auto-sized dimensions.

    'preview' is replaced by 'player', and any numeric ``height=``/``width=``
    query values are replaced by ``auto``.
    """
    src = src.replace('preview', 'player')
    src = re.sub(r'(height=\d+[\.\d+]*)', 'height=auto', src)
    src = re.sub(r'(width=\d+[\.\d+]*)', 'width=auto', src)
    return src


def _image_html(img):
    """Return the ``<img>`` markup for the tag *img* (which has a data-src).

    When the tag carries a ``data-w`` width the image is passed to
    ``upload.uploadImage``; otherwise the width is 'auto' and the original
    ``data-src`` is used directly. Any exception from the upload propagates
    to the caller.
    """
    src = img.get('data-src')
    width = img.get('data-w')
    if width:
        image_url = upload.uploadImage(src, width)
        print(image_url)
        # NOTE(review): the value returned by uploadImage is only used as a
        # success flag here; the emitted URL is always derived from data-src.
        # Confirm whether the uploaded URL should be used instead.
        if len(image_url) <= 1:
            image_url = str(src).replace('=', '.')
        else:
            image_url = src
    else:
        # Previously image_url was left unbound (or stale from the previous
        # image) on this path; fall back to the original source URL.
        width = 'auto'
        image_url = src
    return '<span><img style="width: %s" src="%s"></img></span>' % (width, image_url)


def _build_article_html(content_url):
    """Download the article at *content_url* and return simplified HTML.

    Only the ``div#js_content`` element is considered. For each ``<p>`` inside
    it the text, the images and the iframes are re-emitted as minimal markup;
    a ``span.vote_area`` element, if present, is removed first.
    """
    article = requests.get(content_url)
    page = BeautifulSoup(article.content)
    body = page.find('div', {'id': 'js_content'})
    soup = BeautifulSoup(str(body))

    vote = soup.find('span', {'class': 'vote_area'})
    if vote:
        vote.replace_with('')

    parts = []
    for p in soup.select('p'):
        text = p.get_text()
        if text:
            parts.append('<p>%s</p>' % (str(text),))

        for img in p.select('img'):
            if not img.get('data-src'):
                continue
            try:
                parts.append(_image_html(img))
            except Exception:
                # A failed upload skips this image only. KeyboardInterrupt is
                # deliberately not caught so Ctrl-C still stops the crawl.
                print(u"上传图片出错出错，继续")
                logger.exception("image upload failed for %s", img.get('data-src'))
                continue

        # Short pause between paragraphs to throttle upload traffic.
        time.sleep(0.1)

        for iframe in p.select('iframe'):
            for attr in ('src', 'data-src'):
                value = iframe.get(attr)
                if value:
                    parts.append(_IFRAME_TEMPLATE % (_normalize_iframe_src(value),))

    return ''.join(parts)


# Walk every official account, fetch its recent messages, store each new
# image-and-text article in wechat_topic, and record the newest broadcast id.
for item in mp_list:
    try:
        # Random delay between accounts.
        time.sleep(random.randrange(1, 3))

        # Last broadcast id already processed for this account.
        last_qunfa_id = item['last_qunfa_id']
        cur_qunfa_id = last_qunfa_id

        # Use the stored article-list URL, or look it up when the row has none.
        if 'wz_url' in item:
            wz_url = item['wz_url']
        else:
            wechat_info = wechats.get_gzh_info(item['wechatid'])
            if 'url' not in wechat_info:
                continue
            wz_url = wechat_info['url']

        print(item['name'])
        # Skip accounts whose URL is empty or missing (None).
        if not wz_url:
            continue

        # Fetch the recent messages; refresh the URL once if it has expired.
        wz_list = wechats.get_gzh_message(url=wz_url)
        if u'链接已过期' in wz_list:
            wechat_info = wechats.get_gzh_info(item['wechatid'])
            if 'url' not in wechat_info:
                continue
            print('guo qi sz chong xin huo qu success')
            wz_url = wechat_info['url']
            wz_list = wechats.get_gzh_message(url=wz_url)
            mysql.where_sql = "id=%s" % (item['id'])
            mysql.table('wechat_wechat').where({'id': item['id']}).save({'wz_url': wechat_info['url'], 'avatar': wechat_info['img'], 'qrcode': wechat_info['qrcode']})

        qunfa_time = ''
        for wz_item in wz_list:
            temp_qunfa_id = int(wz_item['qunfa_id'])
            if last_qunfa_id >= temp_qunfa_id:
                print(u"没有更新文章")
                print(u"")
                continue
            if cur_qunfa_id < temp_qunfa_id:
                cur_qunfa_id = temp_qunfa_id
                qunfa_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(wz_item['datetime']))
            succ_count += 1

            # type == 49 denotes an image-and-text message; ignore the rest.
            if wz_item['type'] != '49':
                continue

            print(succ_count)
            print(wz_item['content_url'])
            if not wz_item['content_url']:
                continue
            time.sleep(0.5)

            res = unicode(_build_article_html(wz_item['content_url']))
            print(res)

            article_info = wechats.deal_article(url=wz_item['content_url'])
            if not article_info:
                continue

            # To also save the article locally, wechats.down_html(
            # article_info['yuan'], wz_item['title']) returns the path of the
            # downloaded HTML file (requires urllib2, httplib2, BeautifulSoup4).

            # Articles with fewer than 20 characters of content are not stored.
            if len(str(res)) < 20:
                print(u"插入数据库出错")
                continue

            now_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
            uniqueid = shortuuid.ShortUUID().random(length=32) + str(time.time()).replace('.', '')
            mysql.table('wechat_topic').add({
                'uniqueid': uniqueid,
                'title': wz_item['title'],
                # Closing tags were previously emitted in the wrong order.
                'content': '<html><body>' + str(res) + '</body></html>',
                'avatar': wz_item['cover'],
                'abstract': wz_item['digest'],
                'publish_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(wz_item['datetime'])),
                'wechat_id': item['id'],
                'qunfa_id': wz_item['qunfa_id'],
                'type': wz_item['type'],
                'create_time': now_str,
                'update_time': now_str,
            })

        # Record the newest broadcast id seen for this account.
        if last_qunfa_id < cur_qunfa_id:
            mysql.where_sql = "id=%s" % (item['id'])
            mysql.table('wechat_wechat').save({'last_qunfa_id': cur_qunfa_id, 'last_qufa_time': qunfa_time, 'update_time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))})
    except KeyboardInterrupt:
        break
    # By default any other error stops the script. To keep crawling the
    # remaining accounts after an error, enable the handler below:
    # except Exception:
    #     print(u"出错，继续")
    #     continue

print('最新success')