# (hosting-page artifact removed: "代码拉取完成,页面将自动刷新" is a Gitee banner, not source code)
#!/usr/bin/env python
# coding=utf-8
import sys
reload(sys) # 不加这部分处理中文还是会出问题
sys.setdefaultencoding('utf-8')
import urllib
import mysql
from bs4 import BeautifulSoup
import config
domain = "http://www.qinan.gov.cn"
url = "http://www.qinan.gov.cn/html/news/gphq/"
# get all pages
# Collect every pagination link from the price-index page and store each one.
def getGpjgHtml():
    """Fetch the index page at the module-level `url`, find the pager
    element (id="pages"), and store every absolute page link via
    gpjgHtmlStore()."""
    soup = BeautifulSoup(urllib.urlopen(url), "html.parser")
    pager = soup.find(id="pages")
    for node in pager.children:
        # Only anchor tags that actually carry an href are page links.
        if node.name == 'a' and 'href' in node.attrs:
            gpjgHtmlStore(domain + node['href'])
# get all hyperlink in one page
# get all hyperlinks in one page
def gpjgAll(url1=""):
    """Scrape one paginated index page and store every article link.

    Parameters:
        url1: absolute URL of one index page (from gpjgUrlsFromDB).

    For each <li> that contains an anchor, stores (href, link text,
    date text from the sibling <span>) via gpjgAllStore().
    """
    page = urllib.urlopen(url1)
    soup = BeautifulSoup(page, "html.parser")
    # Renamed from `list` -- the original shadowed the builtin.
    items = soup.ul
    for item in items.find_all_next('li'):
        # `is not None` instead of `!= None` (identity check for None).
        if item.a is not None:
            gpjgAllStore(item.a['href'], item.a.contents[0], item.span.contents[0])
# Store the hyperlink in one page
# Store one hyperlink scraped from an index page.
def gpjgAllStore(url='', title='', date=''):
    """Insert one (href, title, date) row into the gpjgAll table.

    Single quotes in the scraped values are doubled so a title containing
    an apostrophe no longer breaks the statement.
    NOTE(review): values are still string-concatenated into SQL (injection
    risk from scraped content); switch to parameterized queries if the
    project `mysql` wrapper supports them.
    """
    esc = lambda s: str(s).replace("'", "''")
    query1 = ("INSERT INTO gpjgAll (href, title, date) values ('"
              + esc(url) + "','" + esc(title) + "','" + esc(date) + "')")
    query(query1)
def gpjgAllFromDB():
    """Return distinct article hrefs from gpjgAll whose title contains
    the configured apple keyword (config.AppleSign)."""
    sql = ("select distinct href from gpjgAll WHERE title like "
           + "'%" + config.AppleSign + "%'")
    conn = mysql.connect()
    rows = mysql.getData(conn, sql, "gpjgAll")
    mysql.close(conn)
    return rows
#Store All page`s link
# Store one pagination-page link.
def gpjgHtmlStore(href=''):
    """Insert one pagination-page URL into the gpjgHtml table.

    Single quotes in the URL are doubled so an odd href cannot break the
    statement. NOTE(review): still string-built SQL; prefer parameterized
    queries if the project `mysql` wrapper supports them.
    """
    query1 = ("INSERT INTO gpjgHtml (href) values ('"
              + str(href).replace("'", "''") + "')")
    query(query1)
#Get All page`s links from db with distinct
# Get all distinct pagination-page links previously stored in the DB.
def gpjgUrlsFromDB():
    """Return distinct pagination-page URLs from the gpjgHtml table."""
    sql = "select distinct href from gpjgHtml"
    conn = mysql.connect()
    rows = mysql.getData(conn, sql, "gpjgHtml")
    mysql.close(conn)
    return rows
# Run a single write statement on its own short-lived DB connection.
# NOTE: the parameter shadows the function name; kept as-is because
# callers may pass it by keyword.
def query(query=''):
    db = mysql.connect()
    mysql.query(db, query)
    mysql.close(db)
# width 为一个url里面的某个表的宽度, table为存储该表的数据表名称
def getApplePrice(url='', width='', table = ''):
page = urllib.urlopen(url)
soup = BeautifulSoup(page,"html.parser")
Apple = soup.find(height="54")
if config.AppleSign in Apple.string:
temp = soup.find(bgcolor="#FAE2E4")
date = temp.string[0:10]
pages = soup.find(style="width:"+ str(width) +"px;")
if not pages:
print "fuck"
pages = soup.find(style="width: "+ str(width) +"px")
try:
for item in pages.find('tr').find_next_siblings('tr'):
result = []
for td in item.select('td'):
result.append(td.string)
sql = "insert into `"+ table +"` (variety, place, package, price, date) VALUES ('" + result[0] +"','"+ result[1] +"','"+ result[2] +"','"+ result[3]+"','"+date +"')"
query(sql)
except:
print "There is no qax in "+ url
else:
print "NOT APPLE"
def resetDB():
    """Wipe every scrape table so a fresh run starts from an empty DB."""
    for name in ("bjxfd", "qax", "zjjx", "gpjgAll", "gpjgHtml"):
        query("delete from `" + name + "`;")
def main():
resetDB()
getGpjgHtml()
pages = gpjgUrlsFromDB()
for page in pages:
gpjgAll(page['href'])
urls = gpjgAllFromDB()
for url in urls:
print url['href']
getApplePrice(url['href'], config.zjjxWidth, config.zjjxTable)
getApplePrice(url['href'], config.bjxfdWidth, config.bjxfdTable)
getApplePrice(url['href'], config.qaxWidth, config.qaxTable)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == "__main__":
    main()
# Sample single-page invocation:
# getApplePrice("http://www.qinan.gov.cn/html/2017/gphq_1122/23310.html",591,"qax")
# (hosting-page artifact removed: Gitee content-moderation boilerplate
#  "此处可能存在不合适展示的内容…" -- not part of the source file)