# (hosting-page artifact removed: "代码拉取完成,页面将自动刷新" is a Gitee banner, not source code)
#!/usr/bin/env python
# coding=utf-8
import sys
reload(sys) # 不加这部分处理中文还是会出问题
sys.setdefaultencoding('utf-8')
import urllib
import mysql
from bs4 import BeautifulSoup
import config
domain = "http://www.qinan.gov.cn"
url = "http://www.qinan.gov.cn/html/news/gphq/"
# get all pages
# Collect every pagination link from the price-index page and store each one.
def getGpjgHtml():
    """Fetch the index page at the module-level `url`, find the pager
    element (id="pages"), and store every absolute page link via
    gpjgHtmlStore()."""
    soup = BeautifulSoup(urllib.urlopen(url), "html.parser")
    pager = soup.find(id="pages")
    for node in pager.children:
        # Only anchor tags that actually carry an href are page links.
        if node.name == 'a' and 'href' in node.attrs:
            gpjgHtmlStore(domain + node['href'])
# get all hyperlink in one page
# get all hyperlinks in one page
def gpjgAll(url1=""):
    """Scrape one paginated index page and store every article link.

    Parameters:
        url1: absolute URL of one index page (from gpjgUrlsFromDB).

    For each <li> that contains an anchor, stores (href, link text,
    date text from the sibling <span>) via gpjgAllStore().
    """
    page = urllib.urlopen(url1)
    soup = BeautifulSoup(page, "html.parser")
    # Renamed from `list` -- the original shadowed the builtin.
    items = soup.ul
    for item in items.find_all_next('li'):
        # `is not None` instead of `!= None` (identity check for None).
        if item.a is not None:
            gpjgAllStore(item.a['href'], item.a.contents[0], item.span.contents[0])
# Store the hyperlink in one page
# Store one hyperlink scraped from an index page.
def gpjgAllStore(url='', title='', date=''):
    """Insert one (href, title, date) row into the gpjgAll table.

    Single quotes in the scraped values are doubled so a title containing
    an apostrophe no longer breaks the statement.
    NOTE(review): values are still string-concatenated into SQL (injection
    risk from scraped content); switch to parameterized queries if the
    project `mysql` wrapper supports them.
    """
    esc = lambda s: str(s).replace("'", "''")
    query1 = ("INSERT INTO gpjgAll (href, title, date) values ('"
              + esc(url) + "','" + esc(title) + "','" + esc(date) + "')")
    query(query1)
def gpjgAllFromDB():
    """Return distinct article hrefs from gpjgAll whose title contains
    the configured apple keyword (config.AppleSign)."""
    sql = ("select distinct href from gpjgAll WHERE title like "
           + "'%" + config.AppleSign + "%'")
    conn = mysql.connect()
    rows = mysql.getData(conn, sql, "gpjgAll")
    mysql.close(conn)
    return rows
#Store All page`s link
# Store one pagination-page link.
def gpjgHtmlStore(href=''):
    """Insert one pagination-page URL into the gpjgHtml table.

    Single quotes in the URL are doubled so an odd href cannot break the
    statement. NOTE(review): still string-built SQL; prefer parameterized
    queries if the project `mysql` wrapper supports them.
    """
    query1 = ("INSERT INTO gpjgHtml (href) values ('"
              + str(href).replace("'", "''") + "')")
    query(query1)
#Get All page`s links from db with distinct
# Get all distinct pagination-page links previously stored in the DB.
def gpjgUrlsFromDB():
    """Return distinct pagination-page URLs from the gpjgHtml table."""
    sql = "select distinct href from gpjgHtml"
    conn = mysql.connect()
    rows = mysql.getData(conn, sql, "gpjgHtml")
    mysql.close(conn)
    return rows
# Run a single write statement on its own short-lived DB connection.
# NOTE: the parameter shadows the function name; kept as-is because
# callers may pass it by keyword.
def query(query=''):
    db = mysql.connect()
    mysql.query(db, query)
    mysql.close(db)
# width 为一个url里面的某个表的宽度, table为存储该表的数据表名称
def getApplePrice(url='', width='', table = ''):
page = urllib.urlopen(url)
soup = BeautifulSoup(page,"html.parser")
Apple = soup.find(height="54")
if config.AppleSign in Apple.string:
temp = soup.find(bgcolor="#FAE2E4")
date = temp.string[0:10]
pages = soup.find(style="width:"+ str(width) +"px;")
if not pages:
print "fuck"
pages = soup.find(style="width: "+ str(width) +"px")
try:
for item in pages.find('tr').find_next_siblings('tr'):
result = []
for td in item.select('td'):
result.append(td.string)
sql = "insert into `"+ table +"` (variety, place, package, price, date) VALUES ('" + result[0] +"','"+ result[1] +"','"+ result[2] +"','"+ result[3]+"','"+date +"')"
query(sql)
except:
print "There is no qax in "+ url
else:
print "NOT APPLE"
def resetDB():
    """Wipe every scrape table so a fresh run starts from an empty DB."""
    for name in ("bjxfd", "qax", "zjjx", "gpjgAll", "gpjgHtml"):
        query("delete from `" + name + "`;")
def main():
resetDB()
getGpjgHtml()
pages = gpjgUrlsFromDB()
for page in pages:
gpjgAll(page['href'])
urls = gpjgAllFromDB()
for url in urls:
print url['href']
getApplePrice(url['href'], config.zjjxWidth, config.zjjxTable)
getApplePrice(url['href'], config.bjxfdWidth, config.bjxfdTable)
getApplePrice(url['href'], config.qaxWidth, config.qaxTable)
# Guard the entry point so importing this module does not trigger a scrape.
if __name__ == "__main__":
    main()
# Sample single-page invocation:
# getApplePrice("http://www.qinan.gov.cn/html/2017/gphq_1122/23310.html",591,"qax")
# (hosting-page artifact removed: Gitee content-moderation boilerplate
#  "此处可能存在不合适展示的内容…" -- not part of the source file)