saber110/wx

Note: this repository declares no open-source license file (LICENSE); check the project description and its upstream code dependencies before use.
gpjg.py 3.33 KB
saber110 committed on 2018-09-13 16:06 · crawler finalized
#!/usr/bin/env python
# coding=utf-8
import sys
reload(sys)  # without this, handling Chinese text still raises encoding errors (Python 2)
sys.setdefaultencoding('utf-8')
import urllib
import mysql       # project-local DB helper (connect/getData/query/close), not shown on this page
from bs4 import BeautifulSoup
import config      # project-local settings (AppleSign keyword, table names/widths), not shown on this page

domain = "http://www.qinan.gov.cn"
url = "http://www.qinan.gov.cn/html/news/gphq/"
# collect the URL of every pagination page of the listing index
def getGpjgHtml():
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    pages = soup.find(id="pages")
    for item in pages.children:
        if item.name == 'a':
            if 'href' in item.attrs:
                gpjgHtmlStore(domain + item['href'])
# get all article hyperlinks (href, title, date) on one pagination page
def gpjgAll(url1=""):
    page = urllib.urlopen(url1)
    soup = BeautifulSoup(page, "html.parser")
    ul = soup.ul
    for item in ul.find_all_next('li'):
        if item.a is not None:
            gpjgAllStore(item.a['href'], item.a.contents[0], item.span.contents[0])
# store one article hyperlink found on a page
def gpjgAllStore(url='', title='', date=''):
    query1 = "INSERT INTO gpjgAll (href, title, date) values ('" + url + "','" + title + "','" + date + "')"
    query(query1)
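
# The INSERTs above and below build SQL by string concatenation, so a quote in a
# scraped title breaks the statement and leaves the code open to SQL injection.
# A minimal parameterized sketch (gpjgAllStoreParam is a hypothetical helper; it
# assumes the local mysql module's connect() returns a DB-API connection object,
# e.g. from MySQLdb or pymysql):
def gpjgAllStoreParam(url='', title='', date=''):
    db = mysql.connect()
    cursor = db.cursor()
    cursor.execute("INSERT INTO gpjgAll (href, title, date) VALUES (%s, %s, %s)",
                   (url, title, date))
    db.commit()
    cursor.close()
    mysql.close(db)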
# fetch the distinct article links whose title contains the apple keyword (config.AppleSign)
def gpjgAllFromDB():
    query1 = "select distinct href from gpjgAll WHERE title like " + "'%" + config.AppleSign + "%'"
    db = mysql.connect()
    result = mysql.getData(db, query1, "gpjgAll")
    mysql.close(db)
    return result
# store one pagination page's link
def gpjgHtmlStore(href=''):
    query1 = "INSERT INTO gpjgHtml (href) values ('" + str(href) + "')"
    query(query1)
# get all distinct pagination page links from the DB
def gpjgUrlsFromDB():
    query1 = "select distinct href from gpjgHtml"
    db = mysql.connect()
    result = mysql.getData(db, query1, "gpjgHtml")
    mysql.close(db)
    return result
def query(query=''):
    db = mysql.connect()
    mysql.query(db, query)
    mysql.close(db)
# width is the pixel width of the target table on the page;
# table is the name of the DB table that stores its rows
def getApplePrice(url='', width='', table=''):
    page = urllib.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    Apple = soup.find(height="54")
    if config.AppleSign in Apple.string:
        temp = soup.find(bgcolor="#FAE2E4")
        date = temp.string[0:10]
        pages = soup.find(style="width: " + str(width) + "px;")
        for item in pages.find('tr').find_next_siblings('tr'):
            result = []
            for td in item.select('td'):
                result.append(td.string)
            sql = "insert into `" + table + "` (variety, place, package, price, date) VALUES ('" + result[0] + "','" + result[1] + "','" + result[2] + "','" + result[3] + "','" + date + "')"
            query(sql)
            print sql
    else:
        print "NOT APPLE"
# clear all crawler tables before a fresh run
def resetDB():
    sql = "delete from `bjxfd`;"
    query(sql)
    sql = "delete from `qax`;"
    query(sql)
    sql = "delete from `zjjx`;"
    query(sql)
    sql = "delete from `gpjgAll`;"
    query(sql)
    sql = "delete from `gpjgHtml`;"
    query(sql)
def main():
    resetDB()
    getGpjgHtml()
    pages = gpjgUrlsFromDB()
    for page in pages:
        gpjgAll(page['href'])
    urls = gpjgAllFromDB()
    for url in urls:
        getApplePrice(url['href'], config.zjjxWidth, config.zjjxTable)
        getApplePrice(url['href'], config.bjxfdWidth, config.bjxfdTable)
        getApplePrice(url['href'], config.qaxWidth, config.qaxTable)
#main()
getApplePrice("http://www.qinan.gov.cn/html/2018/gphq_0510/26891.html", 584, "zjjx")  # single-page test run while main() stays commented out
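
The `config` module is likewise project-local and not shown. The code references `config.AppleSign` (the keyword that marks an apple price table) plus width/table pairs for three markets (`zjjx`, `bjxfd`, `qax`), whose table names also appear in resetDB(). A hypothetical sketch with placeholder values; only the 584-pixel width for `zjjx` can be read off the test call above:

# coding=utf-8
# config.py -- hypothetical sketch; all values are placeholders except zjjxWidth,
# which matches the 584 used in the single-page test call in gpjg.py
AppleSign = "苹果"   # assumed: Chinese keyword identifying an apple price table

zjjxWidth = 584
zjjxTable = "zjjx"

bjxfdWidth = 0       # placeholder: actual pixel width unknown
bjxfdTable = "bjxfd"

qaxWidth = 0         # placeholder: actual pixel width unknown
qaxTable = "qax"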