# NOTE: stray hosting-site banner text ("code fetched; page will auto-refresh"),
# not part of the program -- kept here commented out so the file parses:
# 代码拉取完成,页面将自动刷新
# -*- coding:utf-8 -*-
# NOTE: the coding declaration must appear on line 1 or 2 of the file for
# Python 2 to honor it -- it is placed first in this block for that reason.
__author__ = 'Vivienfanghua'

import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import codecs
import socket

# Abort any single HTTP request after 10 seconds so one stuck connection
# cannot hang the whole crawl.
socket.setdefaulttimeout(10)

# Site root; relative hrefs scraped from pages are joined onto this.
baseurl = "http://poi86.com"

# Present ourselves as Baiduspider; the site serves crawlers normally.
user_agent = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
headers = {'User-Agent': user_agent}

# UTF-8 output streams:
#   a -- scraped POI records, one POI per line
#   b -- POI-detail URLs that failed to download/parse
#   c -- district/listing URLs that failed
a = codecs.open("poi_115.txt", "w", "utf8")
b = codecs.open("error_115.txt", "w", "utf8")
c = codecs.open("page_error_115.txt", "w", "utf8")
def handler(signum, frame):
    """Signal handler that aborts the interrupted operation.

    Raising AssertionError lets a caller that installed this handler
    (e.g. together with signal.alarm) break out of a blocking call;
    the arguments follow the standard signal-handler signature and
    are ignored.
    """
    raise AssertionError()
def getOnePOI(url):
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content=response.read().decode("utf-8")
soup=BeautifulSoup(content)
items=soup.select(".list-group-item")
for i in items:
a.write(i.getText().split(u":")[1]+u" ")
a.write(u"\n")
print url
except Exception,e:
b.write(url+u"\n")
getOnePOI(url)
print e.message
return
def getDistrictOne(url):
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content=response.read().decode("utf-8")
soup=BeautifulSoup(content)
trs=soup.select("tr")
urls=[i.select("a")[0].get("href") for i in trs if len(i.select("a"))>0]
for i in urls:
getOnePOI(baseurl+i)
except Exception,e:
c.write(url+u"\n")
getDistrictOne(url)
print e.message
return
def getDistrict(url):
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content=response.read().decode("utf-8")
soup=BeautifulSoup(content)
num=0
pagearea=soup.select(".disabled")
if len(pagearea)>1:
numtext=pagearea[1].getText()
num=int(numtext.split("/")[1])
if num==0:
return
for i in range(115,num+1):
urli=re.sub(re.compile("\d\.html"),str(i)+".html",url)
print urli
getDistrictOne(urli)
except Exception,e:
c.write(url+u"\n")
getDistrict(url)
print e.message
return
def getPOI(url):
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content=response.read().decode("utf-8")
soup=BeautifulSoup(content)
list=soup.find_all(href=re.compile('/poi/amap/district/'))
urls=[i.get("href") for i in list]
for i in urls:
print i
getDistrict(baseurl+i)
except Exception,e:
print e.message
return
if __name__ == "__main__":
    # Crawl a single district (adcode 420111, Wuhan/Huangpi) from the
    # hard-coded resume page.  Close the output files in a finally block so
    # buffered lines are flushed even if the crawl dies with an uncaught
    # error (the original left them open in that case).
    try:
        getDistrict("http://www.poi86.com/poi/amap/district/420111/1.html")
    finally:
        a.close()
        b.close()
        c.close()
    # getPOI("http://poi86.com/poi/amap/city/330100.html")
# NOTE: the two lines below are a content-moderation notice injected by the
# code-hosting site, not part of this script; kept here commented out:
# 此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
# 如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。