Vivienfanghua/TestPython

poi2.py 2.78 KB
Vivienfanghua committed on 2017-10-14 13:50 · first commit
__author__ = 'Vivienfanghua'
# -*- coding:utf-8 -*-
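# Crawler for poi86.com POI data: getDistrict() reads a district's page count and
# walks its listing pages, getDistrictOne() collects the POI detail links on each
# page, and getOnePOI() writes the field values of every POI to poi_115.txt.
# Failed URLs are logged to the error files and retried.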
import urllib
import urllib2
from bs4 import BeautifulSoup
import re
import codecs
import socket
socket.setdefaulttimeout(10)
baseurl="http://poi86.com"
user_agent = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
headers = { 'User-Agent' : user_agent }
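# Output files: scraped POI fields, failed POI detail pages, and failed listing pages.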
a=codecs.open("poi_115.txt", "w", "utf8")
b=codecs.open("error_115.txt", "w", "utf8")
c=codecs.open("page_error_115.txt", "w", "utf8")

# Unused leftover: the (signum, frame) signature matches a signal-based timeout
# handler; the socket default timeout above is what actually limits requests.
def handler(signum, frame):
    raise AssertionError

def getOnePOI(url):
    # Fetch a single POI detail page and append its field values to poi_115.txt.
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode("utf-8")
        soup = BeautifulSoup(content, "html.parser")
        items = soup.select(".list-group-item")
        for i in items:
            # Each list item reads "label：value"; keep only the value part.
            a.write(i.getText().split(u"：")[1] + u" ")
        a.write(u"\n")
        print url
    except Exception, e:
        # Log the failing URL, retry it recursively, then report the error.
        b.write(url + u"\n")
        getOnePOI(url)
        print e
        return

def getDistrictOne(url):
    # Fetch one listing page of a district and crawl every POI linked from its table rows.
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode("utf-8")
        soup = BeautifulSoup(content, "html.parser")
        trs = soup.select("tr")
        urls = [i.select("a")[0].get("href") for i in trs if len(i.select("a")) > 0]
        for i in urls:
            getOnePOI(baseurl + i)
    except Exception, e:
        # Log the failing listing page and retry it recursively.
        c.write(url + u"\n")
        getDistrictOne(url)
        print e
        return

def getDistrict(url):
    # Parse the district's pagination ("current/total") and walk its listing pages.
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode("utf-8")
        soup = BeautifulSoup(content, "html.parser")
        num = 0
        pagearea = soup.select(".disabled")
        if len(pagearea) > 1:
            # The second disabled pager element holds "current page/total pages".
            numtext = pagearea[1].getText()
            num = int(numtext.split("/")[1])
        if num == 0:
            return
        # Start from page 115, apparently resuming an earlier run (hence the *_115.txt names).
        for i in range(115, num + 1):
            # Swap the trailing "<digit>.html" of the entry URL for the current page number.
            urli = re.sub(re.compile("\d\.html"), str(i) + ".html", url)
            print urli
            getDistrictOne(urli)
    except Exception, e:
        c.write(url + u"\n")
        getDistrict(url)
        print e
        return

def getPOI(url):
    # From a city page, follow every district link and crawl each district in turn.
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode("utf-8")
        soup = BeautifulSoup(content, "html.parser")
        # Renamed from "list" to avoid shadowing the builtin.
        links = soup.find_all(href=re.compile('/poi/amap/district/'))
        urls = [i.get("href") for i in links]
        for i in urls:
            print i
            getDistrict(baseurl + i)
    except Exception, e:
        print e
        return

if __name__ == "__main__":
    getDistrict("http://www.poi86.com/poi/amap/district/420111/1.html")
    a.close()
    b.close()
    c.close()
    # getPOI("http://poi86.com/poi/amap/city/330100.html")