2 Star 2 Fork 0

moqsien/hz_fangchan

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
community.py 1.70 KB
一键复制 编辑 原始数据 按行查看 历史
moqsien 提交于 2018-06-25 13:18 . version0.01
import requests
import re
from lxml import etree
class GetCommunityName(object):
    """Scrape community names from the hangzhou.tuitui99.com listing pages.

    Typical use is to iterate over ``run()``, which yields one list of
    cleaned names per listing page.
    """

    # Characters that are deleted outright from every scraped name.
    _STRIP_CHARS = str.maketrans("", "", ".·/")

    # Captures everything before the last parenthesised group. Both ASCII
    # "(...)" and full-width "（...）" parentheses are recognised.
    # NOTE(review): the original code had two branches with an identical
    # ASCII "(" test; the first one presumably targeted full-width
    # parentheses -- confirm against the upstream repository.
    _PAREN_SUFFIX = re.compile(r"(.*)[(（].*[)）]", re.S)

    def __init__(self):
        # Request headers sent with every page fetch.
        self.headers = {
            "Host": "hangzhou.tuitui99.com",
            "Referer": "http://hangzhou.tuitui99.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
        }

    def generate_url(self, pages=100):
        """Yield the listing page URLs for pages 1 through ``pages``.

        The first page has its own URL; every later page uses the
        ``/Community/p<N>.html`` form.
        """
        for page in range(1, pages + 1):
            if page == 1:
                yield "http://hangzhou.tuitui99.com/Community.html"
            else:
                yield "http://hangzhou.tuitui99.com/Community/p{}.html".format(page)

    def get_name_list(self, url, headers, timeout=10):
        """Fetch ``url`` and return the raw community names found on it.

        ``timeout`` (seconds) is passed to ``requests.get`` so that a stalled
        connection raises instead of blocking forever.
        """
        response = requests.get(url=url, headers=headers, timeout=timeout)
        content = response.content.decode()
        html = etree.HTML(content)
        return html.xpath("//ul[@class='h_list']/li//h3/a/text()")

    @classmethod
    def _clean_name(cls, name):
        """Return ``name`` without ".", "·", "/" and any parenthesised suffix."""
        name = name.translate(cls._STRIP_CHARS)
        matched = cls._PAREN_SUFFIX.match(name)
        # A name with an unmatched parenthesis is kept as-is rather than
        # crashing on a failed match.
        return matched.group(1) if matched else name

    def run(self):
        """Yield, for each listing page, the list of cleaned community names."""
        for url in self.generate_url():
            raw_names = self.get_name_list(url, self.headers)
            yield [self._clean_name(name) for name in raw_names]
if __name__ == "__main__":
    # Smoke test: crawl every listing page and print its list of names.
    for page_names in GetCommunityName().run():
        print(page_names)
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/moqsien/hz_fangchan.git
git@gitee.com:moqsien/hz_fangchan.git
moqsien
hz_fangchan
hz_fangchan
master

搜索帮助