# (Web page notice, not part of the program) Code pull complete; the page will refresh automatically.
import requests
import re
from lxml import etree
class GetCommunityName(object):
    """Collect community names from the hangzhou.tuitui99.com listing pages.

    ``run`` walks the paginated "Community" listing, extracts the link text
    of every listed entry and yields one list of cleaned names per page.
    """

    # Characters that are deleted from every name.
    _STRIP_TABLE = str.maketrans("", "", ".·/")
    # Capture everything in front of the last "(...)" group.  ASCII and
    # full-width (U+FF08 / U+FF09) brackets get separate patterns; the
    # full-width one is written with escapes so that character
    # normalisation of this file cannot silently turn it into a no-op.
    _ASCII_PAREN_RE = re.compile(r"(.*)\(.*\)", re.S)
    _FULLWIDTH_PAREN_RE = re.compile("(.*)\uff08.*\uff09", re.S)

    def __init__(self):
        # Request headers sent with every listing-page request.
        self.headers = {
            "Host": "hangzhou.tuitui99.com",
            "Referer": "http://hangzhou.tuitui99.com",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
        }

    def generate_url(self, last_page=100):
        """Yield the URL of every listing page from 1 to ``last_page``.

        The first page has no page suffix; every later page uses the
        ``/Community/p<N>.html`` form.

        :param last_page: number of the last page to generate (inclusive).
        """
        for page in range(1, last_page + 1):
            if page == 1:
                yield "http://hangzhou.tuitui99.com/Community.html"
            else:
                yield "http://hangzhou.tuitui99.com/Community/p{}.html".format(page)

    def get_name_list(self, url, headers, timeout=10):
        """Download one listing page and return the raw names found on it.

        :param url: address of the listing page.
        :param headers: HTTP headers to send with the request.
        :param timeout: seconds to wait for the server before giving up, so
            that a stalled connection cannot block the crawl forever.
        :return: list of the text nodes selected from the page.
        """
        response = requests.get(url=url, headers=headers, timeout=timeout)
        content = response.content.decode()
        html = etree.HTML(content)
        return html.xpath("//ul[@class='h_list']/li//h3/a/text()")

    @classmethod
    def _clean_name(cls, name):
        """Return ``name`` without separator characters and bracketed suffix.

        ".", "·" and "/" are removed.  If the name then contains a bracketed
        part, only the text in front of the last "(...)" group is kept;
        full-width brackets take precedence over ASCII ones.  A name whose
        opening bracket is never closed is returned without cutting.
        """
        name = name.translate(cls._STRIP_TABLE)
        if "\uff08" in name:
            pattern = cls._FULLWIDTH_PAREN_RE
        elif "(" in name:
            pattern = cls._ASCII_PAREN_RE
        else:
            return name
        matched = pattern.match(name)
        # No match means the bracket is unbalanced; keep the name as it is
        # instead of failing on ``None.group``.
        return matched.group(1) if matched else name

    def run(self):
        """Yield, page by page, the list of cleaned community names."""
        for url in self.generate_url():
            raw_names = self.get_name_list(url, self.headers)
            # Build a new list position by position; looking entries up by
            # value would hit the wrong slot when two entries are equal.
            yield [self._clean_name(name) for name in raw_names]
if __name__ == "__main__":
    # Test: crawl the listing and print the list of names of each page.
    for page_names in GetCommunityName().run():
        print(page_names)
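# (Web page notice, not part of the program) This content may be unsuitable for display and is hidden by the page. You can review and revise it with the relevant editing features.
# (Web page notice, not part of the program) If you confirm the content contains no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or anything that violates applicable laws and regulations, you can submit an appeal and it will be handled as soon as possible.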