1 Star 1 Fork 2

茗趣yhj/链家楼盘

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
链家楼盘.py 7.03 KB
一键复制 编辑 原始数据 按行查看 历史
茗趣yhj 提交于 2021-09-10 05:05 . git init
# -*- coding: utf-8 -*-
"""
@File Name:链家二手房.py
@Author: name
@Mail: name@qq.com
@Created Time: 2021/7/17 17:33
@Description: 郑州链家楼盘
"""
import os
import io
import sys
import yaml
import time
import logging
import pymysql
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from ratelimiter import RateLimiter
from logging.handlers import RotatingFileHandler
from urllib.parse import urlparse, ParseResult, urlunparse
# 解决相应结果页面编码问题
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
class connect_mysql(object):
# 执行下面的execute_sql方法时会自动执行该初始化方法进行连接数据库
def __init__(self, config):
self.config = config
# 建立连接
self.conn = pymysql.connect(
host=self.config['host'],
port=self.config['port'],
user=self.config['user'],
password=self.config['password'],
db=self.config['database'],
charset='utf8'
)
# 创建游标
# pymysql.cursors.DictCursor 返回字典列表格式数据
self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)
def execute_sql(self, sql):
"""
执行sql语句,并commit提交
:param sql:需要执行的sql语句
:return:
"""
self.cursor.execute(sql)
self.conn.commit()
def execute_many_sql(self, sql, data):
self.cursor.executemany(sql, data)
self.conn.commit()
def get_data(self):
"""
获得查询数据
:return: 返回查到的数据
"""
data = self.cursor.fetchall()
return data
def close_oracle(self):
# 关闭游标
self.cursor.close()
# 关闭数据库连接
self.conn.close()
class MyDaemon(object):
def __init__(self):
self.url = "https://zz.lianjia.com/loupan/pg{0}"
self.headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
@RateLimiter(max_calls=1, period=30)
def send_request(self, url):
"""链家楼盘"""
resp = requests.get(url, headers=self.headers)
if resp.status_code == 200:
return resp
def parse_html(self, resp):
"""解析html页面"""
# 解析url链接中的各个部分
o = urlparse(resp.url)
html = resp.text
bs = BeautifulSoup(html, 'lxml')
ul = bs.find('ul', class_='resblock-list-wrapper')
li_list = ul.find_all('li')
lst = []
for item in li_list:
img_link = item.find('a', class_='resblock-img-wrapper')['href']
# 组合parse到的协议/网络地址,以及目的页面的相对路径
t = ParseResult(o.scheme, o.netloc, img_link, "", "", "")
img_link = urlunparse(t)
text_list = item.find('div', class_='resblock-name')
title = text_list.find('a', class_='name').text
resblock_type = text_list.find('span', class_='resblock-type').text
sale_status = text_list.find('span', class_='sale-status').text
houseInfo = item.find('div', class_='resblock-location')
locat_list = houseInfo.find_all('span')
area = locat_list[0].text
district = locat_list[1].text
address_test = houseInfo.find('a').text
address_link = houseInfo.find('a')['href']
t = ParseResult(o.scheme, o.netloc, address_link, "", "", "")
address_link = urlunparse(t)
resblock_room = item.find('a', class_='resblock-room').text.replace('\n', '')
resblock_area = item.find('div', class_='resblock-area').text.split(' ')[-1] if item.find('div',
class_='resblock-area') else ''
unit_price = item.find('div', class_='main-price').find('span', class_='number').text
total_price = item.find('div', class_='second').text[2:] if item.find('div', class_='second') else ''
tag = item.find('div', class_='resblock-tag').text.replace('\n', ',').strip(',')
lst.append(
(title, resblock_type, sale_status, area, district, address_test, resblock_room, resblock_area,
unit_price, total_price, tag, img_link, address_link))
time.sleep(1)
# print(self.page, lst)
# 保存数据
self.save_data(lst)
def save_data(self, data):
"""保存数据"""
"""保存文件 or 数据库都可"""
insert_sql = "insert into zz_lianjia_loupan (title, resblock_type, sale_status, area, district, address_test, resblock_room,resblock_area, unit_price, total_price, tag, img_link, address_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
self.db.execute_many_sql(insert_sql, data)
print(self.page, '插入完毕')
def select_data(self):
"""查询数据"""
sql = "select * from zz_lianjia_loupan"
self.db.execute_sql(sql)
res = self.db.get_data()
return res
def format_data(self, data):
"""格式化数据"""
print(data)
df = pd.DataFrame(data,columns=data[0].keys())
print(df)
df.to_excel('../data/lianjia_loupan.xls',index=False)
logging.info('保存成功')
def run(self):
## 如果没有此文件,请手动配置config 字典
## {'mysql': {'host': '', 'port': 3306, 'user': '', 'password': '', 'database': ''},...}
config_file = open(os.path.dirname(os.path.abspath(__file__)) + '/config.yaml', encoding='utf-8')
self.config = yaml.safe_load(config_file)
config_file.close()
########### 如有报错 此块注释 #########
name = 'lianjie_house'
logging.basicConfig(level=logging.INFO)
handler = RotatingFileHandler(os.path.dirname(os.path.abspath(__file__)) + '/../logs/%s.log' % name,
maxBytes=134217728, backupCount=7)
formatter = logging.Formatter('%(asctime)s - %(lineno)d- %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.ERROR)
########### 如有报错 此块注释 #########
logging.warning('启动 [%s]', name)
logging.warning('主线程 PID [%s]', os.getpid())
self.db = connect_mysql(self.config['mysql'])
data = self.select_data()
if not data:
# 链家楼盘
for i in range(1, 95):
self.page = i
full_url = self.url.format(i)
resp = self.send_request(full_url)
self.parse_html(resp)
else:
self.format_data(data)
self.db.close_oracle()
if __name__ == '__main__':
my_daemon = MyDaemon()
my_daemon.run()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/yhjkcjm/chain-home-real-estate.git
git@gitee.com:yhjkcjm/chain-home-real-estate.git
yhjkcjm
chain-home-real-estate
链家楼盘
master

搜索帮助