代码拉取完成,页面将自动刷新
# -*- coding: utf-8 -*-
"""
@File Name:链家二手房.py
@Author: name
@Mail: name@qq.com
@Created Time: 2021/7/17 17:33
@Description: 郑州链家楼盘
"""
import os
import io
import sys
import yaml
import time
import logging
import pymysql
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from ratelimiter import RateLimiter
from logging.handlers import RotatingFileHandler
from urllib.parse import urlparse, ParseResult, urlunparse
# 解决相应结果页面编码问题
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
class connect_mysql(object):
# 执行下面的execute_sql方法时会自动执行该初始化方法进行连接数据库
def __init__(self, config):
self.config = config
# 建立连接
self.conn = pymysql.connect(
host=self.config['host'],
port=self.config['port'],
user=self.config['user'],
password=self.config['password'],
db=self.config['database'],
charset='utf8'
)
# 创建游标
# pymysql.cursors.DictCursor 返回字典列表格式数据
self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)
def execute_sql(self, sql):
"""
执行sql语句,并commit提交
:param sql:需要执行的sql语句
:return:
"""
self.cursor.execute(sql)
self.conn.commit()
def execute_many_sql(self, sql, data):
self.cursor.executemany(sql, data)
self.conn.commit()
def get_data(self):
"""
获得查询数据
:return: 返回查到的数据
"""
data = self.cursor.fetchall()
return data
def close_oracle(self):
# 关闭游标
self.cursor.close()
# 关闭数据库连接
self.conn.close()
class MyDaemon(object):
def __init__(self):
self.url = "https://zz.lianjia.com/loupan/pg{0}"
self.headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
@RateLimiter(max_calls=1, period=30)
def send_request(self, url):
"""链家楼盘"""
resp = requests.get(url, headers=self.headers)
if resp.status_code == 200:
return resp
def parse_html(self, resp):
"""解析html页面"""
# 解析url链接中的各个部分
o = urlparse(resp.url)
html = resp.text
bs = BeautifulSoup(html, 'lxml')
ul = bs.find('ul', class_='resblock-list-wrapper')
li_list = ul.find_all('li')
lst = []
for item in li_list:
img_link = item.find('a', class_='resblock-img-wrapper')['href']
# 组合parse到的协议/网络地址,以及目的页面的相对路径
t = ParseResult(o.scheme, o.netloc, img_link, "", "", "")
img_link = urlunparse(t)
text_list = item.find('div', class_='resblock-name')
title = text_list.find('a', class_='name').text
resblock_type = text_list.find('span', class_='resblock-type').text
sale_status = text_list.find('span', class_='sale-status').text
houseInfo = item.find('div', class_='resblock-location')
locat_list = houseInfo.find_all('span')
area = locat_list[0].text
district = locat_list[1].text
address_test = houseInfo.find('a').text
address_link = houseInfo.find('a')['href']
t = ParseResult(o.scheme, o.netloc, address_link, "", "", "")
address_link = urlunparse(t)
resblock_room = item.find('a', class_='resblock-room').text.replace('\n', '')
resblock_area = item.find('div', class_='resblock-area').text.split(' ')[-1] if item.find('div',
class_='resblock-area') else ''
unit_price = item.find('div', class_='main-price').find('span', class_='number').text
total_price = item.find('div', class_='second').text[2:] if item.find('div', class_='second') else ''
tag = item.find('div', class_='resblock-tag').text.replace('\n', ',').strip(',')
lst.append(
(title, resblock_type, sale_status, area, district, address_test, resblock_room, resblock_area,
unit_price, total_price, tag, img_link, address_link))
time.sleep(1)
# print(self.page, lst)
# 保存数据
self.save_data(lst)
def save_data(self, data):
"""保存数据"""
"""保存文件 or 数据库都可"""
insert_sql = "insert into zz_lianjia_loupan (title, resblock_type, sale_status, area, district, address_test, resblock_room,resblock_area, unit_price, total_price, tag, img_link, address_link) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
self.db.execute_many_sql(insert_sql, data)
print(self.page, '插入完毕')
def select_data(self):
"""查询数据"""
sql = "select * from zz_lianjia_loupan"
self.db.execute_sql(sql)
res = self.db.get_data()
return res
def format_data(self, data):
"""格式化数据"""
print(data)
df = pd.DataFrame(data,columns=data[0].keys())
print(df)
df.to_excel('../data/lianjia_loupan.xls',index=False)
logging.info('保存成功')
def run(self):
## 如果没有此文件,请手动配置config 字典
## {'mysql': {'host': '', 'port': 3306, 'user': '', 'password': '', 'database': ''},...}
config_file = open(os.path.dirname(os.path.abspath(__file__)) + '/config.yaml', encoding='utf-8')
self.config = yaml.safe_load(config_file)
config_file.close()
########### 如有报错 此块注释 #########
name = 'lianjie_house'
logging.basicConfig(level=logging.INFO)
handler = RotatingFileHandler(os.path.dirname(os.path.abspath(__file__)) + '/../logs/%s.log' % name,
maxBytes=134217728, backupCount=7)
formatter = logging.Formatter('%(asctime)s - %(lineno)d- %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)
requests_log = logging.getLogger("requests.packages.urllib3")
requests_log.setLevel(logging.ERROR)
########### 如有报错 此块注释 #########
logging.warning('启动 [%s]', name)
logging.warning('主线程 PID [%s]', os.getpid())
self.db = connect_mysql(self.config['mysql'])
data = self.select_data()
if not data:
# 链家楼盘
for i in range(1, 95):
self.page = i
full_url = self.url.format(i)
resp = self.send_request(full_url)
self.parse_html(resp)
else:
self.format_data(data)
self.db.close_oracle()
if __name__ == '__main__':
my_daemon = MyDaemon()
my_daemon.run()
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。