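# NOTE: web-page banner captured with this file, not part of the script:
# "Code pull complete; the page will refresh automatically."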
import pandas as pd

# Load the dataset; the file uses ';' as its field separator.
data = pd.read_csv('student-mat.csv', sep=';')

# Descriptive statistics for the columns describe() summarizes by default.
print(data.describe())

# Correlation of each attribute with the target column G3, sorted ascending.
# Only numeric/bool columns are kept before calling corr(): the frame is
# one-hot encoded later on, so it presumably still holds string columns here,
# and pandas >= 2.0 raises on those instead of silently dropping them.
numeric_columns = data.select_dtypes(include=['number', 'bool'])
print(numeric_columns.corr()['G3'].sort_values())

# Number of missing values per column.
print(data.isnull().sum())

# Outlier check via box plots. boxplot() returns the plot's Axes object, so
# this print shows that object's text form; the figure itself is rendered by
# whichever plotting backend is active.
print(data.boxplot())
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# One-hot encode the categorical columns.
data_encoded = pd.get_dummies(data)

# Rescale every column (features and the target G3 alike) with MinMaxScaler.
# NOTE(review): the scaler is fit on the full dataset before any split, so
# held-out rows influence the scaling, and the RMSE values printed later in
# this script are expressed in scaled G3 units rather than raw grades.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_encoded)
data_scaled = pd.DataFrame(scaled_values, columns=data_encoded.columns)

# Separate the target from the features.
y = data_scaled['G3']
X = data_scaled.drop(columns='G3')

# First hold out 15% as the test set, then take 17.65% of the remaining 85%
# (about 15% of the full data) as the validation set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, random_state=42
)
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Baseline: a decision tree with default hyperparameters, trained on the
# training split only.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

# Evaluate the baseline on the validation split and report its RMSE.
y_val_pred = tree_reg.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(mse)
print('Validation set RMSE:', rmse)
# Grid search over tree depth and minimum leaf size, using 5-fold
# cross-validation on the combined train+validation data and scoring by
# negative mean squared error.
from sklearn.model_selection import GridSearchCV
params = {'max_depth': range(1, 21), 'min_samples_leaf': range(1, 11)}
tree_reg_grid = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(
    tree_reg_grid,
    params,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=False,
)
grid_search.fit(X_train_val, y_train_val)

# Report the best hyperparameter combination found.
print(grid_search.best_params_)

# Train the final model with the hyperparameters the search actually selected
# (previously hard-coded to max_depth=10, min_samples_leaf=2, which could
# disagree with the printed result), then evaluate once on the test set.
best_reg = DecisionTreeRegressor(random_state=42, **grid_search.best_params_)
best_reg.fit(X_train_val, y_train_val)
y_test_pred = best_reg.predict(X_test)
mse = mean_squared_error(y_test, y_test_pred)
rmse = np.sqrt(mse)
print('Test set RMSE:', rmse)
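# NOTE: hosting-site notice captured with this file, not part of the script:
# "This section may contain content unsuitable for display and is hidden by
# the page; use the edit function to review and revise it. If the content
# involves no inappropriate language, advertising, violence, vulgar material,
# infringement, piracy, false or worthless content, or anything violating
# applicable laws, submit an appeal and it will be handled promptly."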