1 Star 0 Fork 0

0_0请用洛必达/训练赛7

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
聚类.py 7.61 KB
一键复制 编辑 原始数据 按行查看 历史
0_0请用洛必达 提交于 2024-08-24 21:00 . 4
import os
# NOTE(review): presumably set to silence the sklearn KMeans / MKL memory-leak
# warning on Windows by forcing single-threaded OpenMP — confirm.
os.environ["OMP_NUM_THREADS"] = '1'
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from kmodes.kmodes import KModes
plt.rcParams['font.sans-serif'] = ['SimHei'] # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False # display the minus sign correctly
# Load the pre-cleaned data set
file_path = r"整理后的数据.csv"
data = pd.read_csv(file_path, index_col=0) # assumes the first column holds respondent names
# Split the columns into the three questionnaire scales by column-name substring
scl90_cols = [col for col in data.columns if 'SCL90' in col]
big_five_cols = [col for col in data.columns if '大五人格' in col]
maturity_cols = [col for col in data.columns if '职业成熟度' in col]
# SCL90 scale: coerce to numeric and drop incomplete rows (names kept in the index)
scl90_data = data[scl90_cols].apply(pd.to_numeric, errors='coerce').dropna()
# Big Five scale: coerce to numeric and drop incomplete rows (names kept in the index)
big_five_data = data[big_five_cols].apply(pd.to_numeric, errors='coerce').dropna()
# Career-maturity scale: coerce to numeric and drop incomplete rows (names kept in the index)
maturity_data = data[maturity_cols].apply(pd.to_numeric, errors='coerce').dropna()
# Generic helper for choosing the number of clusters
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
def determine_optimal_clusters(scaled_data, max_clusters=15):
    """Suggest a KMeans cluster count by minimising the Davies-Bouldin index.

    Fits KMeans for every k in 2..max_clusters on `scaled_data`, plots the
    index curve for visual inspection, prints and returns the k with the
    lowest (best) score.
    """
    candidate_ks = list(range(2, max_clusters + 1))
    db_scores = [
        davies_bouldin_score(
            scaled_data,
            KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(scaled_data),
        )
        for k in candidate_ks
    ]
    # Show the curve so the automatic choice can be sanity-checked by eye.
    plt.figure(figsize=(10, 5))
    plt.plot(candidate_ks, db_scores, marker='o')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Davies-Bouldin Index')
    plt.title('Davies-Bouldin Index for Different Numbers of Clusters')
    plt.show()
    # Lower is better; ties resolve to the smallest k, as with index(min(...)).
    optimal_k = min(zip(db_scores, candidate_ks))[1]
    print(f"推荐的最佳聚类数 (Davies-Bouldin Index): {optimal_k}")
    return optimal_k
def select_important_features(data, scaled_data, target_clusters, threshold=0.95):
    """Rank features by RandomForest importance and keep the smallest prefix
    whose cumulative importance reaches `threshold`.

    Parameters:
        data            -- original (unscaled) DataFrame; supplies column names.
        scaled_data     -- scaled feature matrix used to fit the forest.
        target_clusters -- cluster labels used as the supervision target.
        threshold       -- cumulative-importance coverage to reach (default 0.95).

    Returns an ndarray of the selected feature names.

    Bug fix: the previous mask (`cumulative <= threshold`) dropped the feature
    that crosses the threshold, so the kept set always covered strictly less
    than the requested share of importance.
    """
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(scaled_data, target_clusters)
    feature_importance_df = pd.DataFrame({
        'feature': data.columns,
        'importance': rf.feature_importances_
    }).sort_values(by='importance', ascending=False)
    print("特征重要性排序:")
    print(feature_importance_df)
    cumulative_importance = feature_importance_df['importance'].cumsum()
    # Keep a feature while the cumulative importance *before* it is still below
    # the threshold — this includes the feature that first crosses the threshold.
    keep_mask = cumulative_importance.shift(fill_value=0.0) < threshold
    top_features = feature_importance_df.loc[keep_mask, 'feature'].values
    # Degenerate threshold (<= 0): still return the single most important feature.
    if len(top_features) == 0:
        top_features = feature_importance_df.iloc[0:1]['feature'].values
    print(f"推荐使用的特征为: {top_features}")
    return top_features
# SCL90 cluster analysis
def analyze_scl90(scl90_data):
    """Cluster the SCL90 scale: pick k, prune features, fit a GMM, save labels."""
    scaler = StandardScaler()
    full_scaled = scaler.fit_transform(scl90_data)
    # Choose k on the full feature set via the Davies-Bouldin curve.
    optimal_k = determine_optimal_clusters(full_scaled)
    km_labels = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit_predict(full_scaled)
    # Use the KMeans labels as a supervision signal to rank and prune features.
    top_features = select_important_features(scl90_data, full_scaled, km_labels)
    top_scaled = scaler.fit_transform(scl90_data[top_features])
    # Final clustering: full-covariance Gaussian mixture on the kept features.
    gmm = GaussianMixture(n_components=optimal_k, covariance_type='full', random_state=42)
    scl90_clusters = gmm.fit(top_scaled).predict(top_scaled)
    # Map the mixture means back into the original (unscaled) feature units.
    center_df = pd.DataFrame(scaler.inverse_transform(gmm.means_), columns=top_features)
    print("SCL90聚类中心值(对应原始特征):")
    print(center_df)
    scl90_result = pd.DataFrame({'姓名': scl90_data.index, 'SCL90_cluster': scl90_clusters})
    scl90_result.to_csv(r'scl90_clusters.csv', encoding='utf-8-sig', index=False)
    print("SCL90聚类结果已保存至 'scl90_clusters.csv'")
# 大五人格 聚类分析
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from kmodes.kmodes import KModes
def combined_optimal_clusters(data, max_k=26):
    """Suggest a KModes cluster count from the elbow cost and silhouette score.

    Fits KModes for every k in 2..max_k on the (encoded) `data`, plots both
    diagnostic curves, and returns the k with the highest silhouette score.
    """
    k_values = range(2, max_k + 1)
    costs = []
    silhouette_scores = []
    for k in k_values:
        model = KModes(n_clusters=k, init='Huang', n_init=10, random_state=42)
        labels = model.fit_predict(data)
        costs.append(model.cost_)
        silhouette_scores.append(silhouette_score(data, labels))
    # Elbow curve of the KModes cost.
    plt.figure(figsize=(12, 6))
    plt.plot(k_values, costs, marker='o', label='Cost (Elbow Method)')
    plt.xlabel('Number of clusters')
    plt.ylabel('Cost')
    plt.title('Elbow Method for Optimal k')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Silhouette-score curve.
    plt.figure(figsize=(12, 6))
    plt.plot(k_values, silhouette_scores, marker='o', color='red', label='Silhouette Score')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Score for Optimal k')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Argmax over the silhouette scores; offset by 2 because k starts at 2.
    optimal_k = 2 + max(range(len(silhouette_scores)), key=lambda i: silhouette_scores[i])
    print(f"最佳的聚类数建议为: {optimal_k}")
    return optimal_k
# Use the combined elbow/silhouette helper to pick the cluster count
def analyze_big_five(big_five_data):
    """Cluster the Big Five scale: binarise against column means, run KModes, save labels."""
    # Binarise each score: 1 if above that trait's sample mean, else 0.
    column_means = big_five_data.mean()
    encoded = big_five_data.apply(lambda row: (row > column_means).astype(int), axis=1)
    optimal_k = combined_optimal_clusters(encoded)
    model = KModes(n_clusters=optimal_k, init='Huang', n_init=10, verbose=10, random_state=42)
    big_five_clusters = model.fit_predict(encoded)
    print("大五人格聚类中心值:")
    print(pd.DataFrame(model.cluster_centroids_, columns=encoded.columns))
    big_five_result = pd.DataFrame({'姓名': big_five_data.index, 'BigFive_cluster': big_five_clusters})
    big_five_result.to_csv(r'big_five_clusters.csv', encoding='utf-8-sig', index=False)
    print("大五人格聚类结果已保存至 'big_five_clusters.csv'")
# Career-maturity cluster analysis
def analyze_maturity(maturity_data):
    """Cluster the career-maturity scale: pick k, prune features, fit a GMM, save labels."""
    scaler = StandardScaler()
    scaled_all = scaler.fit_transform(maturity_data)
    optimal_k = determine_optimal_clusters(scaled_all)
    # KMeans labels serve as a supervision target for feature ranking.
    km_labels = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit_predict(scaled_all)
    kept_features = select_important_features(maturity_data, scaled_all, km_labels)
    scaled_kept = scaler.fit_transform(maturity_data[kept_features])
    # Final clustering with a full-covariance Gaussian mixture.
    mixture = GaussianMixture(n_components=optimal_k, covariance_type='full', random_state=42)
    mixture.fit(scaled_kept)
    maturity_clusters = mixture.predict(scaled_kept)
    # Express the mixture means in the original feature units.
    center_df = pd.DataFrame(scaler.inverse_transform(mixture.means_), columns=kept_features)
    print("职业成熟度聚类中心值(对应原始特征):")
    print(center_df)
    maturity_result = pd.DataFrame({'姓名': maturity_data.index, 'Maturity_cluster': maturity_clusters})
    maturity_result.to_csv(r'maturity_clusters.csv', encoding='utf-8-sig', index=False)
    print("职业成熟度聚类结果已保存至 'maturity_clusters.csv'")
# Run the three analyses and write each scale's cluster assignments to CSV
analyze_scl90(scl90_data)
analyze_big_five(big_five_data)
analyze_maturity(maturity_data)
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/zhangzherui666/training-competition-7.git
git@gitee.com:zhangzherui666/training-competition-7.git
zhangzherui666
training-competition-7
训练赛7
master

搜索帮助