# Source: faiss_dog_cat_questio_ensemble-learning/train_ensemble.py (branch: main)

import time
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from tabulate import tabulate
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import cv2
import os
from os.path import exists
from imutils import paths
import pickle
from tensorflow.keras.preprocessing import image
import logging
import faiss
from sklearn.neighbors import KNeighborsClassifier
from FaissKNeighbors import FaissKNeighbors
from util import createXY

# Configure root logging once for the script: INFO level, with each record
# rendered as "timestamp - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


# Report how large a file is on disk, expressed in megabytes.
def get_size(file):
    """Return the size of *file* in MB (bytes divided by 1024 * 1024)."""
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file)
    return size_in_bytes / bytes_per_mb


# Build the feature matrix X and label vector y from a folder of images.
# NOTE: this definition shadows the `createXY` imported from `util` above.
def createXY(train_folder, dest_folder, method='flat', batch_size=64):
    """Load, or build and cache, flattened grayscale features and labels.

    When both ``X.pkl`` and ``y.pkl`` exist in *dest_folder* they are
    unpickled and returned unchanged. Otherwise every image found under
    *train_folder* is read in grayscale, resized to 32x32, flattened to a
    1024-element vector, and the resulting lists are pickled into
    *dest_folder* before being returned.

    Args:
        train_folder: Directory searched for image files.
        dest_folder: Directory that holds the ``X.pkl`` / ``y.pkl`` cache.
        method: Accepted for interface compatibility; only the flattened
            representation is implemented and the value is not consulted.
        batch_size: Number of images handled per progress-bar step.

    Returns:
        A tuple ``(X, y)``. On a fresh build X is a list of 1-D pixel arrays
        and y is a list of ints: 1 when the file name's first dot-separated
        component is ``dog``, otherwise 0.
    """
    x_file_path = os.path.join(dest_folder, "X.pkl")
    y_file_path = os.path.join(dest_folder, "y.pkl")

    # Reuse the cached data when both pickle files are already present.
    if os.path.exists(x_file_path) and os.path.exists(y_file_path):
        logging.info("X和y已经存在，直接读取")
        logging.info(f"X文件大小:{get_size(x_file_path):.2f}MB")
        logging.info(f"y文件大小:{get_size(y_file_path):.2f}MB")

        # NOTE(review): unpickling can execute arbitrary code; only load
        # cache files that this script itself produced.
        with open(x_file_path, 'rb') as f:
            X = pickle.load(f)
        with open(y_file_path, 'rb') as f:
            y = pickle.load(f)
        return X, y

    logging.info("读取所有图像，生成X和y")
    # Collect the path of every image inside the training folder.
    image_paths = list(paths.list_images(train_folder))

    X = []
    y = []

    # Walk the paths one batch at a time so the progress bar advances once
    # per batch (same number of steps as ceil(len(image_paths) / batch_size)).
    for start in tqdm(range(0, len(image_paths), batch_size), desc="读取图像"):
        batch_images = []
        batch_labels = []

        for image_path in image_paths[start:start + batch_size]:
            img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread signals an unreadable/corrupt file by returning
                # None; skip it instead of crashing inside cv2.resize.
                logging.warning(f"无法读取图像，已跳过：{image_path}")
                continue
            batch_images.append(cv2.resize(img, (32, 32)))

            # The label is the file name's first dot-separated component.
            label = os.path.basename(image_path).split('.')[0]
            batch_labels.append(1 if label == 'dog' else 0)

        # Every image in the batch may have been skipped; reshaping an empty
        # array with -1 would raise, so move on to the next batch.
        if not batch_images:
            continue

        # Flatten each 32x32 image into a single row of pixels.
        batch_pixels = np.array(batch_images).reshape(len(batch_images), -1)

        X.extend(batch_pixels)
        y.extend(batch_labels)

    logging.info(f"X.shape: {np.shape(X)}")
    logging.info(f"y.shape: {np.shape(y)}")

    # Cache the freshly built data so later runs can skip image decoding.
    with open(x_file_path, 'wb') as f:
        pickle.dump(X, f)
    with open(y_file_path, 'wb') as f:
        pickle.dump(y, f)

    return X, y


# Build (or load) the cat/dog features and labels with createXY, then hold
# out half of the samples as the test set using a fixed seed.
train_X, train_y = createXY(train_folder="data/train", dest_folder=".", method='flat')
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    random_state=2023,
    test_size=0.5,
)


# Logistic regression; max_iter was raised and the solver switched to
# liblinear in an attempt to address convergence problems.
logistic_clf = LogisticRegression(solver='liblinear', max_iter=1000)

# Random forests: a default-sized one used as an ensemble member, and a
# 500-tree one that is evaluated on its own.
random_forest_clf_subcls = RandomForestClassifier(random_state=42)
random_forest_clf = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500)

# Support vector machine; probability=True so it can take part in soft voting.
svm_clf = SVC(probability=True)

# Members shared by both voting ensembles (each ensemble gets its own list).
_voting_members = [
    ('lr', logistic_clf),
    ('rf', random_forest_clf_subcls),
    ('svc', svm_clf),
]

# Hard voting: majority vote over predicted labels.
voting_clf_hard = VotingClassifier(estimators=list(_voting_members), voting='hard')

# Soft voting: weighted average of predicted probabilities, SVM counted twice.
voting_clf_soft = VotingClassifier(
    estimators=list(_voting_members),
    weights=[1, 1, 2],
    voting='soft',
)

# Settings common to the bagging and pasting ensembles of decision trees.
_tree_ensemble_settings = dict(n_estimators=500, max_samples=100, random_state=42)

# Bagging: bootstrap=True.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    bootstrap=True,
    **_tree_ensemble_settings,
)

# Pasting: identical configuration except bootstrap=False.
paste_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    bootstrap=False,
    **_tree_ensemble_settings,
)

# AdaBoost over depth-1 decision trees (stumps).
# The explicit algorithm="SAMME.R" argument is intentionally omitted: that
# value was deprecated in scikit-learn 1.4 and is rejected from 1.6 onward.
# On releases where SAMME.R is the default the behaviour is unchanged, and
# newer releases fall back to their only supported algorithm.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5
)

# Two XGBoost gradient-boosting configurations.
# v1: 200 trees of depth 4 with row/column subsampling and a small gamma.
xgb_clf_v1 = XGBClassifier(
    random_state=42,
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    min_child_weight=6,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
)
# v2: 100 trees of depth 2, no column subsampling and gamma of zero.
xgb_clf_v2 = XGBClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=2,
    learning_rate=0.1,
    min_child_weight=2,
    gamma=0,
    subsample=0.9,
    colsample_bytree=1.0,
)

# Stacking: logistic regression, the default-sized random forest and the SVM
# feed a logistic-regression final estimator, with cv=5.
_stacking_members = [
    ('lr', logistic_clf),
    ('rf', random_forest_clf_subcls),
    ('svc', svm_clf),
]
stacking_clf = StackingClassifier(
    estimators=_stacking_members,
    final_estimator=LogisticRegression(),
    stack_method='auto',
    cv=5,
)

# Registry of every classifier to train and compare, keyed by the display
# name that appears in the log lines, the results table and the saved
# model's file name.
clfs = {
    "logistic_regression": logistic_clf,
    "random_forest": random_forest_clf,
    "svm": svm_clf,
    "hard_voting": voting_clf_hard,
    "soft_voting": voting_clf_soft,
    "bagging": bag_clf,
    "pasting": paste_clf,
    # Key corrected from the misspelled "adaboot".
    "adaboost": ada_clf,
    "gradient_boosting_v1": xgb_clf_v1,
    "gradient_boosting_v2": xgb_clf_v2,
    "stacking": stacking_clf
}

# One row per classifier: [name, training seconds, scoring seconds, accuracy].
results = []
# Fit and score every classifier in turn, logging each completed step.
for name, clf in clfs.items():
    # Durations are taken with time.perf_counter(): unlike time.time() it is
    # monotonic, so a wall-clock adjustment cannot distort the measurement.
    start_train_time = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - start_train_time
    # Log that training finished, including how long it took.
    logging.info(f"{name}_model训练完成,用时{train_time:.4f}秒")

    start_score_time = time.perf_counter()
    accuracy = clf.score(X_test, y_test)
    score_time = time.perf_counter() - start_score_time
    # Log that evaluation finished, including its duration and the accuracy.
    logging.info(f"{name}_model评估完成,用时{score_time:.4f}秒，准确率：{accuracy}")

    results.append([name, train_time, score_time, accuracy])

# Column headers for the summary table.
headers = ["Classifier", "Training Time (s)", "Prediction Time (s)", "Accuracy"]
# Print the collected results as a plain-text table.
print(tabulate(results, headers=headers, tablefmt="simple"))

# Pick the most accurate model from the accuracies already recorded in
# `results` (rows are [name, train_time, score_time, accuracy]) rather than
# calling clf.score() on every classifier a second time. max() returns the
# first maximal row, so ties resolve in the same order as the clfs dict.
best_row = max(results, key=lambda row: row[3])
best_model_name = best_row[0]
best_model_accuracy = best_row[3]

# Persist the winning fitted model next to the script.
best_model = clfs[best_model_name]
with open(f"{best_model_name}_best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

logging.info(f"准确率最高的集成学习模型为：{best_model_name}，准确率为：{best_model_accuracy}，已保存为：{best_model_name}_best_model.pkl")