import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from tabulate import tabulate
from tqdm import tqdm
import numpy as np
import cv2
import os
from imutils import paths
import pickle
import logging
import faiss
from sklearn.neighbors import KNeighborsClassifier
from FaissKNeighbors import FaissKNeighbors
# Configure logging: level INFO, with timestamp, level, and message in each record
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Return the size of a file in megabytes
def get_size(file):
    return os.path.getsize(file) / (1024 * 1024)

# Build the feature matrix X and the label vector y from the image data
def createXY(train_folder, dest_folder, method='flat', batch_size=64):
    x_file_path = os.path.join(dest_folder, "X.pkl")
    y_file_path = os.path.join(dest_folder, "y.pkl")
    # If cached X and y pickles already exist, load and return them directly
    if os.path.exists(x_file_path) and os.path.exists(y_file_path):
        logging.info("X and y already exist; loading from cache")
        logging.info(f"X file size: {get_size(x_file_path):.2f} MB")
        logging.info(f"y file size: {get_size(y_file_path):.2f} MB")
        with open(x_file_path, 'rb') as f:
            X = pickle.load(f)
        with open(y_file_path, 'rb') as f:
            y = pickle.load(f)
        return X, y
    logging.info("Reading all images to build X and y")
    # Collect the paths of all images in the training folder
    image_paths = list(paths.list_images(train_folder))
    X = []
    y = []
    # Only the 'flat' method is supported here; the pretrained-model (VGG16)
    # feature-extraction branch has been removed
    num_batches = len(image_paths) // batch_size + (1 if len(image_paths) % batch_size else 0)
    # Process the images in batches, keeping a progress bar for the read loop
    for idx in tqdm(range(num_batches), desc="reading images"):
        batch_images = []
        batch_labels = []
        start = idx * batch_size
        end = min((idx + 1) * batch_size, len(image_paths))
        for i in range(start, end):
            image_path = image_paths[i]
            # In 'flat' mode, read each image as grayscale and resize it
            img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            img = cv2.resize(img, (32, 32))
            batch_images.append(img)
            # Filenames encode the class (e.g. "dog.1234.jpg"); map dog -> 1, cat -> 0
            label = os.path.basename(image_path).split('.')[0]
            label = 1 if label == 'dog' else 0
            batch_labels.append(label)
        batch_images = np.array(batch_images)
        # Flatten each 32x32 image into a 1024-dimensional pixel vector
        batch_pixels = batch_images.reshape(batch_images.shape[0], -1)
        X.extend(batch_pixels)
        y.extend(batch_labels)
    logging.info(f"X.shape: {np.shape(X)}")
    logging.info(f"y.shape: {np.shape(y)}")
    with open(x_file_path, 'wb') as f:
        pickle.dump(X, f)
    with open(y_file_path, 'wb') as f:
        pickle.dump(y, f)
    return X, y
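# If the preprocessing above changes, the cached pickles must be deleted so
# that createXY regenerates them -- a minimal sketch, using the same cache
# file names as above:
# for cached in ("X.pkl", "y.pkl"):
#     if os.path.exists(cached):
#         os.remove(cached)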
# Build features and labels from the cat/dog dataset with createXY,
# then split them 50/50 into training and test sets
train_X, train_y = createXY(
    train_folder="data/train",
    dest_folder=".",
    method='flat'
)
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.5, random_state=2023)
# Logistic regression; max_iter is raised to help convergence and the
# solver switched to liblinear
logistic_clf = LogisticRegression(max_iter=1000, solver='liblinear')
# Two random forests: a default instance reused as a sub-estimator in the
# voting and stacking ensembles below, and a 500-tree instance for the
# standalone comparison
random_forest_clf_subcls = RandomForestClassifier(random_state=42)
random_forest_clf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    random_state=42
)
# Support vector machine; probability=True enables predict_proba,
# which soft voting requires
svm_clf = SVC(probability=True)
# Hard-voting classifier: predicts the majority class label
voting_clf_hard = VotingClassifier(
    estimators=[('lr', logistic_clf), ('rf', random_forest_clf_subcls), ('svc', svm_clf)],
    voting='hard'
)
# Soft-voting classifier: predicts the argmax of the weighted average of the
# predicted class probabilities, with the SVC given double weight
voting_clf_soft = VotingClassifier(
    estimators=[('lr', logistic_clf), ('rf', random_forest_clf_subcls), ('svc', svm_clf)],
    voting='soft',
    weights=[1, 1, 2]
)
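# Worked example with hypothetical probabilities: if the dog-probabilities are
# lr=0.40, rf=0.45, svc=0.90, the weighted average with weights [1, 1, 2] is
# (0.40 + 0.45 + 2 * 0.90) / 4 = 0.6625, so soft voting predicts "dog" even
# though two of the three estimators individually lean "cat".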
# Bagging: 500 trees, each trained on 100 instances sampled with replacement
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    random_state=42
)
# Pasting: identical setup, but bootstrap=False samples the 100 instances
# without replacement
paste_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=False,
    random_state=42
)
# AdaBoost over decision stumps (depth-1 trees), reweighting the training
# set after each boosting round
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5
)
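# Note: "SAMME.R" uses the base estimators' class-probability estimates rather
# than their discrete predictions, and typically converges faster than "SAMME".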
# Two gradient-boosting (XGBoost) configurations: v1 has more, deeper trees
# with heavier regularization; v2 is smaller and shallower
xgb_clf_v1 = XGBClassifier(
n_estimators=200,
learning_rate=0.1,
max_depth=4,
min_child_weight=6,
subsample=0.8,
colsample_bytree=0.8,
gamma=0.1,
random_state=42
)
xgb_clf_v2 = XGBClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=2,
min_child_weight=2,
subsample=0.9,
colsample_bytree=1.0,
gamma=0,
random_state=42
)
# Stacking classifier: a logistic-regression meta-learner trained on the
# base estimators' 5-fold out-of-fold predictions
stacking_clf = StackingClassifier(
    estimators=[('lr', logistic_clf), ('rf', random_forest_clf_subcls), ('svc', svm_clf)],
    final_estimator=LogisticRegression(),
    cv=5,
    stack_method='auto'
)
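# Note: stack_method='auto' feeds the meta-learner predict_proba outputs when a
# base estimator provides them, falling back to decision_function, then predict.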
# Collect all classifiers in one dict for the comparison loop
clfs = {
"logistic_regression": logistic_clf,
"random_forest": random_forest_clf,
"svm": svm_clf,
"hard_voting": voting_clf_hard,
"soft_voting": voting_clf_soft,
"bagging": bag_clf,
"pasting": paste_clf,
"adaboot": ada_clf,
"gradient_boosting_v1": xgb_clf_v1,
"gradient_boosting_v2": xgb_clf_v2,
"stacking": stacking_clf
}
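# FaissKNeighbors (imported above) could join the comparison as well; a hedged
# sketch, assuming it exposes the scikit-learn fit/predict/score interface
# (k=5 is an arbitrary choice here):
# clfs["faiss_knn"] = FaissKNeighbors(k=5)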
# Holds each classifier's name, training time, scoring time, and accuracy
results = []
# Train and evaluate every classifier, logging timings and accuracy
# (no per-classifier progress bar)
for name, clf in clfs.items():
    # Time the training phase
    start_train_time = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - start_train_time
    # Log training completion with the elapsed time
    logging.info(f"{name} model trained in {train_time:.4f}s")
    # Time the evaluation phase
    start_score_time = time.time()
    accuracy = clf.score(X_test, y_test)
    score_time = time.time() - start_score_time
    # Log evaluation completion with the elapsed time and accuracy
    logging.info(f"{name} model evaluated in {score_time:.4f}s, accuracy: {accuracy:.4f}")
    results.append([name, train_time, score_time, accuracy])
# Table headers
headers = ["Classifier", "Training Time (s)", "Prediction Time (s)", "Accuracy"]
# Print the results as a table
print(tabulate(results, headers=headers, tablefmt="simple"))
# Find the most accurate model, reusing the scores recorded above instead of
# re-running every classifier on the test set
best_model_name, _, _, best_model_accuracy = max(results, key=lambda r: r[3])
# Save the most accurate model
best_model = clfs[best_model_name]
with open(f"{best_model_name}_best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
logging.info(f"Best model: {best_model_name}, accuracy: {best_model_accuracy:.4f}, saved as {best_model_name}_best_model.pkl")