# faiss_dog_cat_question/util.py (branch: main)

import cv2
import numpy as np
import os
from os.path import exists
from imutils import paths
import pickle
from tqdm import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing import image
import logging

# Basic logging setup, applied when this module is imported:
# INFO level, each record formatted as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def get_size(file_path):
    """
    Return the size of a file expressed in megabytes.

    Parameters:
    file_path (str): Path of the file to measure

    Returns:
    float: File size in MB, where 1 MB is 1024 * 1024 bytes
    """
    bytes_per_mb = 1024 * 1024
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / bytes_per_mb

def load_existing_data(x_file_path, y_file_path):
    """
    Read previously saved X and y objects back from their pickle files.

    Before unpickling, logs a notice followed by the size of each file in MB.

    Parameters:
    x_file_path (str): Path of the pickle file holding X
    y_file_path (str): Path of the pickle file holding y

    Returns:
    tuple: (X, y) exactly as stored in the two files
    """
    logging.info("X和y已经存在，直接读取")
    logging.info(f"X文件大小:{get_size(x_file_path):.2f}MB")
    logging.info(f"y文件大小:{get_size(y_file_path):.2f}MB")

    def _unpickle(path):
        # pickle needs the file opened in binary mode.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    features = _unpickle(x_file_path)
    labels = _unpickle(y_file_path)
    return features, labels

def create_vgg_features(image_paths, model, batch_size):
    """
    Extract image features batch by batch with the supplied model.

    Each image is loaded at 224x224, the batch is passed through
    ``preprocess_input`` and then ``model.predict``. The label is 1 when the
    part of the file name before the first underscore is 'dog', otherwise 0.

    Parameters:
    image_paths (list): Paths of the image files
    model (VGG16): Model whose ``predict`` output is used as the features
    batch_size (int): Number of images processed per predict call

    Returns:
    list: Feature entries, one per image, in input order
    list: Labels (1 for dog, 0 otherwise), in input order
    """
    features = []
    labels = []
    total = len(image_paths)

    # Round up so that a trailing partial batch is still processed.
    full_batches, leftover = divmod(total, batch_size)
    num_batches = full_batches + (1 if leftover else 0)

    for batch_no in tqdm(range(num_batches), desc="读取图像"):
        lo = batch_no * batch_size
        hi = min(lo + batch_size, total)
        chunk = image_paths[lo:hi]

        pixels = np.array([
            image.img_to_array(image.load_img(path, target_size=(224, 224)))
            for path in chunk
        ])
        chunk_labels = [
            1 if os.path.basename(path).split('_')[0] == 'dog' else 0
            for path in chunk
        ]

        chunk_features = model.predict(preprocess_input(pixels), verbose=0)

        features.extend(chunk_features)
        labels.extend(chunk_labels)

    return features, labels

def create_flat_features(image_paths, batch_size):
    """
    Build flattened grayscale pixel features for every image.

    Each image is read in grayscale, resized to 32x32 and flattened into a
    single vector. The label is 1 when the part of the file name before the
    first underscore is 'dog', otherwise 0.

    Parameters:
    image_paths (list): Paths of the image files
    batch_size (int): Number of images handled per progress step

    Returns:
    list: Flattened pixel vectors, one per image, in input order
    list: Labels (1 for dog, 0 otherwise), in input order
    """
    features = []
    labels = []
    total = len(image_paths)

    # Round up so that a trailing partial batch is still processed.
    full_batches, leftover = divmod(total, batch_size)
    num_batches = full_batches + (1 if leftover else 0)

    for batch_no in tqdm(range(num_batches), desc="读取图像"):
        lo = batch_no * batch_size
        hi = min(lo + batch_size, total)

        vectors = []
        chunk_labels = []
        for path in image_paths[lo:hi]:
            # NOTE(review): cv2.imread presumably yields None for unreadable
            # files, which would make the resize below fail; confirm inputs.
            gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            vectors.append(cv2.resize(gray, (32, 32)).flatten())

            prefix = os.path.basename(path).split('_')[0]
            chunk_labels.append(1 if prefix == 'dog' else 0)

        features.extend(np.array(vectors))
        labels.extend(chunk_labels)

    return features, labels

def save_data(X, y, x_file_path, y_file_path):
    """
    Pickle X and y to their respective files.

    Parameters:
    X (list): Image features
    y (list): Labels
    x_file_path (str): Destination path for the pickled X
    y_file_path (str): Destination path for the pickled y
    """
    # X is written first, then y; each file is closed before the next opens.
    for payload, destination in ((X, x_file_path), (y, y_file_path)):
        with open(destination, 'wb') as sink:
            pickle.dump(payload, sink)

def createXY(train_folder, dest_folder, method='vgg', batch_size=64):
    """
    Return features X and labels y for the images under a folder, with caching.

    If both "X.pkl" and "y.pkl" already exist in ``dest_folder`` they are
    loaded and returned as-is. Otherwise the images found under
    ``train_folder`` are converted to features with the chosen method, the
    results are pickled into ``dest_folder`` and then returned.

    NOTE: the cache file names do not depend on ``method``, so an existing
    cache is returned regardless of which method is requested.

    Parameters:
    train_folder (str): Folder that is searched for image files
    dest_folder (str): Folder holding (or receiving) X.pkl and y.pkl
    method (str): 'vgg' for VGG16 features, 'flat' for flattened pixels
    batch_size (int): Number of images processed per batch

    Returns:
    tuple: (X, y)

    Raises:
    ValueError: If no cache exists and ``method`` is not 'vgg' or 'flat'
    """
    x_file_path = os.path.join(dest_folder, "X.pkl")
    y_file_path = os.path.join(dest_folder, "y.pkl")

    if exists(x_file_path) and exists(y_file_path):
        return load_existing_data(x_file_path, y_file_path)

    # Fail early with a clear message instead of hitting unbound X/y later.
    if method not in ('vgg', 'flat'):
        raise ValueError(
            f"Unsupported method {method!r}; expected 'vgg' or 'flat'"
        )

    # Create the output folder before the expensive extraction so the final
    # save cannot fail on a missing directory. An empty dest_folder means the
    # current directory, which needs no creation.
    if dest_folder:
        os.makedirs(dest_folder, exist_ok=True)

    logging.info("读取所有图像，生成X和y")
    image_paths = list(paths.list_images(train_folder))

    if method == 'vgg':
        model = VGG16(weights='imagenet', include_top=False, pooling="max")
        logging.info("完成构建 VGG16 模型")
        X, y = create_vgg_features(image_paths, model, batch_size)
    else:
        X, y = create_flat_features(image_paths, batch_size)

    logging.info(f"X.shape: {np.shape(X)}")
    logging.info(f"y.shape: {np.shape(y)}")

    save_data(X, y, x_file_path, y_file_path)

    return X, y