# Repository: shufa-hog (branch: master)
# File: 1_Xy.py

# 0. 引入必要的包
import os

import cv2
import joblib
import numpy as np
from sklearn.model_selection import train_test_split

from util import get

# 1. Read settings from the configuration file
train_dir = get("train")  # path of the directory holding the training images
char_styles = get("char_styles")  # style names; must be a list, its index positions become the labels
new_size = get("new_size")  # target image size handed to cv2.resize
# NOTE(review): cv2.resize reads dsize as (width, height) — confirm the configured order matches.

# 2. Build X (flattened grayscale pixels) and y (style index)
print("# 读取训练数据并进行预处理，")
X = []
y = []
# Sort the listing: os.listdir order is arbitrary, and a stable sample order is
# needed for the seeded split below to be reproducible.
for filename in sorted(os.listdir(train_dir)):
    if not filename.endswith(('.jpg', '.png')):
        continue
    # The label is the part of the file name before the first underscore.
    style = filename.split('_')[0]
    if style not in char_styles:
        # Skip the sample entirely so X and y always stay the same length.
        continue
    file_path = os.path.join(train_dir, filename)
    img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None for files it cannot decode.
        continue
    img = cv2.resize(img, new_size)
    X.append(img.flatten())
    y.append(char_styles.index(style))

if not X:
    # Fail early with a clear message instead of an opaque error from the split.
    raise ValueError(f"no usable training images found in {train_dir!r}")

X = np.array(X, dtype=np.uint8)
y = np.array(y)

# 3. Split into training and test sets
print("# 将数据按 80% 和 20% 的比例分割")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Print shape and dtype of every split
print("X_train: ", X_train.shape, X_train.dtype)  # training features
print("X_test: ", X_test.shape, X_test.dtype)  # test features
print("y_train: ", y_train.shape, y_train.dtype)  # training labels
print("y_test: ", y_test.shape, y_test.dtype)  # test labels

# 5. Serialize the split samples as one compressed tuple in ./Xys/Xy
output_dir = './Xys'
os.makedirs(output_dir, exist_ok=True)  # no error if the directory already exists
filename = os.path.join(output_dir, 'Xy')
joblib.dump((X_train, X_test, y_train, y_test), filename, compress=True)