# shufa_app/1_Xy.py  (branch: master)

# 0. 引入必要的包
# TODO
import os
from sklearn.model_selection import train_test_split
# import pickle
from util import get,preprocess_image,dump
import numpy as np
from tqdm import tqdm  # tqdm库用于创建进度条
import time
import glob

# 1. Read the settings this script needs through util.get.
#    (The original note says these come from the configuration file.)
#    train_dir    - path of the training data
#    char_styles  - list of character styles; iterated and searched with
#                   .index() further down, so it must be a list
#    new_size     - target image size tuple; per the original note it must
#                   contain h and w
#    Xy_root      - directory prefix used for the serialized output path
train_dir, char_styles, new_size, Xy_root = (
    get(key) for key in ("train", "char_styles", "new_size", "Xy_root")
)

# 2. Build X (preprocessed images) and y (integer style labels)
print("# 读取训练数据并进行预处理，")
X = []
y = []

for style in char_styles:
    # Collect the training files whose name starts with "train_<style[0]>".
    # NOTE(review): only the first element/character of the style goes into
    # the pattern, so styles sharing it match the same files - confirm that
    # this is intended.
    paths = glob.glob("{}/train_{}*".format(train_dir, style[0]))

    # Walk the matched images with a progress bar.
    for path in tqdm(paths, desc="处理 {} 图像：".format(style), unit="bit"):
        # preprocess_image comes from util.py and handles a single image.
        X.append(preprocess_image(path, new_size))

        # Derive the label from THIS file's own name: the token between the
        # first and second underscore of the base name (extension dropped).
        # The previous code parsed str() of the whole path list, which gave
        # every image in the batch the label of the last path in the list.
        stem = os.path.basename(path).split(".")[0]
        style_name = stem.split("_")[1]
        # Raises ValueError if the file name carries an unknown style.
        y.append(char_styles.index(style_name))

# Fail early with a readable message instead of an opaque error later on
# when the split is attempted on empty arrays.
if not X:
    raise FileNotFoundError(
        "no training images matched '{}/train_<style>*'".format(train_dir)
    )

X = np.array(X)
y = np.array(y).astype(np.int64)
# 3. Split the samples into a training part and a test part.
#    test_size=0.2 keeps 20% for testing; no random_state is passed, so the
#    partition is not reproducible between runs.
print("# 将数据按 80% 和 20% 的比例分割")
split_parts = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# 4. Report shape and dtype of every part, in the order
#    train features, test features, train labels, test labels.
captions = ("X_train: ", "X_test: ", "y_train: ", "y_test: ")
for caption, part in zip(captions, split_parts):
    print(caption, part.shape, part.dtype)

# 5. Serialize the four split arrays.
#    The work is delegated to util.dump; its on-disk format and the exact
#    meaning of the second (string) argument are not visible from this file.
split_bundle = (X_train, X_test, y_train, y_test)
output_target = '{}/Xy'.format(Xy_root)
dump(split_bundle, '(X_train, X_test, y_train, y_test)', output_target)