lonerlin/classification (https://gitee.com/lonerlin/classification.git)

Note: this repository declares no open-source license (LICENSE) file; check the project description and its upstream dependencies before use.

simplified_train.py (16.76 KB)
lonerlin, committed 2021-09-14 21:19: convert tensors to float before saving.
# Simplified training script: drops some unnecessary arguments
import argparse
import os
import random
import shutil
import time
import warnings
import json
import gc
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
from reshape import reshape_model
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))
#
# parse command-line arguments
#
# data        'training data path'
# model_dir   'directory to save models in'
# model_name  'model name'
# workers     'number of data-loading worker threads'
# epochs      'number of training epochs'
# batch-size  'batch size'
# evaluate    'evaluate only (no training)'
# pretrained  'use pretrained weights (default: yes)'
# gpu         'GPU id to use (default: 0)'
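#
# The parser below defines no options of its own; main() loads every setting
# from a JSON file named args.txt. A hypothetical args.txt (field names taken
# from the attributes this script reads, values illustrative only) might look
# like:
#
#   {
#     "data": "dataset",
#     "model_dir": "models",
#     "arch": "resnet18",
#     "resolution": 224,
#     "workers": 4,
#     "epochs": 30,
#     "start_epoch": 0,
#     "batch_size": 32,
#     "lr": 0.1,
#     "momentum": 0.9,
#     "weight_decay": 1e-4,
#     "print_freq": 10,
#     "resume": "",
#     "evaluate": false,
#     "pretrained": true,
#     "seed": null,
#     "gpu": 0,
#     "world_size": -1,
#     "rank": -1,
#     "dist_url": "tcp://224.66.41.62:23456",
#     "dist_backend": "nccl",
#     "multiprocessing_distributed": false
#   }
#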
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
best_acc1 = 0
is_cpu = not torch.cuda.is_available()
#
# initiate worker threads (if using distributed multi-GPU)
#
# per-epoch history of loss/accuracy, written to record.txt by save_record()
train_record = {'loss': [], 'Acc1': [], 'Acc5': []}
val_record = {'loss': [], 'Acc1': [], 'Acc5': []}
def main():
    # parser = ArgumentParser()
    os.environ['TORCH_HOME'] = 'models/pretrained_models'
    args = parser.parse_args()

    # the parser defines no options of its own; every setting is loaded
    # from the JSON file args.txt (see the example above)
    with open('args.txt', 'r') as f:
        args.__dict__ = json.load(f)
    print(args)

    # args = parser.parse_args()
    # with open('args.txt', 'w') as f:
    #     json.dump(args.__dict__, f, indent=2)
    # return
    # args.gpu = device

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # if args.gpu is not None:
    #     warnings.warn('You have chosen a specific GPU. This will completely '
    #                   'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    ngpus_per_node = torch.cuda.device_count()

    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
#
# worker thread (per-GPU)
#
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            # transforms.Resize(224),
            transforms.RandomResizedCrop(args.resolution),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(96),  # custom data augmentation (note: the final training crop is 96x96, regardless of args.resolution)
            transforms.ColorJitter(brightness=0.5, contrast=0.5, hue=0.5),  # custom data augmentation
            transforms.ToTensor(),
            normalize,
        ]))

    num_classes = len(train_dataset.classes)
    print('=> dataset classes: ' + str(num_classes) + ' ' + str(train_dataset.classes))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(args.resolution),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
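    # Note: ImageFolder derives class labels from subdirectory names, so
    # args.data is expected to be laid out as (illustrative):
    #   <args.data>/train/<class_name>/*.jpg
    #   <args.data>/val/<class_name>/*.jpg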
    # create or load the model if using pre-trained (the default)
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # reshape the model for the number of classes in the dataset
    model = reshape_model(model, args.arch, num_classes)
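    # reshape_model (from the local reshape module) is assumed to replace the
    # network's final classification layer with one producing num_classes
    # outputs, so pretrained ImageNet weights can be fine-tuned on this dataset.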
    if is_cpu:
        # if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        #     model.features = torch.nn.DataParallel(model.features)
        # else:
        #     model = torch.nn.DataParallel(model)
        criterion = nn.CrossEntropyLoss()
    else:
        # transfer the model to the GPU that it should be run on
        if args.distributed:
            # For multiprocessing distributed, DistributedDataParallel constructor
            # should always set the single device scope, otherwise,
            # DistributedDataParallel will use all available devices.
            if args.gpu is not None:
                torch.cuda.set_device(args.gpu)
                model.cuda(args.gpu)
                # When using a single GPU per process and per
                # DistributedDataParallel, we need to divide the batch size
                # ourselves based on the total number of GPUs we have
                args.batch_size = int(args.batch_size / ngpus_per_node)
                args.workers = int(args.workers / ngpus_per_node)
                model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
            else:
                model.cuda()
                # DistributedDataParallel will divide and allocate batch_size to all
                # available GPUs if device_ids are not set
                model = torch.nn.parallel.DistributedDataParallel(model)
        elif args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model = model.cuda(args.gpu)
        else:
            # DataParallel will divide and allocate batch_size to all available GPUs
            if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
                model.features = torch.nn.DataParallel(model.features)
                model.cuda()
            else:
                model = torch.nn.DataParallel(model).cuda()

        # define loss function (criterion) on the GPU
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # define the optimizer
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if not is_cpu:
        cudnn.benchmark = True
    # if in evaluation mode, only run validation
    if args.evaluate:
        validate(val_loader, model, criterion, num_classes, args)
        return

    # train for the specified number of epochs
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # decay the learning rate
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, num_classes, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, num_classes, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'resolution': args.resolution,
                'num_classes': num_classes,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best, args)

    # train_loader = None
    # val_loader = None
    # train_dataset = None
    # torch.cuda.empty_cache()
    # gc.collect()

    # write the accumulated loss/accuracy history to record.txt
    save_record()

#
# train one epoch
#
def train(train_loader, model, criterion, optimizer, epoch, num_classes, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()

    # get the start time
    epoch_start = time.time()
    end = epoch_start

    # train over each image batch from the dataset
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if not is_cpu:
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, min(5, num_classes)))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)

    print("Epoch: [{:d}] completed, elapsed time {:6.3f} seconds".format(epoch, time.time() - epoch_start))

    # record per-epoch averages (cast to float, since the accuracy averages are tensors)
    train_record['loss'].append(float(losses.avg))
    train_record['Acc1'].append(float(top1.avg))
    train_record['Acc5'].append(float(top5.avg))
#
# measure model performance across the val dataset
#
def validate(val_loader, model, criterion, num_classes, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if not is_cpu:
                if args.gpu is not None:
                    images = images.cuda(args.gpu, non_blocking=True)
                target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, min(5, num_classes)))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    val_record['loss'].append(float(losses.avg))
    val_record['Acc1'].append(float(top1.avg))
    val_record['Acc5'].append(float(top5.avg))
    return top1.avg
#
# save model checkpoint
#
def save_checkpoint(state, is_best, args, filename='checkpoint.pth.tar', best_filename='model_best.pth.tar'):
    """Save a model checkpoint file, along with the best-performing model if applicable"""
    # if saving to an output directory, make sure it exists
    if args.model_dir:
        model_path = os.path.expanduser(args.model_dir)

        if not os.path.exists(model_path):
            os.mkdir(model_path)

        filename = os.path.join(model_path, filename)
        best_filename = os.path.join(model_path, best_filename)

    # save the checkpoint
    torch.save(state, filename)

    # earmark the best checkpoint
    if is_best:
        shutil.copyfile(filename, best_filename)
        print("saved best model to: " + best_filename)
    else:
        print("saved checkpoint to: " + filename)
#
# statistic averaging
#
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)
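# Usage sketch: meter = AverageMeter('Loss', ':.4e'); meter.update(loss.item(), n=batch_size)
# accumulates a batch-size-weighted running average, and str(meter) renders e.g.
# "Loss 4.2000e-01 (5.1000e-01)" (current value, then running average).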
#
# progress metering
#
class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print(' '.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
#
# learning rate decay
#
def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
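# For example, with args.lr = 0.1: epochs 0-29 train at 0.1, epochs 30-59 at
# 0.01, epochs 60-89 at 0.001, and so on.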
#
# compute the accuracy for a given result
#
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
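# Worked example: with output = torch.tensor([[0.1, 0.9], [0.8, 0.2]]) and
# target = torch.tensor([1, 1]), only the first sample's top-1 prediction is
# correct, so accuracy(output, target, topk=(1,)) returns [tensor([50.])].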
def save_record():
    with open('record.txt', 'w') as f:
        record = {'train': train_record, 'val': val_record}
        json.dump(record, f, indent=2)
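# record.txt ends up with one value per epoch under each key, e.g.:
#   {"train": {"loss": [...], "Acc1": [...], "Acc5": [...]},
#    "val":   {"loss": [...], "Acc1": [...], "Acc5": [...]}}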
if __name__ == '__main__':
    main()