# NOTE: hosting-page residue, not part of the script: "Code pull complete; the page will refresh automatically."
# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Train a YOLOv5 model on a custom dataset
Usage:
$ python path/to/train.py --data coco128.yaml --weights yolov5s.pt --img 640
python train.py --data coco128.yaml --weights yolov5s.pt --img 640 --cfg models/yolov5s.yaml --cache --rect --epochs 2
"""
import argparse
import logging
import math
import os
import random
import sys
import time
from copy import deepcopy
from pathlib import Path
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import yaml
from torch.cuda import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Adam, SGD, lr_scheduler
from tqdm import tqdm
# Absolute path of this script.
FILE = Path(__file__).resolve()
# Directory that contains this script, used as the YOLOv5 root directory.
ROOT = FILE.parent
# Make the project importable: add the root to sys.path only if it is missing.
if sys.path.count(str(ROOT)) == 0:
    sys.path.append(str(ROOT))
# Re-express the root relative to the current working directory
# (absolute path -> relative path).
ROOT = Path(os.path.relpath(str(ROOT), os.getcwd()))
import val # for end-of-epoch mAP
from models.experimental import attempt_load
from models.yolo import Model
from utils.autoanchor import check_anchors
from utils.datasets import create_dataloader
from utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \
strip_optimizer, get_latest_run, check_dataset, check_git_status, check_img_size, check_requirements, \
check_file, check_yaml, check_suffix, print_args, print_mutation, set_logging, one_cycle, colorstr, methods
from utils.downloads import attempt_download
from utils.loss import ComputeLoss
from utils.plots import plot_labels, plot_evolve
from utils.torch_utils import EarlyStopping, ModelEMA, de_parallel, intersect_dicts, select_device, \
torch_distributed_zero_first
from utils.loggers.wandb.wandb_utils import check_wandb_resume
from utils.metrics import fitness
from utils.loggers import Loggers
from utils.callbacks import Callbacks
# Module-level logger object.
LOGGER = logging.getLogger(__name__)
# Distributed-run settings are read from environment variables (see
# https://pytorch.org/docs/stable/elastic/run.html). A missing variable falls
# back to the single-process defaults below.
# LOCAL_RANK: process index on this machine; also used in this file as the CUDA device id.
LOCAL_RANK = int(os.environ.get('LOCAL_RANK', '-1'))
# RANK: process index within the whole distributed job.
RANK = int(os.environ.get('RANK', '-1'))
# WORLD_SIZE: total number of processes.
WORLD_SIZE = int(os.environ.get('WORLD_SIZE', '1'))
def train(hyp,  # path/to/hyp.yaml or hyp dictionary
          opt,
          device,
          callbacks
          ):
    """Train a model for ``opt.epochs`` epochs and return the last validation results.

    Args:
        hyp: Either a path to a hyperparameter YAML file or an already-loaded
            hyperparameter dict. The dict is modified in place (weight decay and
            loss gains are rescaled, ``label_smoothing`` is added).
        opt: Options namespace. This function reads ``save_dir``, ``epochs``,
            ``batch_size``, ``weights``, ``single_cls``, ``evolve``, ``data``,
            ``cfg``, ``resume``, ``noval``, ``nosave``, ``workers``, ``freeze``,
            ``hyp``, ``adam``, ``linear_lr``, ``imgsz``, ``sync_bn``, ``cache``,
            ``rect``, ``image_weights``, ``quad``, ``noautoanchor``,
            ``label_smoothing``, ``patience``, ``multi_scale`` and ``save_period``.
        device: Torch device the model and batches are moved to.
        callbacks: Object exposing ``register_action`` and ``run``; logger methods
            are registered on it (rank -1/0 only) and hooks are fired through it.

    Returns:
        The ``results`` tuple of the most recent validation run, laid out as
        (P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)); all zeros if no
        validation ran.

    Side effects:
        Writes ``hyp.yaml`` and ``opt.yaml`` into ``save_dir`` and, on rank -1/0,
        saves ``last.pt`` / ``best.pt`` / ``epoch{N}.pt`` under ``save_dir/weights``.
    """
    save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze, = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
        opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze

    # Directories
    w = save_dir / 'weights'  # weights dir
    # Creating the weights dir with parents=True also creates the run (exp) dir
    (w.parent if evolve else w).mkdir(parents=True, exist_ok=True)  # make dir
    # Checkpoint file names: the most recent epoch and the best-fitness epoch
    last, best = w / 'last.pt', w / 'best.pt'

    # Hyperparameters
    if isinstance(hyp, str):
        with open(hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
    LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items()))

    # Save run settings (hyperparameters and command-line options)
    with open(save_dir / 'hyp.yaml', 'w') as f:
        yaml.safe_dump(hyp, f, sort_keys=False)
    with open(save_dir / 'opt.yaml', 'w') as f:
        yaml.safe_dump(vars(opt), f, sort_keys=False)
    data_dict = None

    # Loggers (rank -1/0 only)
    if RANK in [-1, 0]:
        loggers = Loggers(save_dir, weights, opt, hyp, LOGGER)  # loggers instance
        if loggers.wandb:
            data_dict = loggers.wandb.data_dict
            if resume:
                weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp

        # Register actions: every logger method becomes a callback with the same name
        for k in methods(loggers):
            callbacks.register_action(k, callback=getattr(loggers, k))

    # Config
    plots = not evolve  # create plots
    cuda = device.type != 'cpu'
    # Seed the RNGs so runs are reproducible; the seed is offset by RANK
    init_seeds(1 + RANK)
    # NOTE(review): presumably lets the local master process run first - confirm in torch_utils
    with torch_distributed_zero_first(LOCAL_RANK):
        data_dict = data_dict or check_dataset(data)  # check if None
    train_path, val_path = data_dict['train'], data_dict['val']
    nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
    # In single-class mode a placeholder name is used unless the dataset lists exactly one name
    names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = data.endswith('coco.yaml') and nc == 80  # COCO dataset

    # Model
    check_suffix(weights, '.pt')  # check weights
    pretrained = weights.endswith('.pt')
    if pretrained:  # start from an existing checkpoint
        with torch_distributed_zero_first(LOCAL_RANK):
            weights = attempt_download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
        exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
        print('ckpt csd', len(csd))  # debug output
        print('model.state_dict()', len(model.state_dict()))  # debug output
        # Keep only checkpoint entries that intersect with the new model, minus `exclude`
        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
        print('intersect_dicts csd', len(csd))  # debug output
        # Initialise the new model from the checkpoint weights; strict=False tolerates missing keys
        model.load_state_dict(csd, strict=False)  # load
        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
    else:
        model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create

    # Freeze
    freeze = [f'model.{x}.' for x in range(freeze)]  # layers to freeze
    for k, v in model.named_parameters():
        v.requires_grad = True  # train all layers
        if any(x in k for x in freeze):
            print(f'freezing {k}')
            v.requires_grad = False  # no gradient -> parameter is not updated (frozen)

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing (batches per optimizer step)
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")

    # g0: BatchNorm weights (no decay), g1: other weights (with decay), g2: biases
    g0, g1, g2 = [], [], []  # optimizer parameter groups
    for v in model.modules():  # walks every module, including nested ones
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
            g2.append(v.bias)
        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
            g0.append(v.weight)
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
            g1.append(v.weight)

    if opt.adam:
        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
    else:
        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)

    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay
    optimizer.add_param_group({'params': g2})  # add g2 (biases)
    # g0 is the no-decay group and g1 the decayed group; the labels must follow that order
    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
                f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias")
    del g0, g1, g2

    # Scheduler
    if opt.linear_lr:  # enabled with --linear-lr
        # Falls linearly from 1 at epoch 0 to hyp['lrf'] at the last epoch (epochs - 1).
        # max(..., 1) avoids a division by zero when training for a single epoch.
        lf = lambda x: (1 - x / max(epochs - 1, 1)) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
    else:
        lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)

    # EMA (exponential moving average of the model), kept on rank -1/0 only
    ema = ModelEMA(model) if RANK in [-1, 0] else None

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:  # None when the checkpoint carries no optimizer state
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # EMA
        if ema and ckpt.get('ema'):
            ema.ema.load_state_dict(ckpt['ema'].float().state_dict())
            ema.updates = ckpt['updates']

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if resume:
            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
        # If fewer epochs are requested than already trained, train `epochs` more;
        # otherwise continue from start_epoch up to `epochs`.
        if epochs < start_epoch:
            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, csd

    # Image sizes
    gs = max(int(model.stride.max()), 32)  # grid size (max stride)
    nl = model.model[-1].nl  # number of detection layers (used for scaling hyp['obj'])
    # Check that the image size is a multiple of the stride; the returned size is a gs-multiple
    imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2)  # verify imgsz is gs-multiple

    # DP mode (single process, several GPUs)
    if cuda and RANK == -1 and torch.cuda.device_count() > 1:
        # Use the module logger, like every other message in this file
        LOGGER.warning('DP not recommended, instead use torch.distributed.run for best DDP Multi-GPU results.\n'
                       'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.')
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm (DDP only)
    if opt.sync_bn and cuda and RANK != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        LOGGER.info('Using SyncBatchNorm()')

    # Trainloader (augmentation is always on here; gs is passed as the stride)
    train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls,
                                              hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=LOCAL_RANK,
                                              workers=workers, image_weights=opt.image_weights, quad=opt.quad,
                                              prefix=colorstr('train: '))
    mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max())  # max label class
    nb = len(train_loader)  # number of batches
    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'

    # Process 0
    if RANK in [-1, 0]:
        # Validation uses twice the per-process training batch size
        val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls,
                                       hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1,
                                       workers=workers, pad=0.5,
                                       prefix=colorstr('val: '))[0]  # [0] keeps the loader, drops the dataset
        print('resume\n', resume)  # debug output

        if not resume:
            labels = np.concatenate(dataset.labels, 0)
            # c = torch.tensor(labels[:, 0])  # classes
            # cf = torch.bincount(c.long(), minlength=nc) + 1.  # frequency
            # model._initialize_biases(cf.to(device))
            if plots:
                plot_labels(labels, names, save_dir)

            # Anchors (skipped with --noautoanchor)
            if not opt.noautoanchor:
                check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
            model.half().float()  # pre-reduce anchor precision

        callbacks.run('on_pretrain_routine_end')

    # DDP mode (DistributedDataParallel)
    if cuda and RANK != -1:
        model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK)

    # Model parameters
    hyp['box'] *= 3. / nl  # scale to layers (box loss gain)
    hyp['cls'] *= nc / 80. * 3. / nl  # scale to classes and layers (class loss gain)
    hyp['obj'] *= (imgsz / 640) ** 2 * 3. / nl  # scale to image size and layers (objectness loss gain)
    hyp['label_smoothing'] = opt.label_smoothing  # comes from the CLI, not from the hyp file
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
    model.names = names

    # Start training
    t0 = time.time()
    # Warmup length is counted in batches (iterations), not in epochs
    nw = max(round(hyp['warmup_epochs'] * nb), 1000)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    last_opt_step = -1  # integrated batch index of the previous optimizer step
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls)
    scheduler.last_epoch = start_epoch - 1  # do not move (continues the schedule when resuming)
    scaler = amp.GradScaler(enabled=cuda)  # gradient scaling for mixed-precision training
    stopper = EarlyStopping(patience=opt.patience)
    compute_loss = ComputeLoss(model)  # init loss class
    LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n'
                f'Using {train_loader.num_workers} dataloader workers\n'
                f"Logging results to {colorstr('bold', save_dir)}\n"
                f'Starting training for {epochs} epochs...')
    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional, single-GPU only)
        if opt.image_weights:
            cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc  # class weights
            iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw)  # image weights
            dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n)  # rand weighted idx

        # Update mosaic border (optional)
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(3, device=device)  # mean losses
        if RANK != -1:
            train_loader.sampler.set_epoch(epoch)  # tell the sampler which epoch this is
        pbar = enumerate(train_loader)
        LOGGER.info(('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size'))
        if RANK in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float() / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                # lr and momentum are re-interpolated for every param group on every warmup batch
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:  # hyp['momentum'] is the value reached at the end of warmup
                        x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                # randrange needs integer bounds: float arguments are deprecated since
                # Python 3.10 and raise TypeError from 3.12.
                sz = random.randrange(int(imgsz * 0.5), int(imgsz * 1.5) + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

            # Forward (the wrapped region runs under autocast when CUDA is used)
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(pred, targets.to(device))  # loss scaled by batch_size
                if RANK != -1:
                    loss *= WORLD_SIZE  # gradient averaged between devices in DDP mode
                if opt.quad:
                    loss *= 4.

            # Backward
            scaler.scale(loss).backward()

            # Optimize (one optimizer step every `accumulate` batches; the value changes during warmup)
            if ni - last_opt_step >= accumulate:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)
                last_opt_step = ni

            # Log
            if RANK in [-1, 0]:
                # Running mean: (previous mean * batches so far + current loss) / (batches so far + 1)
                mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (
                    f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1]))
                callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn)
            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler (stepped once per epoch)
        lr = [x['lr'] for x in optimizer.param_groups]  # for loggers
        scheduler.step()

        if RANK in [-1, 0]:
            # mAP
            callbacks.run('on_train_epoch_end', epoch=epoch)
            # Copy the listed attributes from the live model onto the EMA model
            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = (epoch + 1 == epochs) or stopper.possible_stop
            if not noval or final_epoch:  # Calculate mAP (with --noval only on the final epoch)
                results, maps, _ = val.run(data_dict,
                                           batch_size=batch_size // WORLD_SIZE * 2,
                                           imgsz=imgsz,
                                           model=ema.ema,
                                           single_cls=single_cls,
                                           dataloader=val_loader,
                                           save_dir=save_dir,
                                           plots=False,
                                           callbacks=callbacks,
                                           compute_loss=compute_loss)

            # Update best mAP
            fi = fitness(np.array(results).reshape(1, -1))  # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
            if fi > best_fitness:  # strictly greater: a tie keeps the earlier best
                best_fitness = fi
            log_vals = list(mloss) + list(results) + lr
            callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi)

            # Save model (every epoch unless --nosave; the final epoch is saved unless evolving)
            if (not nosave) or (final_epoch and not evolve):  # if save
                ckpt = {'epoch': epoch,
                        'best_fitness': best_fitness,
                        'model': deepcopy(de_parallel(model)).half(),
                        'ema': deepcopy(ema.ema).half(),
                        'updates': ema.updates,
                        'optimizer': optimizer.state_dict(),
                        'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None}

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:  # this epoch is the best so far
                    torch.save(ckpt, best)
                # Extra snapshot every `save_period` epochs
                if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0):
                    torch.save(ckpt, w / f'epoch{epoch}.pt')
                del ckpt  # release memory
                callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi)

            # Stop Single-GPU (leave the loop once early stopping triggers)
            if RANK == -1 and stopper(epoch=epoch, fitness=fi):
                break

            # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576
            # stop = stopper(epoch=epoch, fitness=fi)
            # if RANK == 0:
            #    dist.broadcast_object_list([stop], 0)  # broadcast 'stop' to all ranks

        # Stop DPP
        # with torch_distributed_zero_first(RANK):
        # if stop:
        #    break  # must break all DDP ranks

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training -----------------------------------------------------------------------------------------------------
    if RANK in [-1, 0]:
        LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.')
        for f in last, best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is best:
                    LOGGER.info(f'\nValidating {f}...')
                    # Final validation of the best checkpoint, this time with plots enabled
                    results, _, _ = val.run(data_dict,
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(f, device).half(),
                                            iou_thres=0.65 if is_coco else 0.60,  # best pycocotools results at 0.65
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
                                            save_json=is_coco,
                                            verbose=True,
                                            plots=True,
                                            callbacks=callbacks,
                                            compute_loss=compute_loss)  # val best model with plots

        callbacks.run('on_train_end', last, best, plots, epoch)
        LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}")

    # Release cached GPU memory
    torch.cuda.empty_cache()
    return results
# --data coco128.yaml --weights yolov5s.pt --img 640 --cfg models/yolov5s.yaml --cache --rect --epochs 2
def parse_opt(known=False):
    """Define the training command-line options and parse them.

    Args:
        known: When true, unrecognised arguments are ignored
            (``parse_known_args``); otherwise they are an error (``parse_args``).

    Returns:
        The parsed ``argparse.Namespace``. Dashes in option names become
        underscores in attribute names (``--batch-size`` -> ``batch_size``).
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument

    # Initial weights
    arg('--weights', type=str, default=ROOT / 'yolov5s.pt', help='initial weights path')
    # Model configuration (network structure)
    arg('--cfg', type=str, default='', help='model.yaml path')
    # Dataset configuration
    arg('--data', type=str, default=ROOT / 'data/coco128.yaml', help='dataset.yaml path')
    # Hyperparameter configuration
    arg('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch.yaml', help='hyperparameters path')
    # Number of training epochs
    arg('--epochs', type=int, default=300)
    # Batch size
    arg('--batch-size', type=int, default=16, help='total batch size for all GPUs')
    # Input image size
    arg('--imgsz', '--img', '--img-size', type=int, default=640, help='train, val image size (pixels)')
    # Rectangular training; a bare flag, no True/False value is given
    arg('--rect', action='store_true', help='rectangular training')
    # Continue an interrupted run; optionally give a checkpoint,
    # e.g. --resume './runs/exp0/weights/last.pt'
    arg('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    # Skip the per-epoch checkpoint and only save the final one
    arg('--nosave', action='store_true', help='only save final checkpoint')
    # Validate only on the final epoch
    arg('--noval', action='store_true', help='only validate final epoch')
    # Skip the anchor check
    arg('--noautoanchor', action='store_true', help='disable autoanchor check')
    # Hyperparameter evolution; a bare --evolve means 300 generations
    arg('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    # gsutil bucket name
    arg('--bucket', type=str, default='', help='gsutil bucket')
    # Image caching; a bare --cache means "ram", "disk" can be given explicitly
    arg('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
    # Weighted image selection; a bare flag ("--image-weights False" is not valid)
    arg('--image-weights', action='store_true', help='use weighted image selection for training')
    # Device selection, e.g. --device 0,1,2,3 or --device cpu
    arg('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    # Multi-scale training: the image size is varied by +/- 50%
    arg('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    # Treat all classes as one
    arg('--single-cls', action='store_true', help='train multi-class data as single-class')
    # Use Adam instead of SGD
    arg('--adam', action='store_true', help='use torch.optim.Adam() optimizer')
    # Synchronised BatchNorm, DDP (DistributedDataParallel) mode only
    arg('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    # Upper bound on dataloader workers
    arg('--workers', type=int, default=8, help='maximum number of dataloader workers')
    # Output location: <project>/<name>
    arg('--project', default=ROOT / 'runs/train', help='save to project/name')
    arg('--name', default='exp', help='save to project/name')
    # Reuse an existing output directory instead of creating an incremented one
    arg('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    # Quad dataloader
    arg('--quad', action='store_true', help='quad dataloader')
    # Linear learning-rate schedule
    arg('--linear-lr', action='store_true', help='linear LR')
    # Label smoothing amount
    arg('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    # Early-stopping patience, in epochs without improvement
    arg('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
    # How many leading layers to freeze
    arg('--freeze', type=int, default=0, help='Number of layers to freeze. backbone=10, all=24')
    # Extra checkpoint every N epochs
    arg('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    # Local process rank for DDP
    arg('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')

    # Weights & Biases arguments
    arg('--entity', default=None, help='W&B: Entity')
    arg('--upload_dataset', action='store_true', help='W&B: Upload dataset as artifact table')
    arg('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
    arg('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')

    if known:
        return parser.parse_known_args()[0]
    return parser.parse_args()
def main(opt, callbacks=None):
    """Validate the options, set up the device and DDP, then train or evolve.

    Args:
        opt: Options namespace as produced by ``parse_opt``. It is updated in
            place (checked paths, ``save_dir``), or replaced by the stored
            ``opt.yaml`` of the run being resumed.
        callbacks: Callback registry handed to ``train``. When omitted, a fresh
            ``Callbacks()`` is created for this call, so hooks registered during
            one call do not leak into the next one.

    Without ``opt.evolve`` a single ``train`` run is started. With it, ``train``
    is run ``opt.evolve`` times on mutated hyperparameters and the outcome of
    every generation is recorded through ``print_mutation``.
    """
    # A default of `Callbacks()` in the signature would be built once at definition
    # time and shared by every call; build it per call instead.
    if callbacks is None:
        callbacks = Callbacks()

    # Checks
    set_logging(RANK)
    if RANK in [-1, 0]:
        # Print the argument values
        print_args(FILE.stem, opt)
        # Check the git status of the repository
        check_git_status()
        # Check that the required packages are installed, ignoring thop
        check_requirements(exclude=['thop'])

    # Resume
    if opt.resume and not check_wandb_resume(opt) and not opt.evolve:  # resume an interrupted run
        # Checkpoint to continue from: the given path, or the most recent run
        ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
        assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
        with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f:
            # Turn the stored dict back into a Namespace so attribute access keeps working
            opt = argparse.Namespace(**yaml.safe_load(f))  # replace
        opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
        LOGGER.info(f'Resuming training from {ckpt}')
    else:
        # Check the given file paths and normalise weights/project to str
        opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \
            check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project)  # checks
        assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
        if opt.evolve:
            # Evolution runs are stored under runs/evolve instead of runs/train
            opt.project = str(ROOT / 'runs/evolve')
            opt.exist_ok, opt.resume = opt.resume, False  # pass resume to exist_ok and disable resume
        # Run directory, incremented (exp, exp2, ...) unless exist_ok is set
        opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))

    # DDP mode
    # NOTE(review): select_device presumably falls back to CPU when CUDA is unavailable - confirm
    device = select_device(opt.device, batch_size=opt.batch_size)
    if LOCAL_RANK != -1:
        assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
        assert not opt.evolve, '--evolve argument is not compatible with DDP training'
        torch.cuda.set_device(LOCAL_RANK)
        device = torch.device('cuda', LOCAL_RANK)  # one GPU per process, chosen by local rank
        # Initialise the distributed process group
        dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo")

    # Train
    if not opt.evolve:
        train(opt.hyp, opt, device, callbacks)
        if WORLD_SIZE > 1 and RANK == 0:
            LOGGER.info('Destroying process group... ')
            dist.destroy_process_group()

    # Evolve hyperparameters (optional)
    else:
        # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
        meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
                'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
                'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
                'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
                'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
                'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
                'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
                'box': (1, 0.02, 0.2),  # box loss gain
                'cls': (1, 0.2, 4.0),  # cls loss gain
                'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
                'obj': (1, 0.2, 4.0),  # obj loss gain (scale with pixels)
                'obj_pw': (1, 0.5, 2.0),  # obj BCELoss positive_weight
                'iou_t': (0, 0.1, 0.7),  # IoU training threshold
                'anchor_t': (1, 2.0, 8.0),  # anchor-multiple threshold
                'anchors': (2, 2.0, 10.0),  # anchors per output grid (0 to ignore)
                'fl_gamma': (0, 0.0, 2.0),  # focal loss gamma (efficientDet default gamma=1.5)
                'hsv_h': (1, 0.0, 0.1),  # image HSV-Hue augmentation (fraction)
                'hsv_s': (1, 0.0, 0.9),  # image HSV-Saturation augmentation (fraction)
                'hsv_v': (1, 0.0, 0.9),  # image HSV-Value augmentation (fraction)
                'degrees': (1, 0.0, 45.0),  # image rotation (+/- deg)
                'translate': (1, 0.0, 0.9),  # image translation (+/- fraction)
                'scale': (1, 0.0, 0.9),  # image scale (+/- gain)
                'shear': (1, 0.0, 10.0),  # image shear (+/- deg)
                'perspective': (0, 0.0, 0.001),  # image perspective (+/- fraction), range 0-0.001
                'flipud': (1, 0.0, 1.0),  # image flip up-down (probability)
                'fliplr': (0, 0.0, 1.0),  # image flip left-right (probability)
                'mosaic': (1, 0.0, 1.0),  # image mosaic (probability)
                'mixup': (1, 0.0, 1.0),  # image mixup (probability)
                'copy_paste': (1, 0.0, 1.0)}  # segment copy-paste (probability)

        with open(opt.hyp, errors='ignore') as f:
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
        opt.noval, opt.nosave, save_dir = True, True, Path(opt.save_dir)  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv'
        if opt.bucket:
            os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {save_dir}')  # download evolve.csv if exists

        for _ in range(opt.evolve):  # generations to evolve
            if evolve_csv.exists():  # if evolve.csv exists: select best hyps and mutate
                # Select parent(s)
                parent = 'single'  # parent selection method: 'single' or 'weighted'
                x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1)
                n = min(5, len(x))  # number of previous results to consider
                x = x[np.argsort(-fitness(x))][:n]  # top n mutations
                w = fitness(x) - fitness(x).min() + 1E-6  # weights (sum > 0)
                if parent == 'single' or len(x) == 1:
                    # x = x[random.randint(0, n - 1)]  # random selection
                    x = x[random.choices(range(n), weights=w)[0]]  # weighted selection
                elif parent == 'weighted':
                    x = (x * w.reshape(n, 1)).sum(0) / w.sum()  # weighted combination

                # Mutate
                mp, s = 0.8, 0.2  # mutation probability, sigma
                npr = np.random
                npr.seed(int(time.time()))
                g = np.array([meta[k][0] for k in hyp.keys()])  # gains 0-1
                ng = len(meta)
                v = np.ones(ng)
                while all(v == 1):  # mutate until a change occurs (prevent duplicates)
                    v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0)
                # NOTE(review): the +7 offset presumably skips 7 result columns of evolve.csv - confirm
                for i, k in enumerate(hyp.keys()):  # plt.hist(v.ravel(), 300)
                    hyp[k] = float(x[i + 7] * v[i])  # mutate

            # Constrain to limits
            for k, v in meta.items():
                hyp[k] = max(hyp[k], v[1])  # lower limit
                hyp[k] = min(hyp[k], v[2])  # upper limit
                hyp[k] = round(hyp[k], 5)  # significant digits

            # Train mutation (a copy is passed because train() modifies hyp in place)
            results = train(hyp.copy(), opt, device, callbacks)

            # Write mutation results
            print_mutation(results, hyp.copy(), save_dir, opt.bucket)

        # Plot results
        plot_evolve(evolve_csv)
        print(f'Hyperparameter evolution finished\n'
              f"Results saved to {colorstr('bold', save_dir)}\n"
              f'Use best hyperparameters example: $ python train.py --hyp {evolve_yaml}')
# Programmatic entry point so that other modules can start training
def run(**kwargs):
    """Start training from Python code instead of the command line.

    The options start from ``parse_opt(True)`` and every keyword argument then
    overrides the attribute of the same name before ``main`` is called.

    Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
    """
    opt = parse_opt(True)
    for option_name, option_value in kwargs.items():
        setattr(opt, option_name, option_value)
    main(opt)
# Script entry point: parse the command line strictly, then hand over to main().
if __name__ == "__main__":
    opt = parse_opt(known=False)
    main(opt)
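# NOTE: hosting-page residue, not part of the script: "This location may contain content that is unsuitable for display, so the page does not show it. You can review and revise it with the relevant editing features."
# "If you confirm that the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, and nothing that violates national laws and regulations, you may click submit to appeal and we will handle it as soon as possible."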