1 Star 0 Fork 0

光井/finetune_llm

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
sft.py 13.24 KB
一键复制 编辑 原始数据 按行查看 历史
赵国梁 提交于 2024-01-17 20:50 . ambrose_20240117
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
from model import Transformer, ModelArgs
from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import pandas as pd
from dataset_sft import SFTDataset
import logging
import json
import torch.nn.functional as F
from chatglm_tokenizer.tokenization_chatglm import ChatGLMTokenizer
def get_logger(filename, verbosity=1, name=None):
    """Return a logger that writes to both ``filename`` and the console.

    Calling this function repeatedly for the same logger and file no longer
    stacks duplicate handlers (which would repeat every log line); handlers
    installed by other code on the same logger are left untouched.

    Args:
        filename: Path of the log file. It is opened in ``"w"`` mode, so an
            existing file is truncated when the file handler is created.
        verbosity: 0 -> DEBUG, 1 -> INFO, 2 -> WARNING.
        name: Logger name passed to ``logging.getLogger``; ``None`` selects
            the root logger.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        KeyError: If ``verbosity`` is not 0, 1 or 2.
    """
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    # Only consider handlers created by a previous call to this function, so
    # third-party handlers on the same logger never suppress ours.
    owned = [
        h for h in logger.handlers
        if getattr(h, "_owned_by_get_logger", False)
    ]
    # FileHandler stores its target as an absolute path in ``baseFilename``.
    log_path = os.path.abspath(filename)
    has_file = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == log_path
        for h in owned
    )
    # FileHandler subclasses StreamHandler, so exclude it explicitly.
    has_stream = any(not isinstance(h, logging.FileHandler) for h in owned)

    if not has_file:
        fh = logging.FileHandler(filename, "w")
        fh.setFormatter(formatter)
        fh._owned_by_get_logger = True
        logger.addHandler(fh)
    if not has_stream:
        sh = logging.StreamHandler()
        sh.setFormatter(formatter)
        sh._owned_by_get_logger = True
        logger.addHandler(sh)
    return logger
# -----------------------------------------------------------------------------
def get_lr(it):
    """Return the learning rate for global iteration ``it``.

    The schedule reads the module-level settings ``warmup_iters``,
    ``lr_decay_iters``, ``learning_rate`` and ``min_lr`` and has three phases:

    1. ``it < warmup_iters``: linear ramp from 0 up to ``learning_rate``.
    2. ``it > lr_decay_iters``: constant ``min_lr``.
    3. otherwise: cosine decay from ``learning_rate`` down to ``min_lr``.
    """
    # Phase 1: linear warmup.
    in_warmup = it < warmup_iters
    if in_warmup:
        return learning_rate * it / warmup_iters

    # Phase 2: past the end of the decay window, stay at the floor.
    past_decay = it > lr_decay_iters
    if past_decay:
        return min_lr

    # Phase 3: cosine decay; ``progress`` runs from 0 to 1 over the window.
    decay_span = lr_decay_iters - warmup_iters
    progress = (it - warmup_iters) / decay_span
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # 1 -> 0
    lr_range = learning_rate - min_lr
    return min_lr + cosine_weight * lr_range
def train_epoch(epoch):
    """Train the model for one pass over ``train_loader``.

    Uses module-level state created by the ``__main__`` section: ``model``,
    ``optimizer``, ``scaler``, ``ctx``, ``train_loader``, ``device``,
    ``logger``, ``ddp`` and the hyper-parameters ``decay_lr``,
    ``learning_rate``, ``grad_clip``, ``log_interval``, ``iter_per_epoch``
    and ``max_epoch``.

    Args:
        epoch: Zero-based epoch index. Together with the step index it forms
            the global iteration number handed to ``get_lr``.
    """
    start_time = time.time()
    for step, (X, Y, loss_mask) in enumerate(train_loader):
        X = X.to(device)
        Y = Y.to(device)
        loss_mask = loss_mask.to(device)

        # Set the learning rate for this iteration on every parameter group.
        lr = get_lr(epoch * iter_per_epoch + step) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        if ddp:
            # There is no micro-step loop here: every iteration ends with an
            # optimizer step, so gradients must be synchronised across ranks
            # on every backward pass. The previous expression
            # (0 == gradient_accumulation_steps - 1) turned syncing off for
            # any gradient_accumulation_steps other than 1.
            model.require_backward_grad_sync = True

        with ctx:
            logits = model(X, Y)
            # Per-token loss; `reduction='none'` replaces the deprecated
            # `reduce=False` and keeps one loss value per target position.
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                Y.view(-1),
                ignore_index=0,
                reduction='none',
            )
            loss_mask = loss_mask.view(-1)
            # Mask-weighted mean of the per-token losses.
            # NOTE(review): an all-zero loss_mask makes this 0/0 (NaN) —
            # confirm SFTDataset never yields such a batch.
            loss = torch.sum(loss * loss_mask) / loss_mask.sum()

        # Backward pass, with gradient scaling if training in fp16.
        scaler.scale(loss).backward()

        # Clip the gradient (gradients must be unscaled first).
        if grad_clip != 0.0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Step the optimizer and scaler if training in fp16.
        scaler.step(optimizer)
        scaler.update()
        # Flush the gradients as soon as we can, no need for this memory anymore.
        optimizer.zero_grad(set_to_none=True)

        # Log progress.
        if step % log_interval == 0:
            spend_time = time.time() - start_time
            logger.info(
                'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.7f} epoch_Time:{}min:'.format(
                    epoch,
                    max_epoch,
                    step,
                    iter_per_epoch,
                    loss.item(),
                    optimizer.param_groups[-1]['lr'],
                    # Projected epoch duration in minutes minus the minutes
                    # already spent.
                    spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60))
@torch.no_grad()
def valid_epoch(epoch):
    """Evaluate on ``val_loader`` and keep the best-scoring weights.

    Reads the module-level ``model``, ``val_loader``, ``device``, ``ctx``,
    ``logger``, ``raw_model`` and ``save_dir``, and updates the module-level
    ``best_val_loss`` when the mean validation loss improves. On improvement
    the weights of ``raw_model`` are written to ``<save_dir>/best.pth``.

    NOTE(review): this function unpacks ``model(X, Y)`` as ``(logits, loss)``
    whereas ``train_epoch`` treats the same call as returning logits only —
    confirm which contract the model actually implements before re-enabling
    validation.

    Args:
        epoch: Epoch index, used only in the log message.

    Returns:
        The mean of the per-batch losses.
    """
    global best_val_loss

    model.eval()
    batch_losses = []
    for X, Y in val_loader:
        inputs = X.to(device)
        targets = Y.to(device)
        with ctx:
            logits, loss = model(inputs, targets)
        batch_losses.append(loss.item())
    # Switch back to training mode before returning to the training loop.
    model.train()

    val_loss = np.mean(batch_losses)
    logger.info('valid loss = {:.4f}'.format(val_loss))

    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        logger.info('best val_loss: {} best_epoch: {} '.format(best_val_loss, epoch))
        best_path = '{}/best.pth'.format(save_dir)
        torch.save(raw_model.state_dict(), best_path)

    return val_loss
def init_model():
    """Build the ``Transformer`` according to the module-level settings.

    Reads ``init_from``, ``dim``, ``n_layers``, ``n_heads``, ``multiple_of``,
    ``max_seq_len``, ``dropout`` and, when resuming, ``out_dir`` and
    ``device``.

    * ``init_from == "scratch"``: a freshly initialised model.
    * ``init_from == "resume"``: the model stored in ``<out_dir>/ckpt.pt``;
      the architecture fields are taken from the checkpoint so the weights
      fit.

    Returns:
        The constructed ``Transformer``.

    Raises:
        ValueError: If ``init_from`` is neither ``"scratch"`` nor
            ``"resume"`` (previously this surfaced as an UnboundLocalError
            on the final ``return``).
    """
    if init_from not in ("scratch", "resume"):
        raise ValueError(
            f"Unsupported init_from={init_from!r}; expected 'scratch' or 'resume'"
        )

    # model init
    model_args = dict(
        dim=dim,
        n_layers=n_layers,
        n_heads=n_heads,
        n_kv_heads=n_heads,
        vocab_size=64793,
        multiple_of=multiple_of,
        max_seq_len=max_seq_len,
        dropout=dropout,
    )  # start with model_args from the module-level configuration

    if init_from == "scratch":
        # init a new model from scratch
        print("Initializing a new model from scratch")
        gptconf = ModelArgs(**model_args)
        return Transformer(gptconf)

    print(f"Resuming training from {out_dir}")
    # resume training from a checkpoint.
    ckpt_path = os.path.join(out_dir, "ckpt.pt")
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint["model_args"]
    # force these config attributes to be equal otherwise we can't even resume training
    # the rest of the attributes (e.g. dropout) can stay as configured
    for k in ["dim", "n_layers", "n_heads", "n_kv_heads", "vocab_size", "multiple_of", "max_seq_len"]:
        model_args[k] = checkpoint_model_args[k]
    # create the model
    gptconf = ModelArgs(**model_args)
    model = Transformer(gptconf)
    state_dict = checkpoint["model"]
    # Strip the "_orig_mod." prefix from state-dict keys so they match the
    # plain model's parameter names.
    unwanted_prefix = "_orig_mod."
    for k in list(state_dict):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    # The checkpoint's "iter_num" and "best_val_loss" used to be read into
    # locals that were never used or returned; those dead assignments are
    # dropped. Only the weights are restored here.
    return model
# I/O
if __name__ == "__main__":
    # Script entry point: configure the run, build the SFT dataloader, load
    # the pretrained weights, then train for `max_epoch` epochs, saving one
    # checkpoint per epoch under `<out_dir>/sft`.
    out_dir = 'out'
    max_epoch = 10
    eval_interval = 1
    log_interval = 50
    eval_iters = 200
    eval_only = False  # if True, script exits right after the first eval
    always_save_checkpoint = True  # if True, always save a checkpoint after each eval
    init_from = 'scratch'  # 'scratch' or 'resume'
    #
    gradient_accumulation_steps = 1  # used to simulate larger batch sizes
    batch_size = 10  # if gradient_accumulation_steps > 1, this is the micro-batch size
    # model
    max_seq_len = 1024
    dim = 1024
    n_layers = 12
    n_heads = 8
    multiple_of = 32
    dropout = 0.0  # for pretraining 0 is good, for finetuning try 0.1+
    bias = False  # do we use bias inside LayerNorm and Linear layers?
    # adamw optimizer
    learning_rate = 2e-5  # max learning rate
    weight_decay = 1e-4
    beta1 = 0.9
    beta2 = 0.95
    grad_clip = 1.0  # clip gradients at this value, or disable if == 0.0
    # learning rate decay settings
    decay_lr = True  # whether to decay the learning rate
    warmup_iters = 4015  # how many steps to warm up for
    lr_decay_iters = 80300  # should be ~= max_iters per Chinchilla
    min_lr = 1e-6  # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
    # DDP settings
    backend = 'nccl'  # 'nccl', 'gloo', etc.
    # system
    device = 'cuda'  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
    dtype = 'float16'  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
    compile = False  # use PyTorch 2.0 to compile the model to be faster
    # -----------------------------------------------------------------------------
    config_keys = [
        k
        for k, v in globals().items()
        if not k.startswith("_") and isinstance(v, (int, float, bool, str))
    ]
    # exec(open("configurator.py").read()) # overrides from command line or config file
    # config = {k: globals()[k] for k in config_keys} # will be useful for logging
    # -----------------------------------------------------------------------------
    save_dir = os.path.join(out_dir, 'sft')
    # exist_ok avoids the check-then-create race when several DDP ranks start
    # at the same time.
    os.makedirs(save_dir, exist_ok=True)
    logger = get_logger(os.path.join(save_dir, 'log.log'))

    # various inits, derived attributes, I/O setup
    ddp = int(os.environ.get("RANK", -1)) != -1  # is this a ddp run?
    if ddp:
        # Honour the configured backend instead of a hard-coded "nccl".
        init_process_group(backend=backend)
        ddp_rank = int(os.environ["RANK"])
        ddp_local_rank = int(os.environ["LOCAL_RANK"])
        ddp_world_size = int(os.environ["WORLD_SIZE"])
        # NOTE(review): the top of this file sets CUDA_VISIBLE_DEVICES to '0',
        # which leaves only one visible GPU — confirm that before launching
        # with more than one local rank.
        device = f"cuda:{ddp_local_rank}"
        torch.cuda.set_device(device)
        master_process = ddp_rank == 0  # this process will do logging, checkpointing etc.
        seed_offset = ddp_rank  # each process gets a different seed
        # world_size number of processes will be training simultaneously, so we can scale
        # down the desired gradient accumulation iterations per process proportionally
        #assert gradient_accumulation_steps % ddp_world_size == 0
        #gradient_accumulation_steps //= ddp_world_size
    else:
        # if not ddp, we are running on a single gpu, and one process
        master_process = True
        seed_offset = 0
        ddp_world_size = 1
    tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * max_seq_len
    if master_process:
        print(f"tokens per iteration will be: {tokens_per_iter:,}")
        print(f"breaks down as: {gradient_accumulation_steps} grad accum steps * {ddp_world_size} processes * {batch_size} batch size * {max_seq_len} max seq len")
    if master_process:
        os.makedirs(out_dir, exist_ok=True)
    torch.manual_seed(1337 + seed_offset)
    torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
    torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
    device_type = "cuda" if "cuda" in device else "cpu"  # for later use in torch.autocast
    # note: float16 data type will automatically use a GradScaler
    ptdtype = {"float32": torch.float32, "bfloat16": torch.bfloat16, "float16": torch.float16}[dtype]
    # Autocast now follows the configured dtype. Previously fp16 autocast was
    # always used on CUDA, even for 'bfloat16' or 'float32', while the
    # GradScaler below is only enabled for 'float16'.
    use_autocast = device_type == "cuda" and ptdtype is not torch.float32
    ctx = (
        torch.autocast(device_type=device_type, dtype=ptdtype)
        if use_autocast
        else nullcontext()
    )
    #
    best_val_loss = 1e9

    #-----init dataloader------
    df = pd.read_csv('./sft_data/sft_data_norm.csv')
    # input=[]
    # target=[]
    # with open('../track1/train_valid.json','r') as f:
    #     data=json.load(f)
    # #
    # for l in data:
    #     input.append(l['question'])
    #     target.append(l['answer'])
    # df = pd.DataFrame()
    # df['prompt']=input
    # df['answer']=target
    # df=pd.concat((df_sft,df[100:])).reset_index(drop=True)
    # Shuffle the rows once up front; the DataLoader itself does not shuffle.
    df = df.sample(frac=1.0)
    print(df)
    tokenizer = ChatGLMTokenizer(vocab_file='./chatglm_tokenizer/tokenizer.model')
    train_ds = SFTDataset(df, tokenizer, max_length=512)
    train_loader = torch.utils.data.DataLoader(
        train_ds,
        batch_size=batch_size,
        pin_memory=False,
        drop_last=False,
        shuffle=False,
        num_workers=0,
    )
    # val_ds = PretrainDataset(data_path_list, max_length=256)
    # val_loader = torch.utils.data.DataLoader(
    #     val_ds,
    #     batch_size=batch_size,
    #     pin_memory=False,
    #     drop_last=False,
    #     shuffle=False,
    #     num_workers=0,
    # )

    #init model
    model = init_model()
    # Load the pretrained weights on the CPU first; the model is moved to the
    # target device right after, so no rank allocates the checkpoint on the
    # device it happened to be saved from.
    model.load_state_dict(torch.load('./out/1024-1024-12-8/epoch_0.pth', map_location="cpu"))
    model.to(device)
    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
    # optimizer
    optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
    # The learning rate is set explicitly in train_epoch (via get_lr, or the
    # constant learning_rate), so no torch lr_scheduler is created here.
    iter_per_epoch = len(train_loader)
    # compile the model
    if compile:
        print("compiling the model... (takes a ~minute)")
        unoptimized_model = model
        model = torch.compile(model)  # requires PyTorch 2.0
    # wrap model into DDP container
    if ddp:
        # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
        # construction time since NCCL does not support `ComplexFloat`
        prefix = "_orig_mod." if compile else ""
        model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
        model = DDP(model, device_ids=[ddp_local_rank])
    #
    raw_model = model.module if ddp else model  # unwrap DDP container if needed

    # training loop
    for epoch in range(max_epoch):
        train_epoch(epoch)
        #val_loss=valid_epoch(epoch)
        # Only one process writes the checkpoint; under DDP every rank would
        # otherwise write the same file concurrently.
        if master_process:
            torch.save(raw_model.state_dict(), '{}/epoch_{}.pth'.format(save_dir, epoch))
    if ddp:
        destroy_process_group()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/JackeyLove99/finetune_llm.git
git@gitee.com:JackeyLove99/finetune_llm.git
JackeyLove99
finetune_llm
finetune_llm
master

搜索帮助