sxlj / BERT fine-tuning for NER (Gitee: https://gitee.com/lj857335332/bert-spinner---ner-task)
trainer.py (committed by sxlj on 2021-07-28 14:57, "add notes")
import os
import logging
from tqdm import tqdm, trange
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup
from bert_finetune_ner.utils import MODEL_CLASSES, compute_metrics, get_slot_labels
# Initialize the logger
logger = logging.getLogger(__name__)


class Trainer(object):
    """
    Trainer class: defines training and evaluation for the NER task.
    """
    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
        # Run configuration / hyperparameters
        self.args = args
        # Datasets
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset
        # Named-entity (slot) label list
        self.slot_label_lst = get_slot_labels(args)
        # Label index ignored when computing the loss
        self.pad_token_label_id = args.ignore_index
        # Load the pretrained model
        self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
        # Load the model configuration
        self.config = self.config_class.from_pretrained(args.model_name_or_path, finetuning_task=args.task)
        self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                      config=self.config,
                                                      args=args,
                                                      slot_label_lst=self.slot_label_lst)
        # Device: GPU or CPU
        self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        self.model.to(self.device)

    def train(self):
        """
        Fine-tuning loop.
        """
        # Wrap the training set in a DataLoader with random sampling
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)
        # Compute the total number of optimizer steps; t_total drives the learning-rate schedule
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs
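        # Illustrative arithmetic: with 1000 batches per epoch,
        # gradient_accumulation_steps=4 and num_train_epochs=3, the loop below
        # performs 1000 // 4 * 3 = 750 optimizer steps, so t_total = 750.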
        for n, p in self.model.named_parameters():
            print(n)
        # Prepare optimizer and schedule (linear warmup and decay)
        optimizer_grouped_parameters = []
        # BERT parameters get a lower learning rate
        bert_params = list(self.model.bert.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        # Apply weight decay to some parameters but not to others
        optimizer_grouped_parameters += [
            {
                'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
                'weight_decay': self.args.weight_decay,
                "lr": self.args.learning_rate,
            },
            {
                'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                'lr': self.args.learning_rate,
            }
        ]
        # Linear classifier parameters
        linear_params = list(self.model.slot_classifier.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters += [
            {
                'params': [p for n, p in linear_params if not any(nd in n for nd in no_decay)],
                'weight_decay': self.args.weight_decay,
                "lr": self.args.linear_learning_rate,
            },
            {
                'params': [p for n, p in linear_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
                'lr': self.args.linear_learning_rate,
            }
        ]
        # CRF layer parameters
        if self.args.use_crf:
            crf_params = list(self.model.crf.named_parameters())
            no_decay = ['start_transitions', 'end_transitions']
            optimizer_grouped_parameters += [
                {
                    'params': [p for n, p in crf_params if not any(nd in n for nd in no_decay)],
                    'weight_decay': self.args.weight_decay,
                    "lr": self.args.crf_learning_rate,
                },
                {
                    'params': [p for n, p in crf_params if any(nd in n for nd in no_decay)],
                    'weight_decay': 0.0,
                    'lr': self.args.crf_learning_rate,
                }
            ]
        # Optimizer
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        # Learning-rate schedule: linear warmup followed by linear decay
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total)
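        # get_linear_schedule_with_warmup multiplies each group's learning rate
        # by step / warmup_steps while step < warmup_steps, then decays it
        # linearly via (t_total - step) / (t_total - warmup_steps). For example,
        # with warmup_steps=100 and t_total=1000 the multiplier climbs from 0 to
        # 1 over the first 100 steps and falls back to 0 at step 1000.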
        # Train!
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(self.train_dataset))
        logger.info(" Num Epochs = %d", self.args.num_train_epochs)
        logger.info(" Total train batch size = %d", self.args.train_batch_size)
        logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info(" Total optimization steps = %d", t_total)
        logger.info(" Logging steps = %d", self.args.logging_steps)
        logger.info(" Save steps = %d", self.args.save_steps)
        global_step = 0
        tr_loss = 0.0
        # Zero out the gradients
        self.model.zero_grad()
        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
        # Iterate over epochs
        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            # Iterate over the batches in this epoch
            for step, batch in enumerate(epoch_iterator):
                # Switch the model to training mode
                self.model.train()
                # Move the batch to GPU or CPU
                batch = tuple(t.to(self.device) for t in batch)
                # Inputs: token ids, attention mask and NER label sequence
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'slot_labels_ids': batch[3]}
                if self.args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2]
                # Forward pass
                outputs = self.model(**inputs)
                # The first output is the slot (cross-entropy) loss
                loss = outputs[0]
                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps
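                # Illustrative example: with train_batch_size=32 and
                # gradient_accumulation_steps=4, gradients from 4 batches are
                # accumulated before each optimizer step, giving an effective
                # batch size of 128; dividing the loss keeps its scale
                # comparable to training with a single large batch.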
                # Backward pass to compute gradients
                loss.backward()
                tr_loss += loss.item()
                # Update parameters once every gradient_accumulation_steps batches
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    # Clip gradients to prevent them from exploding during training
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
                    # Update the parameters
                    optimizer.step()
                    # Update the learning rate according to the schedule
                    scheduler.step()
                    # Zero out the gradients
                    self.model.zero_grad()
                    global_step += 1
                    # Evaluate on the dev set every logging_steps
                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        self.evaluate("dev")
                    # Save the model every save_steps
                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        self.save_model()
                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break
            if 0 < self.args.max_steps < global_step:
                train_iterator.close()
                break
        return global_step, tr_loss / global_step

    def evaluate(self, mode):
        """
        Evaluation loop.
        :param mode: selects the dev or test dataset
        :return: dict of evaluation results
        """
        if mode == 'test':
            dataset = self.test_dataset
        elif mode == 'dev':
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")
        # The dataset does not need to be shuffled for evaluation;
        # SequentialSampler draws samples in order
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)
        # Eval!
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info(" Num examples = %d", len(dataset))
        logger.info(" Batch size = %d", self.args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        # Collected predictions for all samples
        slot_preds = None
        # Collected gold label sequences
        out_slot_labels_ids = None
        # No gradient updates are needed, so put the model in eval mode
        self.model.eval()
        # Iterate over the evaluation batches
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            # torch.no_grad(): no gradients are computed inside this block
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'slot_labels_ids': batch[3]}
                if self.args.model_type != 'distilbert':
                    inputs['token_type_ids'] = batch[2]
                # Forward pass
                outputs = self.model(**inputs)
                # Slot loss and the prediction logits
                tmp_eval_loss, slot_logits = outputs[:2]
                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Slot prediction
            if slot_preds is None:
                if self.args.use_crf:
                    # decode() in `torchcrf` returns the best label indices directly
                    slot_preds = np.array(self.model.crf.decode(slot_logits))
                else:
                    # detach() cuts the graph so no gradients flow back;
                    # numpy cannot read CUDA tensors, so move them to the CPU first
                    slot_preds = slot_logits.detach().cpu().numpy()
                out_slot_labels_ids = inputs["slot_labels_ids"].detach().cpu().numpy()
            else:
                if self.args.use_crf:
                    slot_preds = np.append(slot_preds, np.array(self.model.crf.decode(slot_logits)), axis=0)
                else:
                    slot_preds = np.append(slot_preds, slot_logits.detach().cpu().numpy(), axis=0)
                out_slot_labels_ids = np.append(out_slot_labels_ids, inputs["slot_labels_ids"].detach().cpu().numpy(), axis=0)
        eval_loss = eval_loss / nb_eval_steps
        results = {
            "loss": eval_loss
        }
        # Slot result
        if not self.args.use_crf:
            slot_preds = np.argmax(slot_preds, axis=2)  # (n, L, NUM_OF_LABELS) --> (n, L)
        slot_label_map = {i: label for i, label in enumerate(self.slot_label_lst)}
        out_slot_label_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
        slot_preds_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
        # Map predictions back to the original tokens instead of the BERT-processed sequence
        for i in range(out_slot_labels_ids.shape[0]):
            for j in range(out_slot_labels_ids.shape[1]):
                # Positions labelled pad_token_label_id are tokens we do not want to score
                if out_slot_labels_ids[i, j] != self.pad_token_label_id:
                    # Gold labels go into out_slot_label_list
                    out_slot_label_list[i].append(slot_label_map[out_slot_labels_ids[i][j]])
                    # Predicted labels go into slot_preds_list
                    slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])
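        # Example: if WordPiece splits "playing" into ["play", "##ing"], the
        # feature builder typically assigns the real slot label only to "play"
        # and pad_token_label_id to "##ing", so the filter above recovers
        # word-level label sequences of the original sentence length.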
        # Compute the evaluation metrics
        total_result = compute_metrics(slot_preds_list, out_slot_label_list)
        # Merge them into the results dict
        results.update(total_result)
        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info(" %s = %s", key, str(results[key]))
        return results

    def save_model(self):
        # Create the output directory if it does not exist
        if not os.path.exists(self.args.model_dir):
            os.makedirs(self.args.model_dir)
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        # Save the model weights and config
        model_to_save.save_pretrained(self.args.model_dir)
        # Save the training arguments together with the trained model
        torch.save(self.args, os.path.join(self.args.model_dir, 'training_args.bin'))
        logger.info("Saving model checkpoint to %s", self.args.model_dir)

    def load_model(self):
        # Check whether a saved model exists
        if not os.path.exists(self.args.model_dir):
            raise Exception("Model doesn't exist! Train first!")
        try:
            # Load the fine-tuned model
            self.model = self.model_class.from_pretrained(self.args.model_dir,
                                                          args=self.args,
                                                          slot_label_lst=self.slot_label_lst)
            self.model.to(self.device)
            logger.info("***** Model Loaded *****")
        except Exception:
            raise Exception("Some model files might be missing...")