samprasgit/Learn-in-Python

DPCNN.py 21.33 KB
samprasgit committed on 2021-08-27 10:29 (reformat code)

import pandas as pd
import numpy as np
import collections
import re
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, roc_auc_score
from utils.spatial_dropout import SpatialDropout
import warnings
import torch.nn as nn
from tqdm import tqdm
import random
import gensim
import argparse
from torchcontrib.optim import SWA
import os
import logging
from torch.utils import data
from sklearn.metrics import f1_score
import torch.nn.functional as F
from torch.optim import AdamW, Optimizer
torch.set_printoptions(edgeitems=768)
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
SEED = 2021
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE != "cpu":
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
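

# data_process() loads the DaGuan competition CSVs, builds one-hot label rows,
# trains (or reloads) Word2Vec and FastText embeddings on the combined
# train + test corpus, and returns the datasets together with both models.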
def data_process():
    train_data = pd.read_csv(
        "/media/mgege007/winType/DaGuan/data/datagrand_2021_train.csv"
    )
    test_data = pd.read_csv(
        "/media/mgege007/winType/DaGuan/data/datagrand_2021_test.csv"
    )
    id2label = list(train_data["label"].unique())
    label2id = {id2label[i]: i for i in range(len(id2label))}
    y_train = np.zeros((len(train_data), len(id2label)), dtype=np.int8)
    all_sentences = pd.concat([train_data["text"], test_data["text"]]).reset_index(
        drop=True
    )
    # drop_duplicates() is not in-place; reassign so the deduplication takes effect
    all_sentences = all_sentences.drop_duplicates().reset_index(drop=True)
    all_sentences = all_sentences.apply(lambda x: x.split(" ")).tolist()
    if not os.path.exists("./embedding/w2v.model"):
        w2v_model = gensim.models.word2vec.Word2Vec(
            all_sentences,
            sg=1,
            vector_size=300,
            window=7,
            min_count=1,
            negative=3,
            sample=0.001,
            hs=1,
            seed=452,
        )
        w2v_model.save("./embedding/w2v.model")
    else:
        w2v_model = gensim.models.word2vec.Word2Vec.load("./embedding/w2v.model")
    if not os.path.exists("./embedding/fasttext.model"):
        fasttext_model = gensim.models.FastText(
            all_sentences, seed=452, vector_size=100, min_count=1, epochs=20, window=2
        )
        fasttext_model.save("./embedding/fasttext.model")
    else:
        # a FastText model must be loaded with FastText.load, not Word2Vec.load
        fasttext_model = gensim.models.FastText.load("./embedding/fasttext.model")
    train_dataset = []
    ###################################
    # Added a list ylabel that stores the raw class of each training sample
    ###################################
    ylabel = []
    for i in tqdm(range(len(train_data))):
        train_dict = {}
        train_dict["id"] = train_data.loc[i, "id"]
        train_dict["text"] = train_data.loc[i, "text"]
        y_train[i][label2id[train_data.loc[i, "label"]]] = 1
        train_dict["label"] = y_train[i]
        ylabel.append(train_data.loc[i, "label"])
        train_dataset.append(train_dict)
    test_dataset = []
    for i in tqdm(range(len(test_data))):
        test_dict = {}
        test_dict["id"] = test_data.loc[i, "id"]
        test_dict["text"] = test_data.loc[i, "text"]
        test_dict["label"] = -1
        test_dataset.append(test_dict)
    return (
        test_data,
        train_dataset,
        test_dataset,
        w2v_model,
        fasttext_model,
        id2label,
        ylabel,
    )
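

# DataSet maps each token to its Word2Vec vocabulary index (+1, with 0 reserved
# for padding and out-of-vocabulary tokens) and pads/truncates to args.MAX_LEN.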
class DataSet(data.Dataset):
    def __init__(self, args, data, mode="train"):
        self.args = args
        self.data = data
        self.mode = mode
        self.w2v_model = gensim.models.word2vec.Word2Vec.load("./embedding/w2v.model")
        self.dataset = self.get_data(self.data, self.mode)

    def get_data(self, data, mode):
        dataset = []
        for data_li in tqdm(data):
            text = data_li["text"].split(" ")
            # token -> w2v index + 1; index 0 is the OOV/padding slot
            text = [
                self.w2v_model.wv.key_to_index[s] + 1 if s in self.w2v_model.wv else 0
                for s in text
            ]
            # pad or truncate to a fixed length
            if len(text) < self.args.MAX_LEN:
                text += [0] * (self.args.MAX_LEN - len(text))
            else:
                text = text[: self.args.MAX_LEN]
            label = data_li["label"]
            dataset_dict = {"text": text, "label": label}
            dataset.append(dataset_dict)
        return dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        text = torch.tensor(data["text"])
        if self.mode == "test":
            return text
        else:
            label = torch.tensor(data["label"])
            return text, label
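

# Thin wrapper that picks DataLoader settings per split: shuffling and
# drop_last for training, fixed order for validation and test.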
def get_dataloader(args, dataset, mode):
    torchdata = DataSet(args, dataset, mode=mode)
    if mode == "train":
        dataloader = torch.utils.data.DataLoader(
            torchdata,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=4,
            drop_last=True,
        )
    elif mode == "test":
        dataloader = torch.utils.data.DataLoader(
            torchdata,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4,
            drop_last=False,
        )
    elif mode == "valid":
        dataloader = torch.utils.data.DataLoader(
            torchdata,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=4,
            drop_last=True,
        )
    return dataloader, torchdata
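

# Cyclical learning rate (Smith, 2017): the LR sweeps between base_lr and
# max_lr over a cycle of 2 * step_size batches. "triangular2" halves the
# amplitude every cycle; "exp_range" decays it by gamma per iteration.
# Kept for experimentation; train() below uses cosine warm restarts instead.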
class CyclicLR(object):
    def __init__(
        self,
        optimizer,
        base_lr=1e-3,
        max_lr=6e-3,
        step_size=2000,
        mode="triangular",
        gamma=1.0,
        scale_fn=None,
        scale_mode="cycle",
        last_batch_iteration=-1,
    ):
        if not isinstance(optimizer, Optimizer):
            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
        self.optimizer = optimizer
        if isinstance(base_lr, (list, tuple)):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError(
                    "expected {} base_lr, got {}".format(
                        len(optimizer.param_groups), len(base_lr)
                    )
                )
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)
        if isinstance(max_lr, (list, tuple)):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError(
                    "expected {} max_lr, got {}".format(
                        len(optimizer.param_groups), len(max_lr)
                    )
                )
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)
        self.step_size = step_size
        if mode not in ["triangular", "triangular2", "exp_range"] and scale_fn is None:
            raise ValueError("mode is invalid and scale_fn is None")
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == "triangular":
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = "cycle"
            elif self.mode == "triangular2":
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = "cycle"
            elif self.mode == "exp_range":
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = "iterations"
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group["lr"] = lr

    def _triangular_scale_fn(self, x):
        return 1.0

    def _triangular2_scale_fn(self, x):
        return 1 / (2.0 ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma ** x

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)
        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == "cycle":
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs
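

# DPCNN (Johnson & Zhang, 2017): a region-embedding convolution over the
# concatenated Word2Vec + FastText vectors, then repeated blocks of stride-2
# max-pooling followed by two equal-width convolutions with a residual
# connection, halving the sequence length at each block until it collapses.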
# Model definition
class DPCNN(nn.Module):
    def __init__(self, args, vocab_size, embedding_dim, embeddings=None):
        super(DPCNN, self).__init__()
        self.dropout = 0.5  # dropout probability
        self.require_improvement = 1000  # stop early if no gain within 1000 batches
        self.num_classes = 35  # number of classes
        self.learning_rate = 1e-3  # learning rate
        self.num_filters = 250  # number of convolution filters (channels)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = 128
        # token embedding layer
        self.embed = nn.Embedding(self.vocab_size, self.embedding_dim)
        if embeddings:
            w2v_model = gensim.models.word2vec.Word2Vec.load("./embedding/w2v.model").wv
            # a FastText model must be loaded with FastText.load, not Word2Vec.load
            fasttext_model = gensim.models.FastText.load("./embedding/fasttext.model").wv
            w2v_embed_matrix = w2v_model.vectors
            fasttext_embed_matrix = fasttext_model.vectors
            # embed_matrix = w2v_embed_matrix
            embed_matrix = np.concatenate(
                [w2v_embed_matrix, fasttext_embed_matrix], axis=1
            )
            # row 0 is the OOV/padding vector, matching the +1 index shift in DataSet
            oov_embed = np.zeros((1, embed_matrix.shape[1]))
            embed_matrix = torch.from_numpy(np.vstack((oov_embed, embed_matrix)))
            self.embed.weight.data.copy_(embed_matrix)
            self.embed.weight.requires_grad = False
        self.spatial_dropout = SpatialDropout(drop_prob=0.5)
        self.conv_region = nn.Conv2d(1, self.num_filters, (3, self.embedding_dim))
        self.conv = nn.Conv2d(self.num_filters, self.num_filters, (3, 1))
        self.max_pool = nn.MaxPool2d(kernel_size=(3, 1), stride=2)
        self.padding1 = nn.ZeroPad2d((0, 0, 1, 1))
        self.padding2 = nn.ZeroPad2d((0, 0, 0, 1))
        self.relu = nn.ReLU()
        # choose one of the two fully connected heads
        self.fc = nn.Linear(self.num_filters, self.num_classes)
        # self.fc = nn.Sequential(
        #     nn.Linear(self.num_filters, hidden_size),
        #     nn.BatchNorm1d(hidden_size),
        #     nn.ReLU(inplace=True),
        #     nn.Linear(hidden_size, self.num_classes)
        # )
        self._init_parameters()

    def forward(self, x, label=None):
        x = self.embed(x)
        x = self.spatial_dropout(x)
        x = x.unsqueeze(1)
        x = self.conv_region(x)
        x = self.padding1(x)
        x = self.relu(x)
        x = self.conv(x)
        x = self.padding1(x)
        x = self.relu(x)
        x = self.conv(x)
        # keep halving the sequence length until it collapses
        while x.size()[2] > 2:
            x = self._block(x)
        x = x.squeeze(-1)
        x = x.squeeze(-1)
        out = self.fc(x)
        if label is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(
                out.view(-1, self.num_classes).float(),
                label.view(-1, self.num_classes).float(),
            )
            return loss
        else:
            return out

    def _block(self, x):
        x = self.padding2(x)
        px = self.max_pool(x)
        x = self.padding1(px)
        x = F.relu(x)
        x = self.conv(x)
        x = self.padding1(x)
        x = F.relu(x)
        x = self.conv(x)
        # residual connection around the two convolutions
        x = x + px
        return x

    def _init_parameters(self):
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.kaiming_normal_(p)
            else:
                nn.init.constant_(p, 0)
loss_fun = nn.BCEWithLogitsLoss()


def cal_macro_f1(y_true, y_pred):
    score = f1_score(y_true, y_pred, average="macro")
    return score
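

# Runs the model without gradients: in "valid" mode it returns the macro-F1
# and mean validation loss; in "test" mode it returns the per-class sigmoid
# probabilities for every sample.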
def validation_function(model, valid_dataloader, valid_torchdata, mode="valid"):
    model.eval()
    pred_list = []
    val_loss = []
    y_preds = []
    y_trues = []
    with torch.no_grad():
        if mode == "valid":
            for i, (description, label) in enumerate(tqdm(valid_dataloader)):
                output = model(description.to(DEVICE))
                # BCEWithLogitsLoss applies the sigmoid itself, so pass raw logits
                loss = loss_fun(output, label.float().to(DEVICE))
                y_pred = torch.max(output.sigmoid(), 1)[1].cpu().tolist()
                y_label = torch.max(label, 1)[1].cpu().tolist()
                y_trues.extend(y_label)
                y_preds.extend(y_pred)
                val_loss.append(loss.item())
            f1 = cal_macro_f1(y_trues, y_preds)
            return f1, np.mean(val_loss)
        else:
            for i, description in enumerate(tqdm(valid_dataloader)):
                output = model(description.to(DEVICE))
                pred_list += output.sigmoid().detach().cpu().numpy().tolist()
            return pred_list
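

# Training loop: embeddings start frozen (and sit in their own small-LR
# parameter group), then are unfrozen after epoch 2; the LR follows cosine
# annealing with warm restarts, and early stopping tracks the best validation
# loss, keeping one checkpoint per fold.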
def train(
    args,
    model,
    train_dataloader,
    valid_dataloader,
    valid_torchdata,
    epochs,
    model_num,
    early_stop=None,
):
    # ema = EMA(model, 0.999)
    # ema.register()
    param_optimizer = list(model.named_parameters())
    embed_pa = ["embed.weight"]
    # the embedding table gets a much smaller learning rate than the rest
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in embed_pa)
            ]
        },
        {"params": model.embed.parameters(), "lr": 2e-5},
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters, lr=args.lr, amsgrad=True, weight_decay=5e-4
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=5, T_mult=2, eta_min=1e-5, last_epoch=-1
    )
    # scheduler = CyclicLR(optimizer, base_lr=1e-3, max_lr=3e-3,
    #                      step_size=30, mode='exp_range', gamma=0.99994)
    # opt = SWA(optimizer, swa_start=100, swa_freq=5, swa_lr=1e-4)
    best_val_loss = np.inf
    best_f1 = 0.0
    best_loss = np.inf
    no_improve = 0
    for epoch in range(epochs):
        model.train()
        train_loss = []
        if epoch > 2:
            # unfreeze the pretrained embeddings after the warm-up epochs
            for name, param in model.named_parameters():
                if name == "embed.weight":
                    param.requires_grad = True
                    break
        # fgm = FGM(model)
        bar = tqdm(train_dataloader)
        for i, (description, label) in enumerate(bar):
            optimizer.zero_grad()
            loss = model(description.to(DEVICE), label.to(DEVICE))
            loss.backward()
            train_loss.append(loss.item())
            # fgm.attack()
            # loss_adv = model(description.to(DEVICE), label.to(DEVICE))
            # loss_adv.backward()
            # fgm.restore()
            optimizer.step()
            # step the scheduler with the fractional epoch (epoch, not epochs)
            scheduler.step(epoch + i / len(train_dataloader))
            # scheduler.batch_step()
            # ema.update()
            # bar.set_postfix(tloss=np.array(train_loss).mean())
        # opt.swap_swa_sgd()
        # ema.apply_shadow()
        f1, val_loss = validation_function(
            model, valid_dataloader, valid_torchdata, "valid"
        )
        # ema.restore()
        print(
            "Epoch:[{}/{}] train_loss: {:.5f}, val_loss: {:.5f}, f1-score: {:.5f}\n".format(
                epoch + 1, epochs, np.mean(train_loss), val_loss, f1
            )
        )
        if early_stop:
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_f1 = f1
                best_loss = np.mean(train_loss)
                no_improve = 0  # reset the patience counter on improvement
                # ema.apply_shadow()
                # overwrite this fold's checkpoint with the current best weights
                torch.save(
                    model.state_dict(),
                    "./saved/{}_model_{}.bin".format(args.NAME, model_num),
                )
                # ema.restore()
            else:
                no_improve += 1
            # one checkpoint per fold: advance model_num only when the fold ends
            if no_improve == early_stop:
                model_num += 1
                break
            if epoch == epochs - 1:
                model_num += 1
        else:
            if epoch >= epochs - 1:
                torch.save(
                    model.state_dict(),
                    "./saved/{}_model_{}.bin".format(args.NAME, model_num),
                )
                model_num += 1
    return best_val_loss, best_f1, best_loss, model_num
###################################
# Added a parameter ylabel
###################################
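# run() performs stratified k-fold cross-validation: ylabel supplies the class
# of each sample so every fold keeps the label distribution, and each fold
# leaves one checkpoint behind for the test-time ensemble.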
def run(args, train_dataset, w2v_model, fasttext_model, ylabel):
    kf = StratifiedKFold(n_splits=args.FOLD, shuffle=True, random_state=SEED)
    best_mlogloss = []
    best_f1 = []
    best_loss = []
    model_num = 1
    ###################################
    # The added ylabel parameter is used here for stratification
    ###################################
    for i, (train_index, test_index) in enumerate(
        kf.split(np.arange(len(train_dataset)), ylabel)
    ):
        print(str(i + 1), "-" * 50)
        # re-initialize the model each fold so folds do not share weights
        model = DPCNN(
            args,
            w2v_model.wv.vectors.shape[0] + 1,
            w2v_model.wv.vectors.shape[1] + fasttext_model.wv.vectors.shape[1],
            embeddings=True,
        )
        model.to(DEVICE)
        tra = [train_dataset[index] for index in train_index]
        val = [train_dataset[index] for index in test_index]
        print(len(tra))
        print(len(val))
        train_dataloader, train_torchdata = get_dataloader(args, tra, mode="train")
        valid_dataloader, valid_torchdata = get_dataloader(args, val, mode="valid")
        # carry the updated checkpoint counter over to the next fold
        mlogloss, f1, loss, model_num = train(
            args,
            model,
            train_dataloader,
            valid_dataloader,
            valid_torchdata,
            args.epochs,
            model_num,
            early_stop=args.early_step,
        )
        torch.cuda.empty_cache()
        best_mlogloss.append(mlogloss)
        best_f1.append(f1)
        best_loss.append(loss)
    for i in range(args.FOLD):
        print(
            "- Fold {}: best val loss: {} best f1-score: {} best train loss: {}".format(
                i + 1, best_mlogloss[i], best_f1[i], best_loss[i]
            )
        )
    return model_num
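

# get_submit ensembles every saved fold checkpoint by averaging their sigmoid
# outputs, takes the argmax per sample, and writes the submission CSV.
# Note: it relies on the w2v_model / fasttext_model globals set in __main__.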
def get_submit(args, test_data, test_dataset, id2label, model_num):
    model = DPCNN(
        args,
        w2v_model.wv.vectors.shape[0] + 1,
        w2v_model.wv.vectors.shape[1] + fasttext_model.wv.vectors.shape[1],
        embeddings=True,
    )
    model.to(DEVICE)
    test_preds_total = []
    test_dataloader, test_torchdata = get_dataloader(args, test_dataset, mode="test")
    for i in range(1, model_num):
        model.load_state_dict(
            torch.load("./saved/{}_model_{}.bin".format(args.NAME, i))
        )
        test_pred_results = validation_function(
            model, test_dataloader, test_torchdata, "test"
        )
        test_preds_total.append(test_pred_results)
    # average the fold predictions, then pick the highest-probability class
    test_preds_merge = np.sum(test_preds_total, axis=0) / (model_num - 1)
    test_pre_tensor = torch.tensor(test_preds_merge)
    test_pre = torch.max(test_pre_tensor, 1)[1]
    pred_labels = [id2label[i] for i in test_pre]
    # submit_file = '/home/zyf/Summer game2021/Datafountain/submits/submit.csv'
    submit_file = "./submit/submit_{}.csv".format(args.NAME)
    pd.DataFrame({"id": test_data["id"], "label": pred_labels}).to_csv(
        submit_file, index=False
    )
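

# Hyperparameters and paths are controlled from the command line.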
def arg_setting():
    parser = argparse.ArgumentParser()
    parser.add_argument("--NAME", default="DPCNN", type=str, help="")
    parser.add_argument(
        "--MAX_LEN", default=100, type=int, help="max length of sentence"
    )
    parser.add_argument("--batch_size", default=32, type=int, help="")
    parser.add_argument("--FOLD", default=10, type=int, help="k fold")
    parser.add_argument("--epochs", default=40, type=int, help="")
    parser.add_argument("--early_step", default=20, type=int, help="")
    parser.add_argument("--lr", default=1e-3, type=float, help="")
    args = parser.parse_args()
    return args
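

# Example invocation (assuming the data paths above and the ./embedding,
# ./saved, and ./submit directories exist):
#   python DPCNN.py --NAME DPCNN --batch_size 32 --FOLD 10 --epochs 40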
if __name__ == "__main__":
    args = arg_setting()  # parse the basic hyperparameters
    ###################################
    # Added a return value ylabel
    ###################################
    (
        test_data,
        train_dataset,
        test_dataset,
        w2v_model,
        fasttext_model,
        id2label,
        ylabel,
    ) = data_process()
    # start training
    ###################################
    # Pass the extra ylabel argument
    ###################################
    model_num = run(args, train_dataset, w2v_model, fasttext_model, ylabel)
    # write the submission file
    get_submit(args, test_data, test_dataset, id2label, model_num)