From a60209b743baac761777f8448f7c1cc213a9c665 Mon Sep 17 00:00:00 2001
From: xinlianglalala
Date: Fri, 5 May 2023 10:48:24 +0800
Subject: [PATCH 1/2] add prompt_tuning adapter for t5

---
 configs/t5/model_config/t5_small.yaml     |  12 +++
 configs/t5/task_config/runner.yaml        |   7 +-
 configs/t5/task_config/wmt16_dataset.yaml |  28 +++++-
 mindformers/models/t5/t5.py               |  12 ++-
 mindformers/pet/models/__init__.py        |   1 +
 mindformers/pet/models/t5/__init__.py     |   5 +
 mindformers/pet/models/t5/t5_pet.py       | 111 ++++++++++++++++++++++
 mindformers/pet/pet_config.py             |   4 +
 mindformers/pet/tuners/prompt_adapter.py  |  68 +++++++++++++
 9 files changed, 242 insertions(+), 6 deletions(-)
 create mode 100644 mindformers/pet/models/t5/__init__.py
 create mode 100644 mindformers/pet/models/t5/t5_pet.py
 create mode 100644 mindformers/pet/tuners/prompt_adapter.py

diff --git a/configs/t5/model_config/t5_small.yaml b/configs/t5/model_config/t5_small.yaml
index 28641bfd..dc151165 100644
--- a/configs/t5/model_config/t5_small.yaml
+++ b/configs/t5/model_config/t5_small.yaml
@@ -1,6 +1,18 @@
 model:
   arch:
     type: T5ForConditionalGeneration
+    # type: T5ForConditionalGenerationWithPrompt
+    # pet:
+    #   pet_type: prompt_tuning
+    #   pet_config:
+    #     num_virtual_tokens: 20
+    #     token_dim: 768
+    #     num_transformer_submodules: 1
+    #     prompt_init: "RANDOM"
+    #     init_token_ids: [101, 16014, 2065, 15792, 1997,
+    #                      2023, 319, 2003, 3893, 1010,
+    #                      4997, 2030, 8699, 102, 0,
+    #                      0, 0, 0, 0, 0]
   model_config:
     batch_size: 1
     d_ff: 2048
diff --git a/configs/t5/task_config/runner.yaml b/configs/t5/task_config/runner.yaml
index a69f2577..88df2182 100644
--- a/configs/t5/task_config/runner.yaml
+++ b/configs/t5/task_config/runner.yaml
@@ -41,4 +41,9 @@ callbacks:
     save_checkpoint_steps: 1000
     integrated_save: True
     async_save: False
-  - type: ObsMonitor
\ No newline at end of file
+  - type: ObsMonitor
+
+metric:
+  type: BleuScore
+  n_gram: 4
+  smooth: False
\ No newline at end of file
diff --git a/configs/t5/task_config/wmt16_dataset.yaml b/configs/t5/task_config/wmt16_dataset.yaml
index 298cac78..a5917151 100644
--- a/configs/t5/task_config/wmt16_dataset.yaml
+++ b/configs/t5/task_config/wmt16_dataset.yaml
@@ -2,6 +2,7 @@ train_dataset: &train_dataset
   data_loader:
     type: WMT16DataLoader
     dataset_dir: "/your/wmt_en_ro"
+    stage: "train"
   tokenizer:
     type: t5_small
     src_max_length: 1024 # this will truncate the source input sequence
@@ -13,7 +14,7 @@ train_dataset: &train_dataset
   num_parallel_workers: 8
   python_multiprocessing: False
   drop_remainder: False
-  batch_size: 1
+  batch_size: 32
   repeat: 1
   numa_enable: False
   prefetch_size: 1
@@ -21,3 +22,28 @@ train_dataset: &train_dataset
 train_dataset_task:
   type: TranslationDataset
   dataset_config: *train_dataset
+
+eval_dataset: &eval_dataset
+  data_loader:
+    type: WMT16DataLoader
+    dataset_dir: "/your/wmt_en_ro"
+    stage: "val"
+  tokenizer:
+    type: t5_small
+    src_max_length: 1024 # this will truncate the source input sequence
+    tgt_max_length: 128
+    prefix: "translate the English to Romanian:"
+    stage: 'val'
+  input_columns: ["input_ids", "attention_mask", "labels"] # determined by the model inputs
+  output_columns: ["input_ids", "attention_mask", "labels"]
+  num_parallel_workers: 8
+  python_multiprocessing: False
+  drop_remainder: False
+  batch_size: 32
+  repeat: 1
+  numa_enable: False
+  prefetch_size: 1
+
+eval_dataset_task:
+  type: TranslationDataset
+  dataset_config: *eval_dataset
\ No newline at end of file
diff --git a/mindformers/models/t5/t5.py b/mindformers/models/t5/t5.py
index a92dbf4c..ee21056e 100644
--- a/mindformers/models/t5/t5.py
+++ b/mindformers/models/t5/t5.py
@@ -1603,13 +1603,14 @@
                   target_ids=None,
                   target_mask=None,
                   memory_mask=None,
-                  encoder_cache=None):
+                  encoder_cache=None,
+                  embedding_output=None):
         """T5Model with encoder and decoder."""
         if source_mask is None and source_ids is not None:
             source_mask = self.ones_like(source_ids)
         source_mask = self._create_attention_mask_from_input_mask(source_mask)
         if source_ids is not None:
-            encoder_output = self.encoder_forward(source_ids, source_mask)
+            encoder_output = self.encoder_forward(source_ids, source_mask, embedding_output)
         else:
             encoder_output = encoder_cache
 
@@ -1644,10 +1645,13 @@
 
         return log_probs
 
-    def encoder_forward(self, source_ids, source_mask):
+    def encoder_forward(self, source_ids, source_mask, embedding_output=None):
         """Execute the forward process"""
         # process source sentence
-        src_embedding_output, _ = self.tfm_embedding_lookup(source_ids)
+        if embedding_output is None:
+            src_embedding_output, _ = self.tfm_embedding_lookup(source_ids)
+        else:
+            src_embedding_output = embedding_output
         # attention mask [batch_size, seq_length, seq_length]
         if len(F.shape(source_mask)) == 2:
             enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask)
diff --git a/mindformers/pet/models/__init__.py b/mindformers/pet/models/__init__.py
index 25326a6f..bf40bb4d 100644
--- a/mindformers/pet/models/__init__.py
+++ b/mindformers/pet/models/__init__.py
@@ -4,3 +4,4 @@ from .gpt import *
 __all__ = []
 __all__.extend(bert.__all__)
 __all__.extend(gpt.__all__)
+__all__.extend(t5.__all__)
\ No newline at end of file
diff --git a/mindformers/pet/models/t5/__init__.py b/mindformers/pet/models/t5/__init__.py
new file mode 100644
index 00000000..4d960d9d
--- /dev/null
+++ b/mindformers/pet/models/t5/__init__.py
@@ -0,0 +1,5 @@
+from .t5_pet import (T5ModelWithPrompt,
+                     T5ForConditionalGenerationWithPrompt)
+
+__all__ = []
+__all__.extend(['T5ModelWithPrompt', 'T5ForConditionalGenerationWithPrompt'])
\ No newline at end of file
diff --git a/mindformers/pet/models/t5/t5_pet.py b/mindformers/pet/models/t5/t5_pet.py
new file mode 100644
index 00000000..c0d0b918
--- /dev/null
+++ b/mindformers/pet/models/t5/t5_pet.py
@@ -0,0 +1,111 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+T5 models with prompt tuning (PET).
+"""
+from mindspore import Tensor
+from mindspore.ops import operations as P
+import mindspore.common.dtype as mstype
+
+from mindformers.models.t5.t5_config import T5Config
+from mindformers.models.t5.t5 import T5ForConditionalGeneration, T5Model
+from mindformers.pet.tuners.pet_adapter import PetAdapter
+from mindformers.pet.tuners.prompt_adapter import PromptAdapter
+from mindformers.tools.register.register import MindFormerModuleType, MindFormerRegister
+
+__all__ = ['T5ModelWithPrompt', 'T5ForConditionalGenerationWithPrompt']
+
+
+@MindFormerRegister.register(MindFormerModuleType.MODELS)
+class T5ModelWithPrompt(T5Model):
+    def __init__(self, pet, config=None):
+        super(T5ModelWithPrompt, self).__init__(config)
+        self.pet = pet
+        self.pet_type = self.pet.pet_type
+        self.seq_length = config.seq_length
+
+        self.total_virtual_tokens = self.pet.pet_config.num_virtual_tokens \
+            * self.pet.pet_config.num_transformer_submodules
+        self.tfm_encoder = PromptAdapter.get_pet_model(self.tfm_encoder, self.pet)
+
+        self.expand_dims = P.ExpandDims()
+        self.tile = P.Tile()
+        self.prompt_tokens = Tensor(list(range(0, self.total_virtual_tokens)), dtype=mstype.int32)
+        self.ones = P.Ones()
+        self.concat = P.Concat(axis=1)
+
+    def construct(self, source_ids, source_mask, target_ids, target_mask, memory_mask, encoder_cache):
+        """T5 encoder-decoder forward with soft prompt embeddings prepended to the encoder input."""
+        # input mask: ones for the prepended virtual prompt tokens
+        batch_size = P.Shape()(source_ids)[0]
+        if source_mask is None and source_ids is not None:
+            source_mask = self.ones_like(source_ids)
+        prompt_input_mask = self.ones((batch_size, self.total_virtual_tokens), mstype.int32)
+        source_mask = self.concat((prompt_input_mask, source_mask))
+
+        # token embedding lookup
+        src_embedding_output, _ = self.tfm_embedding_lookup(source_ids)
+        # soft prompt embeddings for the virtual tokens
+        # (the prompt cell was attached to tfm_encoder by PromptAdapter.get_pet_model)
+        prompt = self.tfm_encoder._cells[self.pet_type]
+        prompt_tokens = self.tile(self.expand_dims(self.prompt_tokens, 0), (batch_size, 1))
+        prompt_output = prompt(prompt_tokens)
+        # concat prompt embeddings with the token embeddings
+        embedding_output = self.concat((prompt_output, src_embedding_output))
+
+        source_mask = self.slice(source_mask,
+                                 (0, 0),
+                                 (batch_size, self.seq_length),
+                                 (1, 1))
+
+        embedding_output = self.slice(embedding_output,
+                                      (0, 0, 0),
+                                      (batch_size, self.seq_length, self.hidden_size),
+                                      (1, 1, 1))
+
+        return super().construct(source_ids, source_mask, target_ids, target_mask, memory_mask, encoder_cache, embedding_output)
+
+    def init_prompt_from_ids(self, token_ids):
+        # check length
+        if len(token_ids) != self.total_virtual_tokens:
+            raise ValueError(f'The length of token_ids ({len(token_ids)}) should be the same as total_virtual_tokens ({self.total_virtual_tokens}).')
+
+        _, para = self.find_embedding_table(self.tfm_embedding_lookup)
+        embedding_matrix = para.value().asnumpy()
+        prompt_embedding = embedding_matrix[token_ids]
+        prompt_embedding = Tensor(prompt_embedding)
+
+        # load into prompt cell
+        _, para = self.find_embedding_table(self.tfm_encoder._cells[self.pet_type])
+        para.set_data(prompt_embedding)
+
+    def find_embedding_table(self, model):
+        for name, para in model.parameters_and_names():
+            if 'embedding_table' in name:
+                return name, para
+
+
+@MindFormerRegister.register(MindFormerModuleType.MODELS)
+class T5ForConditionalGenerationWithPrompt(T5ForConditionalGeneration):
+    def __init__(self, config: T5Config = None, pet=None, **kwargs):
+        super().__init__(config)
+        self.t5 = T5ModelWithPrompt(pet, config)
+        # load ckpt
+        self.load_checkpoint(config)
+        # freeze pretrained model
+        PetAdapter.freeze_pretrained_model(self.t5, pet.pet_type)
+        # init prompt cell with token_ids
+        if pet.pet_config.prompt_init == "TEXT" and pet.pet_config.init_token_ids is not None:
+            self.t5.init_prompt_from_ids(pet.pet_config.init_token_ids)
\ No newline at end of file
diff --git a/mindformers/pet/pet_config.py b/mindformers/pet/pet_config.py
index 570b8a57..399bd2eb 100644
--- a/mindformers/pet/pet_config.py
+++ b/mindformers/pet/pet_config.py
@@ -52,5 +52,9 @@ class PromptLearningConfig(PetConfig):
 
 
 class PrefixTuningConfig(PetConfig):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+class PromptTuningConfig(PetConfig):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
\ No newline at end of file
diff --git a/mindformers/pet/tuners/prompt_adapter.py b/mindformers/pet/tuners/prompt_adapter.py
new file mode 100644
index 00000000..97cd012e
--- /dev/null
+++ b/mindformers/pet/tuners/prompt_adapter.py
@@ -0,0 +1,68 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Note: Base Prompt Adapter algorithm for mindformers' pretrained models.
+"""
+from __future__ import absolute_import
+
+
+from mindspore import nn
+from mindspore import Tensor
+from tk.delta.prompt_tuning import PromptTuning
+from mindformers.pet.pet_config import PetConfig
+from mindformers.pet.tuners.pet_adapter import PetAdapter
+from mindformers.pet.constants import PetType
+from mindformers.auto_class import AutoTokenizer
+# from tk.delta.p_tuning import PTuning
+
+def add_prompt_cell(net, config):
+    if config.pet_type == PetType.P_TUNING:
+        # prompt = PTuning(
+        #     token_dim=config.token_dim,
+        #     encoder_hidden_size=config.encoder_hidden_size,
+        #     num_virtual_tokens=config.num_virtual_tokens,
+        #     encoder_reparameterization_type=config.encoder_reparameterization_type,
+        #     num_transformer_submodules=config.num_transformer_submodules,
+        # )
+        raise NotImplementedError('p_tuning is not supported yet')
+    elif config.pet_type == PetType.PROMPT_TUNING:
+        pet_config = config.pet_config
+        if pet_config.prompt_init in ("TEXT", "RANDOM"):
+            prompt = PromptTuning(num_virtual_tokens=pet_config.num_virtual_tokens,
+                                  token_dim=pet_config.token_dim,
+                                  num_transformer_submodules=pet_config.num_transformer_submodules,
+                                  prompt_init=pet_config.prompt_init,
+                                  init_token_ids=pet_config.init_token_ids)
+        else:
+            raise ValueError(f'prompt_init: {pet_config.prompt_init} is not supported')
+    else:
+        raise ValueError(f'pet_type: {config.pet_type} is not supported')
+    net._cells[config.pet_type] = prompt
+    return net
+
+
+class PromptAdapter(PetAdapter):
+    r"""
+    PromptAdapter is the adapter to modify the pretrained model, which uses the prompt_tuning or p_tuning algorithm.
+
+    Args:
+        model (BaseModel): The base pretrained model of mindformers.
+        pet_config (PetConfig): The configuration of the Pet model.
+ """ + @classmethod + def get_pet_model(cls, model: nn.Cell = None, config: PetConfig = None): + model = model if model else PetAdapter.get_pretrained_model(config) + model = add_prompt_cell(model, config) + return model -- Gitee From 897636c65c23264c88b68aea3461ccaf5331bba9 Mon Sep 17 00:00:00 2001 From: xinlianglalala Date: Fri, 5 May 2023 11:09:00 +0800 Subject: [PATCH 2/2] add eval of t5 translation --- configs/t5/task_config/wmt16_dataset.yaml | 4 +- .../dataset/dataloader/wmt16_dataloader.py | 2 +- mindformers/dataset/translation_dataset.py | 4 +- .../translation/translation_finetune.py | 91 ++++++++++++++++++- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/configs/t5/task_config/wmt16_dataset.yaml b/configs/t5/task_config/wmt16_dataset.yaml index a5917151..4f609a61 100644 --- a/configs/t5/task_config/wmt16_dataset.yaml +++ b/configs/t5/task_config/wmt16_dataset.yaml @@ -4,7 +4,7 @@ train_dataset: &train_dataset dataset_dir: "/your/wmt_en_ro" stage: "train" tokenizer: - type: t5_small + type: T5Tokenizer src_max_length: 1024 # this will truncate the source input sequence tgt_max_length: 128 prefix: "translate the English to Romanian:" @@ -29,7 +29,7 @@ eval_dataset: &eval_dataset dataset_dir: "/your/wmt_en_ro" stage: "val" tokenizer: - type: t5_small + type: T5Tokenizer src_max_length: 1024 # this will truncate the source input sequence tgt_max_length: 128 prefix: "translate the English to Romanian:" diff --git a/mindformers/dataset/dataloader/wmt16_dataloader.py b/mindformers/dataset/dataloader/wmt16_dataloader.py index 1bf30b04..3ccd7a08 100644 --- a/mindformers/dataset/dataloader/wmt16_dataloader.py +++ b/mindformers/dataset/dataloader/wmt16_dataloader.py @@ -106,7 +106,7 @@ class WMT16DataSet: if 'stage' != 'all': dataset_dict[stage] = read_and_add_to_stage(stage) else: - for item in ['train', 'dev', 'test']: + for item in ['train', 'val', 'test']: dataset_dict[stage] = read_and_add_to_stage(item) self.dataset_dict = dataset_dict diff --git a/mindformers/dataset/translation_dataset.py b/mindformers/dataset/translation_dataset.py index d3cc8069..a9ed299a 100644 --- a/mindformers/dataset/translation_dataset.py +++ b/mindformers/dataset/translation_dataset.py @@ -79,7 +79,9 @@ class TranslationDataset(BaseDataset): @classmethod def _tokenizer_map(cls, dataset, tokenizer_config): """Maps the tokenizer on the source and the output""" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.type) + if tokenizer_config.type == "T5Tokenizer": + yaml_name_or_path = "t5_small" + tokenizer = AutoTokenizer.from_pretrained(yaml_name_or_path) prefix = tokenizer_config.prefix src_max_length = tokenizer_config.src_max_length tgt_max_length = tokenizer_config.tgt_max_length diff --git a/mindformers/trainer/translation/translation_finetune.py b/mindformers/trainer/translation/translation_finetune.py index f1b8c8ea..ee83ddc7 100644 --- a/mindformers/trainer/translation/translation_finetune.py +++ b/mindformers/trainer/translation/translation_finetune.py @@ -13,15 +13,18 @@ # limitations under the License. 
 # ============================================================================
 """Translation Modeling Trainer."""
+import time
 import os.path
 from typing import Optional, List, Union
 
+import numpy as np
 from mindspore.train import Callback
 from mindspore.nn import TrainOneStepCell, Optimizer, Cell
 from mindspore.dataset import GeneratorDataset
 
+from mindformers.core import build_metric
 from mindformers.dataset import BaseDataset
-from mindformers.models import build_model, BaseModel, BaseTokenizer
+from mindformers.models import build_model, BaseModel, BaseTokenizer, build_tokenizer
 from mindformers.tools.logger import logger
 from mindformers.tools.utils import count_params
 from mindformers.tools.register import MindFormerRegister,\
@@ -99,9 +102,89 @@
                             optimizer=optimizer,
                             **kwargs)
 
-    def evaluate(self, *args, **kwargs):
-        raise NotImplementedError(
-            "The Translation task does not support evaluate.")
+    def evaluate(self,
+                 config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None,
+                 network: Optional[Union[Cell, BaseModel]] = None,
+                 dataset: Optional[Union[BaseDataset, GeneratorDataset]] = None,
+                 callbacks: Optional[Union[Callback, List[Callback]]] = None,
+                 compute_metrics: Optional[Union[dict, set]] = None,
+                 **kwargs):
+        r"""Evaluate task for Translation Trainer.
+        This function is used to evaluate the network.
+
+        The trainer interface is used to quickly start evaluation for a general task.
+        It also allows users to customize the network, dataset, callbacks, and compute_metrics.
+
+        Args:
+            config (Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]]):
+                The task config which is used to configure the dataset, the hyper-parameters, optimizer, etc.
+                It supports a config dict or MindFormerConfig or TrainingArguments or ConfigArguments class.
+                Default: None.
+            network (Optional[Union[Cell, BaseModel]]): The network for trainer.
+                It supports a model name or BaseModel or MindSpore Cell class.
+                Default: None.
+            dataset (Optional[Union[BaseDataset]]): The evaluate dataset.
+                It supports a real dataset path or BaseDataset class or MindSpore Dataset class.
+                Default: None.
+            callbacks (Optional[Union[Callback, List[Callback]]]): The eval callback function.
+                It supports a Callback or a list of MindSpore Callbacks.
+                Default: None.
+            compute_metrics (Optional[Union[dict, set]]): The metric of evaluating.
+                It supports a dict or set of MindSpore's Metric class.
+                Default: None.
+ """ + metric_name = "BLEU Metric" + kwargs.setdefault("metric_name", metric_name) + metric_name = kwargs.get("metric_name") + is_full_config = kwargs.get("is_full_config", False) + config = self.set_config(config, is_full_config) + + # build dataset + logger.info(".........Build Dataset For Evaluate..........") + if dataset is None: + dataset = self.create_eval_dataset() + + # build metric + logger.info(".........Build Compute Metrics For Evaluate..........") + if metric_name is None: + metric_name = self.model_name + "_metric" + compute_metrics = build_metric(config.metric) + + logger.info(".........Build Tokenizer for Evaluate..........") + tokenizer = build_tokenizer(config.eval_dataset_task.dataset_config.tokenizer) + + logger.info(".........Starting Init Evaluate Model..........") + model = build_model(config.model) + + logger.info(".........Starting Evaluate Model..........") + + for input in dataset.create_dict_iterator(): + source_ids = input["input_ids"] + attenion_mask = input["attention_mask"] + labels = input["labels"] + start_time = time.time() + + output_ids = model.generate(source_ids.asnumpy(), do_sample=config.model.model_config.do_sample, + max_length=config.model.model_config.max_decode_length) + gen_text = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + labels_l = np.asarray(labels.asnumpy()) + target_text = tokenizer.decode(labels_l, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + len_list = len(gen_text) + for i in range(len_list): + gen_list = [gen_text[i].split()] + target_list = [[target_text[i].split()]] + end_time = time.time() + avg_cost_time = (end_time - start_time) / source_ids.shape[0] + logger.info(f"every example cost time is : {avg_cost_time}") + compute_metrics.update(gen_list, target_list) + + output = compute_metrics.eval() + + logger.info("metric_name: %s", metric_name) + logger.info(output) + logger.info(".........Evaluate Over!.............") def predict(self, config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None, -- Gitee