From a60209b743baac761777f8448f7c1cc213a9c665 Mon Sep 17 00:00:00 2001
From: xinlianglalala
Date: Fri, 5 May 2023 10:48:24 +0800
Subject: [PATCH 1/2] add prompt_tuning adapter for t5

---
 configs/t5/model_config/t5_small.yaml     |  12 +++
 configs/t5/task_config/runner.yaml        |   7 +-
 configs/t5/task_config/wmt16_dataset.yaml |  28 +++++-
 mindformers/models/t5/t5.py               |  12 ++-
 mindformers/pet/models/__init__.py        |   1 +
 mindformers/pet/models/t5/__init__.py     |   5 +
 mindformers/pet/models/t5/t5_pet.py       | 111 ++++++++++++++++++++++
 mindformers/pet/pet_config.py             |   4 +
 mindformers/pet/tuners/prompt_adapter.py  |  68 +++++++++++++
 9 files changed, 242 insertions(+), 6 deletions(-)
 create mode 100644 mindformers/pet/models/t5/__init__.py
 create mode 100644 mindformers/pet/models/t5/t5_pet.py
 create mode 100644 mindformers/pet/tuners/prompt_adapter.py

diff --git a/configs/t5/model_config/t5_small.yaml b/configs/t5/model_config/t5_small.yaml
index 28641bfd..dc151165 100644
--- a/configs/t5/model_config/t5_small.yaml
+++ b/configs/t5/model_config/t5_small.yaml
@@ -1,6 +1,18 @@
 model:
   arch:
     type: T5ForConditionalGeneration
+    # type: T5ForConditionalGenerationWithPrompt
+    # pet:
+    #   pet_type: prompt_tuning
+    #   pet_config:
+    #     num_virtual_tokens: 20
+    #     token_dim: 768
+    #     num_transformer_submodules: 1
+    #     prompt_init: "RANDOM"
+    #     init_token_ids: [101, 16014, 2065, 15792, 1997,
+    #                      2023, 319, 2003, 3893, 1010,
+    #                      4997, 2030, 8699, 102, 0,
+    #                      0, 0, 0, 0, 0]
   model_config:
     batch_size: 1
     d_ff: 2048
diff --git a/configs/t5/task_config/runner.yaml b/configs/t5/task_config/runner.yaml
index a69f2577..88df2182 100644
--- a/configs/t5/task_config/runner.yaml
+++ b/configs/t5/task_config/runner.yaml
@@ -41,4 +41,9 @@ callbacks:
     save_checkpoint_steps: 1000
     integrated_save: True
     async_save: False
-  - type: ObsMonitor
\ No newline at end of file
+  - type: ObsMonitor
+
+metric:
+  type: BleuScore
+  n_gram: 4
+  smooth: False
\ No newline at end of file
diff --git a/configs/t5/task_config/wmt16_dataset.yaml b/configs/t5/task_config/wmt16_dataset.yaml
index 298cac78..a5917151 100644
--- a/configs/t5/task_config/wmt16_dataset.yaml
+++ b/configs/t5/task_config/wmt16_dataset.yaml
@@ -2,6 +2,7 @@ train_dataset: &train_dataset
   data_loader:
     type: WMT16DataLoader
     dataset_dir: "/your/wmt_en_ro"
+    stage: "train"
   tokenizer:
     type: t5_small
     src_max_length: 1024 # this will truncate the source input sequence
@@ -13,7 +14,7 @@ train_dataset: &train_dataset
   num_parallel_workers: 8
   python_multiprocessing: False
   drop_remainder: False
-  batch_size: 1
+  batch_size: 32
   repeat: 1
   numa_enable: False
   prefetch_size: 1
@@ -21,3 +22,28 @@ train_dataset: &train_dataset
 train_dataset_task:
   type: TranslationDataset
   dataset_config: *train_dataset
+
+eval_dataset: &eval_dataset
+  data_loader:
+    type: WMT16DataLoader
+    dataset_dir: "/your/wmt_en_ro"
+    stage: "val"
+  tokenizer:
+    type: t5_small
+    src_max_length: 1024 # this will truncate the source input sequence
+    tgt_max_length: 128
+    prefix: "translate the English to Romanian:"
+    stage: 'val'
+  input_columns: ["input_ids", "attention_mask", "labels"] # determined by the model inputs
+  output_columns: ["input_ids", "attention_mask", "labels"]
+  num_parallel_workers: 8
+  python_multiprocessing: False
+  drop_remainder: False
+  batch_size: 32
+  repeat: 1
+  numa_enable: False
+  prefetch_size: 1
+
+eval_dataset_task:
+  type: TranslationDataset
+  dataset_config: *eval_dataset
\ No newline at end of file
diff --git a/mindformers/models/t5/t5.py b/mindformers/models/t5/t5.py
index a92dbf4c..ee21056e 100644
--- a/mindformers/models/t5/t5.py
+++ b/mindformers/models/t5/t5.py
@@ -1603,13 +1603,14 @@
                   target_ids=None,
                   target_mask=None,
                   memory_mask=None,
-                  encoder_cache=None):
+                  encoder_cache=None,
+                  embedding_output=None):
         """T5Model with encoder and decoder."""
         if source_mask is None and source_ids is not None:
             source_mask = self.ones_like(source_ids)
         source_mask = self._create_attention_mask_from_input_mask(source_mask)
         if source_ids is not None:
-            encoder_output = self.encoder_forward(source_ids, source_mask)
+            encoder_output = self.encoder_forward(source_ids, source_mask, embedding_output)
         else:
             encoder_output = encoder_cache
 
@@ -1644,10 +1645,13 @@
 
         return log_probs
 
-    def encoder_forward(self, source_ids, source_mask):
+    def encoder_forward(self, source_ids, source_mask, embedding_output=None):
         """Execute the forward process"""
         # process source sentence
-        src_embedding_output, _ = self.tfm_embedding_lookup(source_ids)
+        if embedding_output is None:
+            src_embedding_output, _ = self.tfm_embedding_lookup(source_ids)
+        else:
+            src_embedding_output = embedding_output
         # attention mask [batch_size, seq_length, seq_length]
         if len(F.shape(source_mask)) == 2:
             enc_attention_mask = self._create_attention_mask_from_input_mask(source_mask)
diff --git a/mindformers/pet/models/__init__.py b/mindformers/pet/models/__init__.py
index 25326a6f..bf40bb4d 100644
--- a/mindformers/pet/models/__init__.py
+++ b/mindformers/pet/models/__init__.py
@@ -4,3 +4,4 @@ from .gpt import *
 __all__ = []
 __all__.extend(bert.__all__)
 __all__.extend(gpt.__all__)
+__all__.extend(t5.__all__)
\ No newline at end of file
diff --git a/mindformers/pet/models/t5/__init__.py b/mindformers/pet/models/t5/__init__.py
new file mode 100644
index 00000000..4d960d9d
--- /dev/null
+++ b/mindformers/pet/models/t5/__init__.py
@@ -0,0 +1,5 @@
+from .t5_pet import (T5ModelWithPrompt,
+                     T5ForConditionalGenerationWithPrompt)
+
+__all__ = []
+__all__.extend(['T5ModelWithPrompt', 'T5ForConditionalGenerationWithPrompt'])
\ No newline at end of file
diff --git a/mindformers/pet/models/t5/t5_pet.py b/mindformers/pet/models/t5/t5_pet.py
new file mode 100644
index 00000000..c0d0b918
--- /dev/null
+++ b/mindformers/pet/models/t5/t5_pet.py
@@ -0,0 +1,111 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+T5 models with prompt tuning (PET).
+"""
+from mindspore import Tensor
+from mindspore.ops import operations as P
+import mindspore.common.dtype as mstype
+
+from mindformers.models.t5.t5_config import T5Config
+from mindformers.models.t5.t5 import T5ForConditionalGeneration, T5Model
+from mindformers.pet.tuners.pet_adapter import PetAdapter
+from mindformers.pet.tuners.prompt_adapter import PromptAdapter
+from mindformers.tools.register.register import MindFormerModuleType, MindFormerRegister
+
+__all__ = ['T5ModelWithPrompt', 'T5ForConditionalGenerationWithPrompt']
+
+
+@MindFormerRegister.register(MindFormerModuleType.MODELS)
+class T5ModelWithPrompt(T5Model):
+    def __init__(self, pet, config=None):
+        super(T5ModelWithPrompt, self).__init__(config)
+        self.pet = pet
+        self.pet_type = self.pet.pet_type
+        self.seq_length = config.seq_length
+
+        self.total_virtual_tokens = self.pet.pet_config.num_virtual_tokens \
+            * self.pet.pet_config.num_transformer_submodules
+        self.tfm_encoder = PromptAdapter.get_pet_model(self.tfm_encoder, self.pet)
+
+        self.expand_dims = P.ExpandDims()
+        self.tile = P.Tile()
+        self.prompt_tokens = Tensor(list(range(0, self.total_virtual_tokens)), dtype=mstype.int32)
+        self.ones = P.Ones()
+        self.concat = P.Concat(axis=1)
+
+    def construct(self, source_ids, source_mask, target_ids, target_mask, memory_mask, encoder_cache):
+        """T5 encoder-decoder forward with soft prompt embeddings prepended to the encoder input."""
+        # input mask: ones for the prepended virtual prompt tokens
+        batch_size = P.Shape()(source_ids)[0]
+        if source_mask is None and source_ids is not None:
+            source_mask = self.ones_like(source_ids)
+        prompt_input_mask = self.ones((batch_size, self.total_virtual_tokens), mstype.int32)
+        source_mask = self.concat((prompt_input_mask, source_mask))
+
+        # token embedding lookup
+        src_embedding_output, _ = self.tfm_embedding_lookup(source_ids)
+        # soft prompt embeddings for the virtual tokens
+        # (the prompt cell was attached to tfm_encoder by PromptAdapter.get_pet_model)
+        prompt = self.tfm_encoder._cells[self.pet_type]
+        prompt_tokens = self.tile(self.expand_dims(self.prompt_tokens, 0), (batch_size, 1))
+        prompt_output = prompt(prompt_tokens)
+        # concat prompt embeddings with the token embeddings
+        embedding_output = self.concat((prompt_output, src_embedding_output))
+
+        source_mask = self.slice(source_mask,
+                                 (0, 0),
+                                 (batch_size, self.seq_length),
+                                 (1, 1))
+
+        embedding_output = self.slice(embedding_output,
+                                      (0, 0, 0),
+                                      (batch_size, self.seq_length, self.hidden_size),
+                                      (1, 1, 1))
+
+        return super().construct(source_ids, source_mask, target_ids, target_mask, memory_mask, encoder_cache, embedding_output)
+
+    def init_prompt_from_ids(self, token_ids):
+        # check length
+        if len(token_ids) != self.total_virtual_tokens:
+            raise ValueError(f'The length of token_ids ({len(token_ids)}) should be the same as total_virtual_tokens ({self.total_virtual_tokens}).')
+
+        _, para = self.find_embedding_table(self.tfm_embedding_lookup)
+        embedding_matrix = para.value().asnumpy()
+        prompt_embedding = embedding_matrix[token_ids]
+        prompt_embedding = Tensor(prompt_embedding)
+
+        # load into prompt cell
+        _, para = self.find_embedding_table(self.tfm_encoder._cells[self.pet_type])
+        para.set_data(prompt_embedding)
+
+    def find_embedding_table(self, model):
+        for name, para in model.parameters_and_names():
+            if 'embedding_table' in name:
+                return name, para
+
+
+@MindFormerRegister.register(MindFormerModuleType.MODELS)
+class T5ForConditionalGenerationWithPrompt(T5ForConditionalGeneration):
+    def __init__(self, config: T5Config = None, pet=None, **kwargs):
+        super().__init__(config)
+        self.t5 = T5ModelWithPrompt(pet, config)
+        # load ckpt
+        self.load_checkpoint(config)
+        # freeze pretrained model
+        PetAdapter.freeze_pretrained_model(self.t5, pet.pet_type)
+        # init prompt cell with token_ids
+        if pet.pet_config.prompt_init == "TEXT" and pet.pet_config.init_token_ids is not None:
+            self.t5.init_prompt_from_ids(pet.pet_config.init_token_ids)
\ No newline at end of file
diff --git a/mindformers/pet/pet_config.py b/mindformers/pet/pet_config.py
index 570b8a57..399bd2eb 100644
--- a/mindformers/pet/pet_config.py
+++ b/mindformers/pet/pet_config.py
@@ -52,5 +52,9 @@ class PromptLearningConfig(PetConfig):
 
 
 class PrefixTuningConfig(PetConfig):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+class PromptTuningConfig(PetConfig):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
\ No newline at end of file
diff --git a/mindformers/pet/tuners/prompt_adapter.py b/mindformers/pet/tuners/prompt_adapter.py
new file mode 100644
index 00000000..97cd012e
--- /dev/null
+++ b/mindformers/pet/tuners/prompt_adapter.py
@@ -0,0 +1,68 @@
+# Copyright 2023 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+Note: Base Prompt Adapter algorithm for mindformers' pretrained models.
+"""
+from __future__ import absolute_import
+
+
+from mindspore import nn
+from mindspore import Tensor
+from tk.delta.prompt_tuning import PromptTuning
+from mindformers.pet.pet_config import PetConfig
+from mindformers.pet.tuners.pet_adapter import PetAdapter
+from mindformers.pet.constants import PetType
+from mindformers.auto_class import AutoTokenizer
+# from tk.delta.p_tuning import PTuning
+
+def add_prompt_cell(net, config):
+    if config.pet_type == PetType.P_TUNING:
+        # prompt = PTuning(
+        #     token_dim=config.token_dim,
+        #     encoder_hidden_size=config.encoder_hidden_size,
+        #     num_virtual_tokens=config.num_virtual_tokens,
+        #     encoder_reparameterization_type=config.encoder_reparameterization_type,
+        #     num_transformer_submodules=config.num_transformer_submodules,
+        # )
+        raise NotImplementedError('p_tuning is not supported yet')
+    elif config.pet_type == PetType.PROMPT_TUNING:
+        pet_config = config.pet_config
+        if pet_config.prompt_init in ("TEXT", "RANDOM"):
+            prompt = PromptTuning(num_virtual_tokens=pet_config.num_virtual_tokens,
+                                  token_dim=pet_config.token_dim,
+                                  num_transformer_submodules=pet_config.num_transformer_submodules,
+                                  prompt_init=pet_config.prompt_init,
+                                  init_token_ids=pet_config.init_token_ids)
+        else:
+            raise ValueError(f'prompt_init: {pet_config.prompt_init} is not supported')
+    else:
+        raise ValueError(f'pet_type: {config.pet_type} is not supported')
+    net._cells[config.pet_type] = prompt
+    return net
+
+
+class PromptAdapter(PetAdapter):
+    r"""
+    PromptAdapter is the adapter to modify the pretrained model, which uses the prompt_tuning or p_tuning algorithm.
+
+    Args:
+        model (BaseModel): The base pretrained model of mindformers.
+        pet_config (PetConfig): The configuration of the Pet model.
+ """ + @classmethod + def get_pet_model(cls, model: nn.Cell = None, config: PetConfig = None): + model = model if model else PetAdapter.get_pretrained_model(config) + model = add_prompt_cell(model, config) + return model -- Gitee From 897636c65c23264c88b68aea3461ccaf5331bba9 Mon Sep 17 00:00:00 2001 From: xinlianglalala Date: Fri, 5 May 2023 11:09:00 +0800 Subject: [PATCH 2/2] add eval of t5 translation --- configs/t5/task_config/wmt16_dataset.yaml | 4 +- .../dataset/dataloader/wmt16_dataloader.py | 2 +- mindformers/dataset/translation_dataset.py | 4 +- .../translation/translation_finetune.py | 91 ++++++++++++++++++- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/configs/t5/task_config/wmt16_dataset.yaml b/configs/t5/task_config/wmt16_dataset.yaml index a5917151..4f609a61 100644 --- a/configs/t5/task_config/wmt16_dataset.yaml +++ b/configs/t5/task_config/wmt16_dataset.yaml @@ -4,7 +4,7 @@ train_dataset: &train_dataset dataset_dir: "/your/wmt_en_ro" stage: "train" tokenizer: - type: t5_small + type: T5Tokenizer src_max_length: 1024 # this will truncate the source input sequence tgt_max_length: 128 prefix: "translate the English to Romanian:" @@ -29,7 +29,7 @@ eval_dataset: &eval_dataset dataset_dir: "/your/wmt_en_ro" stage: "val" tokenizer: - type: t5_small + type: T5Tokenizer src_max_length: 1024 # this will truncate the source input sequence tgt_max_length: 128 prefix: "translate the English to Romanian:" diff --git a/mindformers/dataset/dataloader/wmt16_dataloader.py b/mindformers/dataset/dataloader/wmt16_dataloader.py index 1bf30b04..3ccd7a08 100644 --- a/mindformers/dataset/dataloader/wmt16_dataloader.py +++ b/mindformers/dataset/dataloader/wmt16_dataloader.py @@ -106,7 +106,7 @@ class WMT16DataSet: if 'stage' != 'all': dataset_dict[stage] = read_and_add_to_stage(stage) else: - for item in ['train', 'dev', 'test']: + for item in ['train', 'val', 'test']: dataset_dict[stage] = read_and_add_to_stage(item) self.dataset_dict = dataset_dict diff --git a/mindformers/dataset/translation_dataset.py b/mindformers/dataset/translation_dataset.py index d3cc8069..a9ed299a 100644 --- a/mindformers/dataset/translation_dataset.py +++ b/mindformers/dataset/translation_dataset.py @@ -79,7 +79,9 @@ class TranslationDataset(BaseDataset): @classmethod def _tokenizer_map(cls, dataset, tokenizer_config): """Maps the tokenizer on the source and the output""" - tokenizer = AutoTokenizer.from_pretrained(tokenizer_config.type) + if tokenizer_config.type == "T5Tokenizer": + yaml_name_or_path = "t5_small" + tokenizer = AutoTokenizer.from_pretrained(yaml_name_or_path) prefix = tokenizer_config.prefix src_max_length = tokenizer_config.src_max_length tgt_max_length = tokenizer_config.tgt_max_length diff --git a/mindformers/trainer/translation/translation_finetune.py b/mindformers/trainer/translation/translation_finetune.py index f1b8c8ea..ee83ddc7 100644 --- a/mindformers/trainer/translation/translation_finetune.py +++ b/mindformers/trainer/translation/translation_finetune.py @@ -13,15 +13,18 @@ # limitations under the License. 
 # ============================================================================
 """Translation Modeling Trainer."""
+import time
 import os.path
 from typing import Optional, List, Union
 
+import numpy as np
 from mindspore.train import Callback
 from mindspore.nn import TrainOneStepCell, Optimizer, Cell
 from mindspore.dataset import GeneratorDataset
 
+from mindformers.core import build_metric
 from mindformers.dataset import BaseDataset
-from mindformers.models import build_model, BaseModel, BaseTokenizer
+from mindformers.models import build_model, BaseModel, BaseTokenizer, build_tokenizer
 from mindformers.tools.logger import logger
 from mindformers.tools.utils import count_params
 from mindformers.tools.register import MindFormerRegister,\
@@ -99,9 +102,89 @@
                             optimizer=optimizer,
                             **kwargs)
 
-    def evaluate(self, *args, **kwargs):
-        raise NotImplementedError(
-            "The Translation task does not support evaluate.")
+    def evaluate(self,
+                 config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None,
+                 network: Optional[Union[Cell, BaseModel]] = None,
+                 dataset: Optional[Union[BaseDataset, GeneratorDataset]] = None,
+                 callbacks: Optional[Union[Callback, List[Callback]]] = None,
+                 compute_metrics: Optional[Union[dict, set]] = None,
+                 **kwargs):
+        r"""Evaluate task for Translation Trainer.
+        This function is used to evaluate the network.
+
+        The trainer interface is used to quickly start evaluation for a general task.
+        It also allows users to customize the network, dataset, callbacks, and compute_metrics.
+
+        Args:
+            config (Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]]):
+                The task config which is used to configure the dataset, the hyper-parameters, optimizer, etc.
+                It supports a config dict or MindFormerConfig or TrainingArguments or ConfigArguments class.
+                Default: None.
+            network (Optional[Union[Cell, BaseModel]]): The network for trainer.
+                It supports a model name or BaseModel or MindSpore Cell class.
+                Default: None.
+            dataset (Optional[Union[BaseDataset]]): The evaluate dataset.
+                It supports a real dataset path or BaseDataset class or MindSpore Dataset class.
+                Default: None.
+            callbacks (Optional[Union[Callback, List[Callback]]]): The eval callback function.
+                It supports a Callback or a list of MindSpore Callbacks.
+                Default: None.
+            compute_metrics (Optional[Union[dict, set]]): The metric of evaluating.
+                It supports a dict or set of MindSpore's Metric class.
+                Default: None.
+ """ + metric_name = "BLEU Metric" + kwargs.setdefault("metric_name", metric_name) + metric_name = kwargs.get("metric_name") + is_full_config = kwargs.get("is_full_config", False) + config = self.set_config(config, is_full_config) + + # build dataset + logger.info(".........Build Dataset For Evaluate..........") + if dataset is None: + dataset = self.create_eval_dataset() + + # build metric + logger.info(".........Build Compute Metrics For Evaluate..........") + if metric_name is None: + metric_name = self.model_name + "_metric" + compute_metrics = build_metric(config.metric) + + logger.info(".........Build Tokenizer for Evaluate..........") + tokenizer = build_tokenizer(config.eval_dataset_task.dataset_config.tokenizer) + + logger.info(".........Starting Init Evaluate Model..........") + model = build_model(config.model) + + logger.info(".........Starting Evaluate Model..........") + + for input in dataset.create_dict_iterator(): + source_ids = input["input_ids"] + attenion_mask = input["attention_mask"] + labels = input["labels"] + start_time = time.time() + + output_ids = model.generate(source_ids.asnumpy(), do_sample=config.model.model_config.do_sample, + max_length=config.model.model_config.max_decode_length) + gen_text = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + labels_l = np.asarray(labels.asnumpy()) + target_text = tokenizer.decode(labels_l, skip_special_tokens=True, clean_up_tokenization_spaces=True) + + len_list = len(gen_text) + for i in range(len_list): + gen_list = [gen_text[i].split()] + target_list = [[target_text[i].split()]] + end_time = time.time() + avg_cost_time = (end_time - start_time) / source_ids.shape[0] + logger.info(f"every example cost time is : {avg_cost_time}") + compute_metrics.update(gen_list, target_list) + + output = compute_metrics.eval() + + logger.info("metric_name: %s", metric_name) + logger.info(output) + logger.info(".........Evaluate Over!.............") def predict(self, config: Optional[Union[dict, MindFormerConfig, ConfigArguments, TrainingArguments]] = None, -- Gitee