diff --git a/mindspore/core/ops/decoder_layer.cc b/mindspore/core/ops/decoder_layer.cc
index 91d725c5b1522b24be5515fa767e3078087019a7..47a9e3e800442ee5319fd023f4accad32fa6376e 100644
--- a/mindspore/core/ops/decoder_layer.cc
+++ b/mindspore/core/ops/decoder_layer.cc
@@ -43,6 +43,9 @@ void DecoderLayer::set_eps_layernorm2(float eps_layernorm2) {
 void DecoderLayer::set_eps_layernorm3(float eps_layernorm3) {
   (void)this->AddAttr(kDecoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3));
 }
+void DecoderLayer::set_eps_layernorm4(float eps_layernorm4) {
+  (void)this->AddAttr(kDecoderLayerEpsLayerNorm4, api::MakeValue(eps_layernorm4));
+}
 void DecoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) {
   (void)this->AddAttr(kDecoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size));
 }
@@ -55,6 +58,8 @@ void DecoderLayer::set_position_bias2(bool position_bias2) {
 void DecoderLayer::set_scale1(float scale1) { (void)this->AddAttr(kDecoderLayerScale1, api::MakeValue(scale1)); }
 void DecoderLayer::set_scale2(float scale2) { (void)this->AddAttr(kDecoderLayerScale2, api::MakeValue(scale2)); }
 void DecoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type)); }
+void DecoderLayer::set_layer_norm(bool layer_norm) { (void)this->AddAttr(kLayerNorm, api::MakeValue(layer_norm)); }
+
 int64_t DecoderLayer::get_head_num() const {
   auto value_ptr = this->GetAttr(kDecoderLayerNumHeads);
   return GetValue<int64_t>(value_ptr);
@@ -81,6 +86,10 @@ float DecoderLayer::get_eps_layernorm3() const {
   auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm3);
   return GetValue<float>(value_ptr);
 }
+float DecoderLayer::get_eps_layernorm4() const {
+  auto value_ptr = this->GetAttr(kDecoderLayerEpsLayerNorm4);
+  return GetValue<float>(value_ptr);
+}
 int64_t DecoderLayer::get_ffn_hidden_size() const {
   auto value_ptr = this->GetAttr(kDecoderLayerFfnHiddenSize);
   return GetValue<int64_t>(value_ptr);
@@ -108,22 +117,27 @@ ActType DecoderLayer::get_act_type() const {
   }
   return ActType(GetValue<int64_t>(value_ptr));
 }
-
+bool DecoderLayer::get_layer_norm() const {
+  auto value_ptr = this->GetAttr(kLayerNorm);
+  return GetValue<bool>(value_ptr);
+}
 void DecoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2,
-                        float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias1, bool position_bias2,
-                        bool post_layernorm, float scale1, float scale2, ActType act_type) {
+                        float eps_layernorm3, float eps_layernorm4, int64_t ffn_hidden_size, bool position_bias1,
+                        bool position_bias2, bool post_layernorm, float scale1, float scale2, ActType act_type,
+                        bool layer_norm) {
   this->set_head_num(head_num);
   this->set_head_size(head_size);
   this->set_post_layernorm(post_layernorm);
   this->set_eps_layernorm1(eps_layernorm1);
   this->set_eps_layernorm2(eps_layernorm2);
   this->set_eps_layernorm3(eps_layernorm3);
+  this->set_eps_layernorm4(eps_layernorm4);
   this->set_ffn_hidden_size(ffn_hidden_size);
   this->set_position_bias1(position_bias1);
   this->set_position_bias2(position_bias2);
   this->set_act_type(act_type);
   this->set_scale1(scale1);
   this->set_scale2(scale2);
+  this->set_layer_norm(layer_norm);
 }
 REGISTER_PRIMITIVE_C(kNameDecoderLayer, DecoderLayer);
 }  // namespace mindspore::ops
diff --git a/mindspore/core/ops/decoder_layer.h b/mindspore/core/ops/decoder_layer.h
index b196689eb2f37d575abd34409e2f7f070439a93e..b0e8827d18b84018d4122b2175287eee6c781064 100644
--- a/mindspore/core/ops/decoder_layer.h
+++ b/mindspore/core/ops/decoder_layer.h
@@ -64,39 +64,45 @@ class MIND_API DecoderLayer : public BaseOperator {
   /// \param[in] eps_layernorm1 Define eps layernorm1.
   /// \param[in] eps_layernorm2 Define eps layernorm2.
   /// \param[in] eps_layernorm3 Define eps layernorm3.
+  /// \param[in] eps_layernorm4 Define eps layernorm4.
   /// \param[in] ffn_hidden_size Define ffn hidden size.
   /// \param[in] position_bias1 Define position_bias1.
   /// \param[in] position_bias2 Define position_bias2.
   /// \param[in] scale1 Define scale1.
   /// \param[in] scale2 Define scale2.
   /// \param[in] act_type Define act_type.
-  void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3,
+  /// \param[in] layer_norm Define layer_norm.
+  void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3,
+            float eps_layernorm4,
             int64_t ffn_hidden_size, bool position_bias1, bool position_bias2, bool post_layernorm, float scale1 = 1.0f,
-            float scale2 = 1.0f, ActType act_type = ActType::ActType_Gelu);
+            float scale2 = 1.0f, ActType act_type = ActType::ActType_Gelu, bool layer_norm = false);
   void set_head_num(int64_t head_num);
   void set_head_size(int64_t head_size);
   void set_post_layernorm(bool post_layernorm);
   void set_eps_layernorm1(float eps_layernorm1);
   void set_eps_layernorm2(float eps_layernorm2);
-  void set_eps_layernorm3(float eps_layernorm2);
+  void set_eps_layernorm3(float eps_layernorm3);
+  void set_eps_layernorm4(float eps_layernorm4);
   void set_ffn_hidden_size(int64_t ffn_hidden_size);
   void set_position_bias1(bool position_bias1);
   void set_position_bias2(bool position_bias2);
   void set_scale1(float scale1);
   void set_scale2(float scale2);
   void set_act_type(ActType act_type);
+  void set_layer_norm(bool layer_norm);
   int64_t get_head_num() const;
   int64_t get_head_size() const;
   bool get_post_layernorm() const;
   float get_eps_layernorm1() const;
   float get_eps_layernorm2() const;
   float get_eps_layernorm3() const;
+  float get_eps_layernorm4() const;
   int64_t get_ffn_hidden_size() const;
   bool get_position_bias1() const;
   bool get_position_bias2() const;
   float get_scale1() const;
   float get_scale2() const;
   ActType get_act_type() const;
+  bool get_layer_norm() const;
 };
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/encoder_layer.cc b/mindspore/core/ops/encoder_layer.cc
index 276d10de2238aa95c752c2c90c0b6c2a234be503..06dd3ae6630c4760b0596a44bd37663b1cbce65e 100644
--- a/mindspore/core/ops/encoder_layer.cc
+++ b/mindspore/core/ops/encoder_layer.cc
@@ -40,6 +40,9 @@ void EncoderLayer::set_eps_layernorm1(float eps_layernorm1) {
 void EncoderLayer::set_eps_layernorm2(float eps_layernorm2) {
   (void)this->AddAttr(kEncoderLayerEpsLayerNorm2, api::MakeValue(eps_layernorm2));
 }
+void EncoderLayer::set_eps_layernorm3(float eps_layernorm3) {
+  (void)this->AddAttr(kEncoderLayerEpsLayerNorm3, api::MakeValue(eps_layernorm3));
+}
 void EncoderLayer::set_ffn_hidden_size(int64_t ffn_hidden_size) {
   (void)this->AddAttr(kEncoderLayerFfnHiddenSize, api::MakeValue(ffn_hidden_size));
 }
@@ -47,12 +50,13 @@ void EncoderLayer::set_position_bias(bool position_bias) {
   (void)this->AddAttr(kPositionBias, api::MakeValue(position_bias));
 }
 void EncoderLayer::set_scale(float scale) { (void)this->AddAttr(kScale, api::MakeValue(scale)); }
+void EncoderLayer::set_layer_norm(bool layer_norm) { (void)this->AddAttr(kLayerNorm, api::MakeValue(layer_norm)); }
+
 void EncoderLayer::set_act_type(ActType act_type) { (void)this->AddAttr(kActivationType, api::MakeValue(act_type)); }
 int64_t EncoderLayer::get_head_num() const {
   auto value_ptr = this->GetAttr(kEncoderLayerNumHeads);
   return GetValue<int64_t>(value_ptr);
 }
-
 int64_t EncoderLayer::get_head_size() const {
   auto value_ptr = this->GetAttr(kEncoderLayerSizePerHead);
   return GetValue<int64_t>(value_ptr);
@@ -70,6 +74,10 @@ float EncoderLayer::get_eps_layernorm2() const {
   auto value_ptr = this->GetAttr(kEncoderLayerEpsLayerNorm2);
   return GetValue<float>(value_ptr);
 }
+float EncoderLayer::get_eps_layernorm3() const {
+  auto value_ptr = this->GetAttr(kEncoderLayerEpsLayerNorm3);
+  return GetValue<float>(value_ptr);
+}
 int64_t EncoderLayer::get_ffn_hidden_size() const {
   auto value_ptr = this->GetAttr(kEncoderLayerFfnHiddenSize);
   return GetValue<int64_t>(value_ptr);
@@ -89,18 +97,24 @@ ActType EncoderLayer::get_act_type() const {
   }
   return ActType(GetValue<int64_t>(value_ptr));
 }
-void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2,
-                        int64_t ffn_hidden_size, bool position_bias, bool post_layernorm, float scale,
-                        ActType act_type) {
+bool EncoderLayer::get_layer_norm() const {
+  auto value_ptr = this->GetAttr(kLayerNorm);
+  return GetValue<bool>(value_ptr);
+}
+void EncoderLayer::Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2,
+                        float eps_layernorm3, int64_t ffn_hidden_size, bool position_bias, bool post_layernorm,
+                        float scale, ActType act_type, bool layer_norm) {
   this->set_head_num(head_num);
   this->set_head_size(head_size);
   this->set_post_layernorm(post_layernorm);
   this->set_eps_layernorm1(eps_layernorm1);
   this->set_eps_layernorm2(eps_layernorm2);
+  this->set_eps_layernorm3(eps_layernorm3);
   this->set_ffn_hidden_size(ffn_hidden_size);
   this->set_position_bias(position_bias);
   this->set_act_type(act_type);
   this->set_scale(scale);
+  this->set_layer_norm(layer_norm);
 }
 REGISTER_PRIMITIVE_C(kNameEncoderLayer, EncoderLayer);
 }  // namespace mindspore::ops
diff --git a/mindspore/core/ops/encoder_layer.h b/mindspore/core/ops/encoder_layer.h
index b0466be467af79f7d7cbea4ac5916ca8dc24a9b8..dd7794d5ed4c428c6258d8e625c57c517d89b330 100644
--- a/mindspore/core/ops/encoder_layer.h
+++ b/mindspore/core/ops/encoder_layer.h
@@ -42,30 +42,36 @@ class MIND_API EncoderLayer : public BaseOperator {
   /// \param[in] head_size Define size per head.
   /// \param[in] eps_layernorm1 Define eps layernorm1.
   /// \param[in] eps_layernorm2 Define eps layernorm2.
+  /// \param[in] eps_layernorm3 Define eps layernorm3.
   /// \param[in] ffn_hidden_size Define ffn hidden size.
   /// \param[in] position_bias Define position_bias.
   /// \param[in] scale Define scale.
   /// \param[in] act_type Define act_type.
-  void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, int64_t ffn_hidden_size,
-            bool position_bias, bool post_layernorm, float scale = 1.0f, ActType act_type = ActType::ActType_Gelu);
+  /// \param[in] layer_norm Define layer_norm.
+  void Init(int64_t head_num, int64_t head_size, float eps_layernorm1, float eps_layernorm2, float eps_layernorm3,
+            int64_t ffn_hidden_size, bool position_bias, bool post_layernorm, float scale = 1.0f,
+            ActType act_type = ActType::ActType_Gelu, bool layer_norm = false);
   void set_head_num(int64_t head_num);
   void set_head_size(int64_t head_size);
   void set_post_layernorm(bool post_layernorm);
   void set_eps_layernorm1(float eps_layernorm1);
   void set_eps_layernorm2(float eps_layernorm2);
+  void set_eps_layernorm3(float eps_layernorm3);
   void set_ffn_hidden_size(int64_t ffn_hidden_size);
   void set_position_bias(bool position_bias);
   void set_scale(float scale);
   void set_act_type(ActType act_type);
+  void set_layer_norm(bool layer_norm);
   int64_t get_head_num() const;
   int64_t get_head_size() const;
   bool get_post_layernorm() const;
   float get_eps_layernorm1() const;
   float get_eps_layernorm2() const;
+  float get_eps_layernorm3() const;
   int64_t get_ffn_hidden_size() const;
   bool get_position_bias() const;
   float get_scale() const;
   ActType get_act_type() const;
+  bool get_layer_norm() const;
 };
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h
index caf14dc257aa0ddb524c31041716754b1661f3bb..f54e7ae6e3285117162cbe7ff3f526d48a2842f7 100644
--- a/mindspore/core/ops/op_name.h
+++ b/mindspore/core/ops/op_name.h
@@ -385,6 +385,7 @@ constexpr auto kEncoderLayerPostLayernorm = "post_layernorm";
 constexpr auto kEncoderLayerFfnHiddenSize = "ffn_hidden_size";
 constexpr auto kEncoderLayerEpsLayerNorm1 = "eps_layernorm1";
 constexpr auto kEncoderLayerEpsLayerNorm2 = "eps_layernorm2";
+constexpr auto kEncoderLayerEpsLayerNorm3 = "eps_layernorm3";
 constexpr auto kDecoderLayerNumHeads = "head_num";
 constexpr auto kDecoderLayerSizePerHead = "head_size";
 constexpr auto kDecoderLayerPostLayernorm = "post_layernorm";
@@ -392,8 +393,10 @@ constexpr auto kDecoderLayerFfnHiddenSize = "ffn_hidden_size";
 constexpr auto kDecoderLayerEpsLayerNorm1 = "eps_layernorm1";
 constexpr auto kDecoderLayerEpsLayerNorm2 = "eps_layernorm2";
 constexpr auto kDecoderLayerEpsLayerNorm3 = "eps_layernorm3";
+constexpr auto kDecoderLayerEpsLayerNorm4 = "eps_layernorm4";
 constexpr auto kDecoderLayerPositionBias1 = "position_bias1";
 constexpr auto kDecoderLayerPositionBias2 = "position_bias2";
+constexpr auto kLayerNorm = "layer_norm";
 constexpr auto kDecoderLayerScale1 = "scale1";
 constexpr auto kDecoderLayerScale2 = "scale2";
 constexpr auto kPositionBias = "position_bias";
diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
index 22f0e43a091785902996e1679beec86cc89701fd..ef93f6be387ea13a94b845d73d5408ef28189b91 100755
--- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
+++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/decoder_tensorrt.cc
@@ -29,6 +29,7 @@
 #include "src/fastertransformer/utils/cuda_utils.h"
 #include "src/fastertransformer/utils/allocator.h"
 #include "src/fastertransformer/kernels/layernorm_kernels.h"
+#include "src/extendrt/delegate/tensorrt/tensorrt_utils.h"
 namespace mindspore::lite {
 namespace {
@@ -37,7 +38,7 @@ constexpr std::size_t kTwo = 2;
 int DecoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector<TensorInfo> &in_tensors,
                                const std::vector<TensorInfo> &out_tensors) {
-  if (in_tensors.size() != C23NUM && in_tensors.size() != C16NUM) {
+  if (in_tensors.size() != C23NUM && in_tensors.size() != C16NUM && in_tensors.size() != C17NUM) {
MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } @@ -99,6 +100,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { } fastertransformer::decoderParamRun params; cublasHandle_t cublas_handle = GetCublasHandle(); + params.common_param.eft = false; params.common_param.cublas_handle = cublas_handle; params.common_param.head_num = decoder_op->get_head_num(); params.common_param.head_size = decoder_op->get_head_size(); @@ -107,6 +109,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.decoder.eps1 = decoder_op->get_eps_layernorm1(); params.decoder.eps2 = decoder_op->get_eps_layernorm2(); params.decoder.eps3 = decoder_op->get_eps_layernorm3(); + params.decoder.eps4 = decoder_op->get_eps_layernorm4(); params.ffn_param.ffn_param.ffn_hidden_size = decoder_op->get_ffn_hidden_size(); params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; params.ffn_param.ffn_param.act_type = (fastertransformer::ActType)(decoder_op->get_act_type()); @@ -124,6 +127,7 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { params.attn2.attn.scale = decoder_op->get_scale2(); params.attn2.attn.mask = true; params.decoder.has_beta = !params.attn1.attn.position_bias; + params.decoder.is_layernorm = decoder_op->get_layer_norm(); auto compute_type = runtime_->GetRuntimePrecisionMode(); if (is_ffn_fp16_) { size_t start_fp16 = (params.attn1.attn.position_bias) ? C13NUM : C18NUM; @@ -144,11 +148,16 @@ int DecoderTensorRT::AddInnerOp(TensorRTContext *ctx) { nvinfer1::ITensor *input_tensor = input(ctx, 0).trt_tensor_; auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); - nvinfer1::ITensor *inputTensors[input_number]; + nvinfer1::ITensor *inputTensors[input_number + C2NUM]; for (int i = 0; i < input_number; i++) { inputTensors[i] = input(ctx, i).trt_tensor_; } - nvinfer1::IPluginV2Layer *decoder_layer = ctx->network()->addPluginV2(inputTensors, input_number, *plugin); + + auto network_input1 = ctx->network()->getInput(0); + auto network_input2 = ctx->network()->getInput(C3NUM); + inputTensors[input_number] = network_input1; + inputTensors[input_number + C1NUM] = network_input2; + nvinfer1::IPluginV2Layer *decoder_layer = ctx->network()->addPluginV2(inputTensors, input_number + C2NUM, *plugin); if (decoder_layer == nullptr) { MS_LOG(ERROR) << "add decoder op failed for TensorRT."; return RET_ERROR; @@ -197,8 +206,9 @@ int DecoderPlugin::RunCudaDecoder(const nvinfer1::PluginTensorDesc *inputDesc, bool DecoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs, int nbOutputs) noexcept { auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; - for (int i = 0; i < pos; i++) { - if (tensorsDesc[pos].type != tensorsDesc[i].type) return false; + if (pos == nbInputs - 1 || pos == nbInputs - 2) { + bool res = (tensorsDesc[pos].type == nvinfer1::DataType::kINT32)? 
true:false; + return res; } bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type); return res; @@ -208,7 +218,7 @@ void DecoderPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) noexcept { const int request_batch_size = static_cast(in[0].desc.dims.d[0]); const int request_src_seq_len = static_cast(in[0].desc.dims.d[1]); - const int request_tgt_seq_len = request_src_seq_len; + const int request_tgt_seq_len = params_.attn1.attn.position_bias ? static_cast(in[10].desc.dims.d[2]) : static_cast(in[14].desc.dims.d[2]); params_.common_param.batch_size = request_batch_size; params_.common_param.src_seq_len = request_src_seq_len; params_.common_param.tgt_seq_len = request_tgt_seq_len; diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc index 7eb3653cceb163a858b4f673208f523257b6f1aa..b7b5e29b9e2269545cc4e995ccb7d982cb2bd43d 100755 --- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc +++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.cc @@ -39,7 +39,7 @@ constexpr std::size_t kTwo = 2; int EncoderTensorRT::IsSupport(const BaseOperatorPtr &base_operator, const std::vector &in_tensors, const std::vector &out_tensors) { - if (in_tensors.size() != C14NUM && in_tensors.size() != C9NUM && in_tensors.size() != C13NUM) { + if (in_tensors.size() != C14NUM && in_tensors.size() != C11NUM && in_tensors.size() != C9NUM && in_tensors.size() != C10NUM && in_tensors.size() != C13NUM) { MS_LOG(ERROR) << "Unsupported input tensor size, size is " << in_tensors.size(); return RET_ERROR; } @@ -101,19 +101,21 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { MS_LOG(ERROR) << "op action convert failed"; return RET_ERROR; } - cublasHandle_t cublas_handle = GetCublasHandle(); fastertransformer::encoderParamRun params; + cublasHandle_t cublas_handle = GetCublasHandle(); //update commonparam + params.common_param.eft = false; params.common_param.cublas_handle =cublas_handle; params.common_param.head_num = encoder_op->get_head_num(); params.common_param.head_size = encoder_op->get_head_size(); params.common_param.hidden_size = params.common_param.head_num * params.common_param.head_size; //connect commonparam to attention and ffn - //update encoder_param_ + params.encoder.is_layernorm = encoder_op->get_layer_norm(); params.encoder.layernorm_post = encoder_op->get_post_layernorm(); params.encoder.eps1 = encoder_op->get_eps_layernorm1(); params.encoder.eps2 = encoder_op->get_eps_layernorm2(); + params.encoder.eps3 = encoder_op->get_eps_layernorm3(); params.ffn_param.ffn_param.ffn_hidden_size = encoder_op->get_ffn_hidden_size(); params.ffn_param.ffn_param.ffn_fp16 = is_ffn_fp16_; params.attn.attn.is_cross = false; @@ -150,11 +152,14 @@ int EncoderTensorRT::AddInnerOp(TensorRTContext *ctx) { auto plugin = std::make_shared(input_tensor->getName(), compute_type, params, device_id_); const int input_number = inputs().size(); - nvinfer1::ITensor *inputTensors[input_number]; + nvinfer1::ITensor *inputTensors[input_number+1]; for (int i = 0; i < input_number; i++) { inputTensors[i] = input(ctx, i).trt_tensor_; } - nvinfer1::IPluginV2Layer *encoder_layer = ctx->network()->addPluginV2(inputTensors, input_number, *plugin); + auto network_input = ctx->network()->getInput(0); + // fastertransformer::printTensor((char*)"network_input",(int*)network_input,5); + 
+  inputTensors[input_number] = network_input;
+  nvinfer1::IPluginV2Layer *encoder_layer = ctx->network()->addPluginV2(inputTensors, input_number + 1, *plugin);
   if (encoder_layer == nullptr) {
     MS_LOG(ERROR) << "add encoder op failed for TensorRT.";
     return RET_ERROR;
@@ -204,8 +209,9 @@ int EncoderPlugin::RunCudaEncoder(const nvinfer1::PluginTensorDesc *inputDesc,
 bool EncoderPlugin::supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc *tensorsDesc, int nbInputs,
                                               int nbOutputs) noexcept {
   auto type = (compute_type_ == RuntimePrecisionMode_FP16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
-  for (int i = 0; i < pos; i++) {
-    if (tensorsDesc[pos].type != tensorsDesc[i].type) return false;
+  if (pos == nbInputs - 1) {
+    bool res = (tensorsDesc[pos].type == nvinfer1::DataType::kINT32);
+    return res;
   }
   bool res = (tensorsDesc[pos].format == nvinfer1::TensorFormat::kLINEAR) && (tensorsDesc[pos].type == type);
   return res;
diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h
index ae6133c0ef40373b1cd58769f61175fe3b300da8..555c97acd5bcef08ebca00fed8806f22a88c872b 100644
--- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h
+++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/encoder_tensorrt.h
@@ -23,6 +23,7 @@
 #include "src/extendrt/delegate/tensorrt/op/tensorrt_plugin.h"
 #include "src/extendrt/delegate/tensorrt/cuda_impl/cudnn_utils.h"
 #include "src/fastertransformer/layers/ms_layers/encoder.h"
+#include "src/fastertransformer/layers/ms_layers/debug_utils.h"
 namespace mindspore::lite {
 class EncoderTensorRT : public TensorRTOp {
diff --git a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc
index 75cb121d986924b271f08975724ddd28f7b3cf85..03abb00bca3be6f1257635565f5975c9f864fb5b 100644
--- a/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc
+++ b/mindspore/lite/src/extendrt/delegate/tensorrt/op/mha_tensorrt.cc
@@ -131,7 +131,7 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi
   const int attn_mask_tensor_idx = 7 + cross_tensor_offset;
   const int bias_qkv_tensor_idx = 5 + cross_tensor_offset;
   const int weight_qkv_tensor_idx = 3;
-  const int position_bias_tensor_idx = 6 + cross_tensor_offset;
+  const int position_bias_tensor_idx = 5 + cross_tensor_offset;
   common_param_.algo = algoId;
   common_param_.stream = stream;
   void *inputs_attn[num_of_inputs_];
@@ -148,8 +148,8 @@ int MhaPlugin::RunCudaMha(const nvinfer1::PluginTensorDesc *inputDesc, const nvi
     inputs_attn[index++] = const_cast<void *>(inputs[bias_qkv_tensor_idx]);
   }
   if (params_.attn.position_bias) {
+    inputs_attn[index++] = const_cast<void *>(inputs[attn_mask_tensor_idx - C1NUM]);
     inputs_attn[index++] = const_cast<void *>(inputs[position_bias_tensor_idx]);
-    inputs_attn[index++] = const_cast<void *>(inputs[attn_mask_tensor_idx - C2NUM]);
   } else {
     inputs_attn[index++] = const_cast<void *>(inputs[attn_mask_tensor_idx]);
   }
@@ -186,6 +186,10 @@ void MhaPlugin::configurePlugin(const nvinfer1::DynamicPluginTensorDesc *in, int
   common_param_.batch_size = request_batch_size;
   common_param_.src_seq_len = request_src_seq_len;
   common_param_.tgt_seq_len = request_tgt_seq_len;
+  common_param_.h_token_num = common_param_.batch_size * common_param_.src_seq_len;
+  common_param_.h_token_num2 = common_param_.batch_size * common_param_.tgt_seq_len;
+  params_.attn.padding_offset = nullptr;
+  params_.attn.padding_offset2 =
nullptr; num_of_inputs_ = nbInputs; num_of_outputs_ = nbOutputs; } diff --git a/mindspore/lite/tools/benchmark/benchmark_base.h b/mindspore/lite/tools/benchmark/benchmark_base.h index 1511c0cf719c2439e2ced66d3d57132ea73c62db..e5820b421730e4a01cfed20a73c4f07345900107 100644 --- a/mindspore/lite/tools/benchmark/benchmark_base.h +++ b/mindspore/lite/tools/benchmark/benchmark_base.h @@ -329,6 +329,7 @@ class MS_API BenchmarkBase { auto tolerance = absoluteTolerance + relativeTolerance * fabs(calibTensor->data.at(j)); auto absoluteError = std::fabs(msTensorData[j] - calibTensor->data.at(j)); + std::cout<<"j = "<("gamma4"); + MS_CHECK_TRUE_RET(gamma4_ != nullptr, false); + beta4_ = std::make_shared("beta4"); + MS_CHECK_TRUE_RET(beta4_ != nullptr, false); weight_attn_qkv_ = std::make_shared("weight_attn_qkv"); MS_CHECK_TRUE_RET(weight_attn_qkv_ != nullptr, false); weight_attn_q_ = std::make_shared("weight_attn_q_"); @@ -100,6 +104,8 @@ bool DecoderLayerFusion::Init() const { MS_CHECK_TRUE_RET(eps2_ != nullptr, false); eps3_ = std::make_shared("eps3_"); MS_CHECK_TRUE_RET(eps3_ != nullptr, false); + eps4_ = std::make_shared("eps4_"); + MS_CHECK_TRUE_RET(eps4_ != nullptr, false); return true; } @@ -144,9 +150,8 @@ VectorRef DecoderLayerFusion::DefineLayerNorm(VectorRef input, VarPtr gamma, Var auto mul = VectorRef({is_mul, real_div, gamma}); return mul; } - VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false, bool mask = true) const { + bool is_position_bias = false, bool mask = true, bool is_layer_norm=false) const { auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-decoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); auto var1 = std::make_shared("var1-reshape"); @@ -166,7 +171,6 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr } if (mask) inputs.push_back(mask_); auto attention = VectorRef(inputs); - // return attention; if (is_position_bias) { tuple4 = attention; } else { @@ -242,7 +246,10 @@ VectorRef DecoderLayerFusion::DefinePatternDecoderLayer(bool post_layernorm = tr auto reshape4 = (post_layernorm) ? 
VectorRef({is_reshape4, tuple3, var4}) : VectorRef({is_reshape4, add3, var4});
   auto is_add4 = std::make_shared<CondVar>(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add4");
   auto add4 = VectorRef({is_add4, reshape4, reshape3});
-  return add4;
+  if (is_layer_norm) {
+    return DefineLayerNorm(add4, gamma4_, beta4_, eps4_);
+  }
+  return add4;
 }
 std::unordered_map<std::string, VectorRef> DecoderLayerFusion::DefinePatterns() const {
@@ -251,6 +258,7 @@
     MS_LOG(ERROR) << "initial member failed.";
     return patterns;
   }
+  patterns[kPatternDecoderLayerNormT5Pre] = DefinePatternDecoderLayer(false, false, true, true, true);
   patterns[kPatternDecoderLayerPre] = DefinePatternDecoderLayer(false, true, false, true);
   patterns[kPatternDecoderLayerPost] = DefinePatternDecoderLayer(true, true, false, true);
   patterns[kPatternDecoderLayerNormPre] = DefinePatternDecoderLayer(false, false, false, true);
@@ -265,9 +273,13 @@ AnfNodePtr DecoderLayerFusion::Process(const std::string &pattern_name, const mi
   if (func_graph == nullptr || node == nullptr || equiv == nullptr) {
     return nullptr;
   }
-  if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post) {
+  if (pattern_name == kPatternDecoderT5Pre || pattern_name == kPatternDecoderT5Post ||
+      pattern_name == kPatternDecoderLayerNormT5Pre) {
     is_position_bias_ = true;
   }
+  is_layernorm_ = false;
+  if (pattern_name == kPatternDecoderLayerNormT5Pre) {
+    is_layernorm_ = true;
+  }
   if (pattern_name == kPatternDecoderLayerPre || pattern_name == kPatternDecoderLayerPost) {
     is_layernorm_fusion_ = true;
   }
@@ -342,7 +354,7 @@ STATUS DecoderLayerFusion::GetEps(const EquivPtr &equiv, VarPtr node_name, float
 }
 STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num,
-                                        int *head_size, float *eps1, float *eps2, float *eps3, bool *is_position_bias1,
+                                        int *head_size, float *eps1, float *eps2, float *eps3, float *eps4,
+                                        bool *is_position_bias1,
                                         bool *is_position_bias2, float *scale1, float *scale2) const {
   auto attn_input = GetAttribute(func_graph, equiv, is_attention_);
   MS_ASSERT(attn_input != nullptr);
@@ -401,6 +413,13 @@ STATUS DecoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq
       MS_LOG(ERROR) << "not found eps3";
       return RET_ERROR;
     }
+    if (is_layernorm_) {
+      if (GetEps(equiv, eps4_, eps4) != RET_OK) {
+        MS_LOG(ERROR) << "not found eps4";
+        return RET_ERROR;
+      }
+    }
   }
   if (!is_position_bias_) {
     if (!IsActGELU(func_graph, equiv)) {
@@ -425,16 +444,17 @@ std::shared_ptr<ops::DecoderLayer> DecoderLayerFusion::CreatePrim(const FuncGrap
   float eps1 = 1e-6;
   float eps2 = 1e-6;
   float eps3 = 1e-6;
+  float eps4 = 1e-6;
   bool is_position_bias1 = false;
   bool is_position_bias2 = false;
   float scale1 = 1.0f;
   float scale2 = 1.0f;
-  if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &is_position_bias1,
+  if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &eps4, &is_position_bias1,
                    &is_position_bias2, &scale1, &scale2)) {
     return nullptr;
   }
-  decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias1,
-                           is_position_bias2, post_layernorm, scale1, scale2, act_type_);
+  decoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, eps4, ffn_hidden_size, is_position_bias1,
+                           is_position_bias2, post_layernorm, scale1, scale2, act_type_, is_layernorm_);
   return decoder_layer_prim;
 }
@@ -449,7 +469,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP
   auto encoder_output = utils::cast<AnfNodePtr>((*equiv)[encoder_output_]);
   MS_ASSERT(encoder_output != nullptr);
   AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, beta3,
-    bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross;
+    bias_attn_cross_qkv, bias_attn_cross_o, position_bias_cross, gamma4, beta4;
   auto weight_qkv = utils::cast<AnfNodePtr>((*equiv)[weight_attn_qkv_]);
   auto weight_attn_o = utils::cast<AnfNodePtr>((*equiv)[weight_attn_o_]);
   auto weight_attn_q = utils::cast<AnfNodePtr>((*equiv)[weight_attn_q_]);
@@ -470,10 +490,15 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP
     beta1 = utils::cast<AnfNodePtr>((*equiv)[beta1_]);
     beta2 = utils::cast<AnfNodePtr>((*equiv)[beta2_]);
     beta3 = utils::cast<AnfNodePtr>((*equiv)[beta3_]);
+    if (is_layernorm_) {
+      beta4 = utils::cast<AnfNodePtr>((*equiv)[beta4_]);
+    }
   }
   auto gamma1 = utils::cast<AnfNodePtr>((*equiv)[gamma1_]);
   auto gamma2 = utils::cast<AnfNodePtr>((*equiv)[gamma2_]);
   auto gamma3 = utils::cast<AnfNodePtr>((*equiv)[gamma3_]);
+  if (is_layernorm_) {
+    gamma4 = utils::cast<AnfNodePtr>((*equiv)[gamma4_]);
+  }
   input_mask = mask ? utils::cast<AnfNodePtr>((*equiv)[mask_]) : nullptr;
   auto cross_mask = utils::cast<AnfNodePtr>((*equiv)[cross_mask_]);
   auto base_shape_ptr = weight_m->Shape();
@@ -498,6 +523,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP
     if (mask) new_node_inputs.push_back(cross_mask);
     new_node_inputs.insert(new_node_inputs.end(),
                            {position_bias_cross, weight_attn_cross_o, gamma3, weight_m, weight_p});
+    if (is_layernorm_) new_node_inputs.push_back(gamma4);
   } else {
     new_node_inputs.insert(new_node_inputs.end(), {beta1, weight_qkv, bias_attn_qkv});
     if (mask) new_node_inputs.push_back(input_mask);
@@ -506,6 +532,7 @@ CNodePtr DecoderLayerFusion::CreateMaskedDecoderLayerFusionNode(const FuncGraphP
     if (mask) new_node_inputs.push_back(cross_mask);
     new_node_inputs.insert(new_node_inputs.end(), {weight_attn_cross_o, bias_attn_cross_o, gamma3, beta3, weight_m,
                                                    bias_m, weight_p, bias_p});
+    if (is_layernorm_) new_node_inputs.insert(new_node_inputs.end(), {gamma4, beta4});
   }
   auto new_node = func_graph->NewCNode(new_node_inputs);
   MS_CHECK_TRUE_RET(new_node != nullptr, nullptr);
diff --git a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h
index f5faec283e2fca3114817db87edb7e554b6e14e6..b7accd37869fa57f61f4364eee86dd39081ff384 100644
--- a/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h
+++ b/mindspore/lite/tools/optimizer/fusion/decoder_layer_fusion.h
@@ -32,8 +32,8 @@ namespace mindspore {
 namespace opt {
 class DecoderLayerFusion : public MultiplePatternProcessPass {
  public:
-  explicit DecoderLayerFusion(const std::string &name = "DecoderLayerFusion", bool multigraph = true)
-      : MultiplePatternProcessPass(name, multigraph) {}
+  explicit DecoderLayerFusion(bool layer_norm, const std::string &name = "DecoderLayerFusion", bool multigraph = true)
+      : MultiplePatternProcessPass(name, multigraph) { layer_norm_ = layer_norm; }
   ~DecoderLayerFusion() override = default;
@@ -46,7 +46,7 @@ class DecoderLayerFusion : public MultiplePatternProcessPass {
  private:
   VectorRef DefinePatternDecoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias,
-                                      bool mask) const;
+                                      bool mask, bool is_layer_norm) const;
   VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const;
   VectorRef DefineLayerNorm(VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const;
   CNodePtr CreateMaskedDecoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv,
@@ -54,17 +54,18 @@ class DecoderLayerFusion :
public MultiplePatternProcessPass { std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2, float *eps3, bool *is_position_bias1, bool *is_position_bias2, + float *eps1, float *eps2, float *eps3, float *eps4, bool *is_position_bias1, bool *is_position_bias2, float *scale1, float *scale2) const; AnfNodePtr GetAttribute(const FuncGraphPtr &func_graph, const EquivPtr &equiv, VarPtr node_name) const; bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; - + VectorRef DefineDecoderLayerNorm(VectorRef input, VarPtr gamma, VarPtr eps) const ; protected: const std::string kPatternDecoderLayerPre = "PatternDecoderLayerPre"; const std::string kPatternDecoderLayerPost = "PatternDecoderLayerPost"; - const std::string kPatternDecoderLayerNormPre = "kPatternDecoderLayerNormPre"; - const std::string kPatternDecoderLayerNormPost = "kPatternDecoderLayerNormPost"; + const std::string kPatternDecoderLayerNormPre = "PatternDecoderLayerNormPre"; + const std::string kPatternDecoderLayerNormPost = "PatternDecoderLayerNormPost"; + const std::string kPatternDecoderLayerNormT5Pre = "PatternDecoderLayerNormT5Pre"; const std::string kPatternDecoderT5Pre = "PatternDecoderT5Pre"; const std::string kPatternDecoderT5Post = "PatternDecoderT5Post"; mutable VarPtr hidden_stats_{nullptr}; @@ -75,7 +76,9 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr beta2_{nullptr}; mutable VarPtr gamma2_{nullptr}; mutable VarPtr gamma3_{nullptr}; + mutable VarPtr gamma4_{nullptr}; mutable VarPtr beta3_{nullptr}; + mutable VarPtr beta4_{nullptr}; mutable VarPtr weight_attn_qkv_{nullptr}; mutable VarPtr weight_attn_qkv_cross_{nullptr}; mutable VarPtr weight_attn_o_{nullptr}; @@ -104,9 +107,12 @@ class DecoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr eps1_{nullptr}; mutable VarPtr eps2_{nullptr}; mutable VarPtr eps3_{nullptr}; + mutable VarPtr eps4_{nullptr}; mutable bool is_position_bias_{false}; mutable bool is_layernorm_fusion_{false}; + mutable bool is_layernorm_{false}; mutable ActType act_type_{ActType::ActType_No}; + mutable bool layer_norm_; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc index ada4b7e856ea534fec4812e3eba94f2677231365..c7bf1e506acb7a4b92d7a73112ec539c893e9f15 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.cc @@ -42,6 +42,10 @@ bool EncoderLayerFusion::Init() const { MS_CHECK_TRUE_RET(beta2_ != nullptr, false); gamma2_ = std::make_shared("gamma2"); MS_CHECK_TRUE_RET(gamma2_ != nullptr, false); + beta3_ = std::make_shared("beta3"); + MS_CHECK_TRUE_RET(beta3_ != nullptr, false); + gamma3_ = std::make_shared("gamma3"); + MS_CHECK_TRUE_RET(gamma3_ != nullptr, false); weight_attn_qkv_ = std::make_shared("weight_attn_qkv"); MS_CHECK_TRUE_RET(weight_attn_qkv_ != nullptr, false); weight_attn_o_ = std::make_shared(IsParamNode, "weight_attn_o"); @@ -70,10 +74,12 @@ bool EncoderLayerFusion::Init() const { MS_CHECK_TRUE_RET(position_bias_ != nullptr, false); is_act_ = std::make_shared(std::bind(IsOpType, p1, prim::kPrimActivation), 
"activation"); MS_CHECK_TRUE_RET(is_act_ != nullptr, {}); - eps1_ = std::make_shared("position_bias"); + eps1_ = std::make_shared("eps1_"); MS_CHECK_TRUE_RET(eps1_ != nullptr, false); - eps2_ = std::make_shared("position_bias"); + eps2_ = std::make_shared("eps2_"); MS_CHECK_TRUE_RET(eps2_ != nullptr, false); + eps3_ = std::make_shared("eps3_"); + MS_CHECK_TRUE_RET(eps3_ != nullptr, false); return true; } @@ -137,9 +143,9 @@ VectorRef EncoderLayerFusion::DefineLayerNorm(bool is_position_bias, VectorRef i return scale; } } - + VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = true, bool layernorm_fusion = false, - bool is_position_bias = false, bool mask = true) const { + bool is_position_bias = false, bool mask = true, bool is_layer_norm = false) const { VectorRef tuple, tuple2, tuple3, reshape2, matmul1, inputs; auto is_reshape1 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimReshape), "reshape-encoder"); MS_CHECK_TRUE_RET(is_reshape1 != nullptr, {}); @@ -211,6 +217,8 @@ VectorRef EncoderLayerFusion::DefinePatternEncoderLayer(bool post_layernorm = tr auto reshape3 = VectorRef({is_reshape3, matmul2, var3}); auto is_add3 = std::make_shared(std::bind(IsOpType, p1, prim::kPrimAddFusion), "is_add3"); auto add3 = VectorRef({is_add3, reshape2, reshape3}); + if(is_layer_norm) + return DefineLayerNorm(is_position_bias, add3, gamma3_, beta3_, eps3_); if (!post_layernorm || !layernorm_fusion) { return add3; } @@ -241,6 +249,7 @@ std::unordered_map EncoderLayerFusion::DefinePatterns() MS_LOG(ERROR) << "initial member failed."; return patterns; } + patterns[kPatternEncoderLayerNormT5Pre] = DefinePatternEncoderLayer(false, false, true, true, true); patterns[kPatternEncoderLayerPre] = DefinePatternEncoderLayer(false); patterns[kPatternEncoderLayerPost] = DefinePatternEncoderLayer(true); patterns[kPatternEncoderLayerPostNorm] = DefinePatternEncoderLayer(true, true); @@ -255,9 +264,11 @@ AnfNodePtr EncoderLayerFusion::Process(const std::string &pattern_name, const mi if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + is_layernorm_ = false; + if(pattern_name == kPatternEncoderLayerNormT5Pre) is_layernorm_ = true; if (pattern_name == kPatternEncoderLayerPostNorm || pattern_name == kPatternEncoderLayerPreNorm) is_layernorm_fusion_ = true; - if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post) is_position_bias_ = true; + if (pattern_name == kPatternEncoderLayerT5Pre || pattern_name == kPatternEncoderLayerT5Post || pattern_name == kPatternEncoderLayerNormT5Pre) is_position_bias_ = true; bool mask = true; bool post_layernorm = false; if (pattern_name == kPatternEncoderLayerPost || pattern_name == kPatternEncoderLayerPostNorm || @@ -330,7 +341,7 @@ AnfNodePtr EncoderLayerFusion::GetAttribute(const FuncGraphPtr &func_graph, cons } STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, - int *head_size, float *eps1, float *eps2, float *scale) const { + int *head_size, float *eps1, float *eps2, float *eps3, float *scale) const { auto attn_input = GetAttribute(func_graph, equiv, is_attention_); MS_ASSERT(attn_input != nullptr); auto attn_prim = ops::GetOperator(attn_input); @@ -367,6 +378,12 @@ STATUS EncoderLayerFusion::CheckPattern(const FuncGraphPtr &func_graph, const Eq MS_LOG(ERROR) << "not found eps2"; return RET_ERROR; } + if(is_layernorm_){ + if (GetEps(equiv, eps3_, eps3) != RET_OK) { + MS_LOG(ERROR) << "not found eps3"; + return RET_ERROR; + } 
+ } } if (!is_position_bias_) { if (!IsActGELU(func_graph, equiv, is_act_)) { @@ -390,12 +407,13 @@ std::shared_ptr EncoderLayerFusion::CreatePrim(const FuncGrap int head_size = 0; float eps1 = 1e-5; float eps2 = 1e-5; + float eps3 = 1e-5; float scale = 1.0f; - if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &scale)) { + if (CheckPattern(func_graph, equiv, &head_num, &head_size, &eps1, &eps2, &eps3, &scale)) { return nullptr; } - encoder_layer_prim->Init(head_num, head_size, eps1, eps2, ffn_hidden_size, is_position_bias_, post_layernorm, scale, - act_type_); + encoder_layer_prim->Init(head_num, head_size, eps1, eps2, eps3, ffn_hidden_size, is_position_bias_, post_layernorm, scale, + act_type_, is_layernorm_); return encoder_layer_prim; } @@ -406,7 +424,8 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP MS_ASSERT(equiv != nullptr); MS_ASSERT(node != nullptr); auto input = utils::cast((*equiv)[input_]); - AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p; + AnfNodePtr position_bias, input_mask, bias_attn_o, bias_attn_qkv, beta1, beta2, bias_m, bias_p, + beta3, gamma3; auto weight_qkv = utils::cast((*equiv)[weight_attn_qkv_]); auto weight_attn_o = utils::cast((*equiv)[weight_attn_o_]); auto weight_m = utils::cast((*equiv)[weight_m_]); @@ -418,9 +437,13 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP bias_p = utils::cast((*equiv)[bias_p_]); beta1 = utils::cast((*equiv)[beta1_]); beta2 = utils::cast((*equiv)[beta2_]); + if(is_layernorm_) + beta3 = utils::cast((*equiv)[beta3_]); } auto gamma1 = utils::cast((*equiv)[gamma1_]); auto gamma2 = utils::cast((*equiv)[gamma2_]); + if(is_layernorm_) + gamma3 = utils::cast((*equiv)[gamma3_]); input_mask = mask ? 
utils::cast((*equiv)[mask_]) : nullptr; auto base_shape_ptr = weight_m->Shape(); MS_EXCEPTION_IF_NULL(base_shape_ptr); @@ -441,6 +464,7 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP new_node_inputs.insert(new_node_inputs.end(), {gamma1, weight_qkv}); if (mask) new_node_inputs.push_back(input_mask); new_node_inputs.insert(new_node_inputs.end(), {position_bias, weight_attn_o, gamma2, weight_m, weight_p}); + if(is_layernorm_) new_node_inputs.push_back(gamma3); } else { if (!post_layernorm) { new_node_inputs.insert(new_node_inputs.end(), {gamma1, beta1, weight_qkv, bias_attn_qkv}); @@ -453,6 +477,7 @@ CNodePtr EncoderLayerFusion::CreateMaskedEncoderLayerFusionNode(const FuncGraphP new_node_inputs.insert(new_node_inputs.end(), {weight_attn_o, bias_attn_o, gamma1, beta1, weight_m, bias_m, weight_p, bias_p, gamma2, beta2}); } + if(is_layernorm_) new_node_inputs.insert(new_node_inputs.end(), {gamma3, beta3}); } auto new_node = func_graph->NewCNode(new_node_inputs); MS_CHECK_TRUE_RET(new_node != nullptr, nullptr); diff --git a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h index 9f93f60396226caa25d3b40d5b73b63a104f3df1..a037d0d958c5f96797d05875febc842c39abebf7 100644 --- a/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/encoder_layer_fusion.h @@ -49,10 +49,13 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { const std::string kPatternEncoderLayerPre = "PatternTEncoderLayerPre"; const std::string kPatternEncoderLayerPostNorm = "PatternTEncoderLayerPostNorm"; const std::string kPatternEncoderLayerPreNorm = "PatternTEncoderLayerPreNorm"; - const std::string kPatternEncoderLayerT5Post = "kPatternEncoderLayerT5Post"; - const std::string kPatternEncoderLayerT5Pre = "kPatternEncoderLayerT5Pre"; + const std::string kPatternEncoderLayerT5Post = "PatternEncoderLayerT5Post"; + const std::string kPatternEncoderLayerT5Pre = "PatternEncoderLayerT5Pre"; + const std::string kPatternEncoderLayerNormT5Pre = "PatternEncoderLayerNormT5Pre"; + VectorRef DefinePatternEncoderLayer(bool post_layernorm, bool layernorm_fusion, bool is_position_bias_, - bool mask) const; VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; + bool mask, bool is_layer_norm) const; + VectorRef getTuple(bool post_layernorm, bool layernorm_fusion, bool is_position_bias) const; VectorRef DefineLayerNorm(bool is_position_bias, VectorRef input, VarPtr gamma, VarPtr beta, VarPtr eps) const; CNodePtr CreateMaskedEncoderLayerFusionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const AnfNodePtr &node, bool post_layernorm = true, @@ -61,7 +64,7 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { bool IsActGELU(const FuncGraphPtr &func_graph, const EquivPtr &equiv, const VarPtr &input_prim) const; lite::STATUS GetEps(const EquivPtr &equiv, VarPtr node_name, float *eps) const; lite::STATUS CheckPattern(const FuncGraphPtr &func_graph, const EquivPtr &equiv, int *head_num, int *head_size, - float *eps1, float *eps2, float *scale) const; + float *eps1, float *eps2, float *eps3, float *scale) const; std::shared_ptr CreatePrim(const FuncGraphPtr &func_graph, const EquivPtr &equiv, bool post_layernorm, int64_t ffn_hidden_size) const; @@ -72,6 +75,8 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr gamma1_{nullptr}; mutable VarPtr beta2_{nullptr}; mutable VarPtr gamma2_{nullptr}; + 
mutable VarPtr beta3_{nullptr}; + mutable VarPtr gamma3_{nullptr}; mutable VarPtr weight_attn_qkv_{nullptr}; mutable VarPtr weight_attn_qkv_cross_{nullptr}; mutable VarPtr weight_attn_o_{nullptr}; @@ -91,6 +96,8 @@ class EncoderLayerFusion : public MultiplePatternProcessPass { mutable VarPtr is_act_{nullptr}; mutable VarPtr eps1_{nullptr}; mutable VarPtr eps2_{nullptr}; + mutable VarPtr eps3_{nullptr}; + mutable bool is_layernorm_{false}; }; } // namespace opt } // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc index d9b2ed45a1cf7b9602d6a7e3e9a520e66cf2c3ff..9354c554bb45f1ffba93035b500bbf818276d8a2 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.cc @@ -613,7 +613,7 @@ bool MultiHeadAttentionFusion::CheckPattern(const EquivPtr &equiv, int *head_num } *head_num = out.at(0); *head_size = out.at(1); - scale_ = 1.0f / sqrtf(*head_size * 1.0f); + scale_ = (scale_ == 0.0f) ? 1.0f / sqrtf(*head_size * 1.0f) : scale_; return true; } @@ -623,19 +623,20 @@ AnfNodePtr MultiHeadAttentionFusion::Process(const std::string &pattern_name, co if (func_graph == nullptr || node == nullptr || equiv == nullptr) { return nullptr; } + scale_ = 0.0f; if ((pattern_name == kMPAWithMaskPatternName) || (pattern_name == kMPAWithMaskPatternNamePA) || (pattern_name == kMPAWithMaskPatternNameT5) || (pattern_name == kMPAWithMaskPatternNameT5New) || (pattern_name == kMPAWithMaskTransposePatternNameT5New) || (pattern_name == kMPAWithMaskPatternNameT5New2)) { if (pattern_name == kMPAWithMaskPatternNameT5New || pattern_name == kMPAWithMaskTransposePatternNameT5New || pattern_name == kMPAWithMaskPatternNameT5New2) { t5_x_ = true; - scale_ = (pattern_name == kMPAWithMaskPatternNameT5New2) ? 
1.0f : scale_; + scale_ = 1.0f; } - return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), true); + return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node, true); } if (pattern_name == kMPAPatternName || pattern_name == kMPAPatternNameSwin1 || pattern_name == kMPAPatternNameSwin2 || pattern_name == kMPAPatternNamePA) - return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node->fullname_with_scope(), false); + return CreateMaskedMultiHeadAttentionNode(func_graph, equiv, node, false); return nullptr; } @@ -863,7 +864,7 @@ std::vector MultiHeadAttentionFusion::GetNewNodeInputs(const EquivPt } CNodePtr MultiHeadAttentionFusion::CreateMaskedMultiHeadAttentionNode(const FuncGraphPtr &func_graph, - const EquivPtr &equiv, const string &base_name, + const EquivPtr &equiv, const mindspore::AnfNodePtr &node, bool mask) const { MS_ASSERT(func_graph != nullptr); MS_ASSERT(equiv != nullptr); @@ -885,7 +886,7 @@ CNodePtr MultiHeadAttentionFusion::CreateMaskedMultiHeadAttentionNode(const Func auto c_bias = ConcatTensors({bias_q_tensor, bias_k_tensor, bias_v_tensor}); c_bias_param = func_graph->add_parameter(); MS_CHECK_TRUE_RET(c_bias_param != nullptr, nullptr); - c_bias_param->set_name(base_name + "/bias_qkv"); + c_bias_param->set_name(node->fullname_with_scope() + "/bias_qkv"); if (lite::InitParameterFromTensorInfo(c_bias_param, c_bias) != lite::RET_OK) { MS_LOG(ERROR) << "Init parameter from tensor info failed."; return nullptr; @@ -912,7 +913,7 @@ CNodePtr MultiHeadAttentionFusion::CreateMaskedMultiHeadAttentionNode(const Func MS_LOG(ERROR) << "Init parameter from tensor info failed."; return nullptr; } - c_weight_param->set_name(base_name + "/weight_qkv"); + c_weight_param->set_name(node->fullname_with_scope() + "/weight_qkv"); ParameterPtr q_weight_param; if (cross) { q_weight_param = func_graph->add_parameter(); @@ -932,12 +933,15 @@ CNodePtr MultiHeadAttentionFusion::CreateMaskedMultiHeadAttentionNode(const Func return nullptr; } } - new_node->set_fullname_with_scope(base_name + "/attention"); + new_node->set_fullname_with_scope(node->fullname_with_scope() + "/attention"); CNodePtr ret_node; if (vnode) { auto get_item_node = MakeGetTuple(func_graph, new_node, knode, vnode); ret_node = get_item_node; } else { + auto old_node = node->cast(); + MS_CHECK_TRUE_RET(old_node->abstract() != nullptr, nullptr); + new_node->set_abstract(old_node->abstract()->Clone()); ret_node = new_node; } RemoveRedundantInput(func_graph, redundant); diff --git a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h index 345616ed4aee9b1859074d49ca5bb55d1b446a5a..9af12e4c2d2dfb7058f0928f0d9ebc2e8b5919dc 100644 --- a/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h +++ b/mindspore/lite/tools/optimizer/fusion/multi_head_attention_fusion.h @@ -60,7 +60,7 @@ class MultiHeadAttentionFusion : public MultiplePatternProcessPass { // create masked-multi-head-attention CNodePtr CreateMaskedMultiHeadAttentionNode(const FuncGraphPtr &func_graph, const EquivPtr &equiv, - const std::string &base_name, bool mask = true) const; + const mindspore::AnfNodePtr &node, bool mask = true) const; // check pattern bool CheckPattern(const EquivPtr &equiv, int *head_num, int *head_size) const; CNodePtr CreateOutputGetItem(const FuncGraphPtr &func_graph, const CNodePtr &node, const int item_index) const; diff --git a/trc/readers/mindir/readir.cc b/trc/readers/mindir/readir.cc index 
98455e56035d9292b80e75a6e521634dc51089e4..bcbb8fd231634a3d2c99dc210c7c183427568720 100644 --- a/trc/readers/mindir/readir.cc +++ b/trc/readers/mindir/readir.cc @@ -118,23 +118,24 @@ float GetFloatSum(const float *data, int size) { } void printTensor(int id, TensorProto const &tensor, int limit = 0) { - std::cout << id << ")" << (tensor.has_name() ? tensor.name() : "no name") << std::endl; + std::cout << "tensor #"<< id << std::endl; + if (tensor.has_name()) { + std::cout << "name=" << tensor.name() << std::endl; + } if (tensor.has_doc_string()) { std::cout << "doc_string = " << tensor.doc_string(); } if (tensor.has_ref_key()) { std::cout << "ref_key=" << tensor.ref_key() << std::endl; } - - std::cout << " " - << "t:" << EnumAttributeType(tensor.data_type()) << ""; + std::cout << "data_type=" << EnumAttributeType(tensor.data_type()) << std::endl; if (tensor.has_raw_data()) { const char *data = tensor.raw_data().data(); std::cout << " size=(" << tensor.raw_data().size() << ")\n"; if (tensor.data_type() == TensorProto_DataType_FLOAT) { const float *float_data = reinterpret_cast(data); size_t size = tensor.raw_data().size() / sizeof(float); - std::cout << "data="; + std::cout << "data:"; for (size_t i = 0; i < std::min(size, static_cast(limit)); i++) { std::cout << float_data[i] << " "; } @@ -185,14 +186,20 @@ void printTensor(int id, TensorProto const &tensor, int limit = 0) { } void printAttr(int i, const AttributeProto &attr) { - std::cout << i << ":" << attr.name() << " " - << "( " << attr.ref_attr_name() << ")[" << EnumAttributeType(attr.type()) << "]" << std::endl << "{" ; - if (attr.has_t()) { - std::cout << "tensor:"; - printTensor(0, attr.t()); - } - if (attr.tensors_size() > 0) { - std::cout << "tensors:" << attr.tensors_size() << "\n"; + std::cout << "attr #" << i << std::endl; + if (attr.has_name()) { + std::cout << "name=" << attr.name() << std::endl; + } + if (attr.has_ref_attr_name()) { + std::cout << "ref_attr_name=" << attr.ref_attr_name() << std::endl; + } + std::cout << "type=" << EnumAttributeType(attr.type()) << std::endl; + if (attr.has_t()) { + std::cout << "t="; + printTensor(0, attr.t()); + } + if (attr.tensors_size() > 0) { + std::cout << "tensors: (" << attr.tensors_size() << ")" << std::endl; for (int i = 0; i < attr.tensors_size(); i++) printTensor(i, attr.tensors(i)); } @@ -232,7 +239,7 @@ void printAttr(int i, const AttributeProto &attr) { case AttributeProto_AttributeType_TUPLE: case AttributeProto_AttributeType_LIST: if (attr.values_size() > 0) { - std::cout << "values:" << std::endl; + std::cout << "values:" << "(" << attr.values_size() << ")" << std::endl; for (int i = 0; i < attr.values_size(); i++) { printAttr(i, attr.values(i)); } @@ -244,44 +251,44 @@ void printAttr(int i, const AttributeProto &attr) { std::cout << "}\n"; } -void printValue(ValueInfoProto const &val) { +void printValue(int id, ValueInfoProto const &val) { + std::cout << "value #" < &map) { - if (node.op_type().find("TupleGetItem") != std::string::npos) + auto op_type = node.op_type(); + if (op_type.find("TupleGetItem") != std::string::npos || op_type.find("MakeTuple") != std::string::npos) { auto th = map.find(node.input(0)); if (th != map.end()) { @@ -559,10 +567,15 @@ void printOut(ModelProto &model) { } for (int i = 0; i < graph.output_size(); i++) { const ValueInfoProto &v = graph.output(i); + if (v.tensor_size() > 1) { + + std::cout << v.tensor(0).name() << std::endl; + } else { auto t = map.find(v.name()); if (t != map.end()) { printOut(t->second,map); } + } } } diff --git 
a/trc/transformer/MultiHeadTester.py b/trc/transformer/MultiHeadTester.py index 286bc5b75d17d7003b70b68a045b6fa6a2e00bfe..a3ff09ccc184ce59c7082437f13e6df786b31f02 100755 --- a/trc/transformer/MultiHeadTester.py +++ b/trc/transformer/MultiHeadTester.py @@ -1309,6 +1309,7 @@ class TransformerEncoderLayerX(Cell): input_x = x else: input_x = self.layernorm1(x) + # return input_x input_x = F.cast(input_x, self.dtype) # indicate whether reset saved states @@ -1327,7 +1328,6 @@ class TransformerEncoderLayerX(Cell): attention, layer_present = self.attention(input_x, input_x, input_x, input_mask, self.key_past, self.value_past, batch_valid_length) - # For post-layernorm the inputs for residual path are output of self-attention and output of layernorm if self.post_layernorm_residual: x = self.add(input_x, attention) diff --git a/trc/transformer/cfg_bert.config b/trc/transformer/cfg_bert.config index 0071f96e928177136b8cc66a55627f982911ef49..5f053e985f4111e98b1a9f12d2eb68334b718d83 100644 --- a/trc/transformer/cfg_bert.config +++ b/trc/transformer/cfg_bert.config @@ -1,2 +1,3 @@ [gpu_context] -input_shape=input_ids:[1,256];token_type_ids:[1,256];input_mask:[1,256] \ No newline at end of file +input_shape=input_ids:[1,128];token_type_ids:[1,128];input_mask:[1,128,128] + diff --git a/trc/transformer/deploy.sh b/trc/transformer/deploy.sh index 001af7d5e01363fa9cb1346d27a306d578884e19..836780dcdff5836f59ef2b1abf94132cebf33a4f 100755 --- a/trc/transformer/deploy.sh +++ b/trc/transformer/deploy.sh @@ -5,13 +5,15 @@ system=${base}/trc/system_test/release/ubuntu_x86/mindspore-lite-${version}-linu benchmark=${system}/tools/benchmark/benchmark readir=${base}/trc/readers/mindir/readir server=caspi -gpu_id=0 -while getopts "ctG:" opt ; do +gpu_id=6 +while getopts "ctdG:" opt ; do case "${opt}" in t) time=true ;; G) gpu_id=$OPTARG ;; + d) + gbd=true ;; *) echo "Unknown option ${opt}!" 
       ;;
@@ -55,7 +57,13 @@
 echo ${input_files}
 command="cd ${PWD} && "
 command+="LD_LIBRARY_PATH=${system}/runtime/lib:${system}/tools/converter/lib CUDA_VISIBLE_DEVICES=${gpu_id} "
 # command+=" NVIDIA_TF32_OVERRIDE=0 "
+# command+="${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR "
+if [ "${gbd}" == "true" ]
+then
+  command+="gdb --args "
+fi
 command+="${benchmark} --modelFile=$1 --numThreads=1 --warmUpLoopCount=10 --loopCount=1000 --modelType=MindIR "
+
 command+="--inDataFile=\"${input_files}\" "
 if [ "${time}" == "" ]
 then
@@ -65,7 +73,7 @@ if [ -f cfg_${model}.config ]; then
   command+="--configFile=cfg_${model}.config "
 fi
 command+="--device=GPU "
-#command+="--enableFp16=true"
+# command+="--enableFp16=true"
 echo command=${command}
 echo ${command} > execute.sh
 rsync -v execute.sh ${server}:${PWD}
diff --git a/trc/transformer/ftBench.py b/trc/transformer/ftBench.py
index cc0cfcebbb1a84e39f3683fa5b2211d455e8a1fd..0be9ff687c6ca3cf854fb2d430f33a79e1a60643 100755
--- a/trc/transformer/ftBench.py
+++ b/trc/transformer/ftBench.py
@@ -12,14 +12,14 @@ f.close()
 system = f'{base}/trc/system_test/release/ubuntu_x86/mindspore-lite-{version}-linux-x64'
 benchmark = f'{system}/tools/benchmark'
 work_dir=f'{base}/trc/transformer'
-image = "private_transformer:0.1"
-server = "caspi"
+image = "private_transformer:0.2"
+server = "local"
 enable_fp16 = "false"
 suffix="fp32"
 usage='enter the correct parameters: app=ch\\trc, act=runtime\\be, loop count=int>=0, server=local\\num of server\nif app=trc and act=be loop count must be 1'
 app='ch'
 act='be'
-cuda_visible_dev=6
+cuda_visible_dev=3
 loop_count=1
 if len(sys.argv)>2 or len(sys.argv)==1:
   parameters=sys.argv[1:]
@@ -130,7 +130,6 @@ for line_model_arg in models_arg:
       os.system(f"ssh {server} 'cd {system}/..
diff --git a/trc/transformer/models.txt b/trc/transformer/models.txt
index b69028b155b878826aae6f5042efd36ed0908c41..0ddf679a10cdbf4afff73c08be8cc681177ac3bb 100755
--- a/trc/transformer/models.txt
+++ b/trc/transformer/models.txt
@@ -1,8 +1,8 @@
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5
-
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5
+#
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5
 
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer_t5
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5
@@ -14,11 +14,11 @@
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer
 
 #run the following tests before push
--b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1
+#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1
 #-b 1 -l 66 -s 128 -t 256 -H 12 -S 768 -p 0 -m mha_cross
--b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross
--b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5
--b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross
+#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross
+#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5
+#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross
 
 #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer
 #-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer
@@ -30,13 +30,13 @@
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -f 3072 -x 0 -m transformer_encoder_layer_t5
 
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer_t5
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer_t5
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5
 
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer
--b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_decoder_layer
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer
 
@@ -78,9 +78,14 @@
 #-b 8 -l 12 -H 4 -S 512 -s 128 -P 1 -m transformer_encoder_layer
 #-b 1 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer
 #-b 4 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer
-#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -m transformer_encoder_layer
+#-b 8 -l 12 -H 12 -S 768 -s 128 -P 0 -c true -m transformer_encoder_layer
+
 #-b 1 -l 12 -H 4 -S 512 -s 128 -P 0 -f 3072 -m transformer_encoder_layer
 #-b 64 -l 12 -H 12 -S 768 -s 128 -m bert
+#-b 16 -l 12 -H 12 -S 768 -s 512 -c 75 -m bert
+#-b 32 -l 12 -H 12 -S 768 -s 512 -c 75 -m bert
+#-b 64 -l 12 -H 12 -S 768 -s 512 -c 75 -m bert
+
 #-b 64 -l 24 -H 12 -S 768 -s 128 -m bert
 
 # ------------------------- Tests coverage -----------------------------------
@@ -101,7 +106,6 @@
 #-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -m transformer_decoder_layer
 #-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P True -m bert
 #-b 1 -l 2 -H 2 -S 2 -s 128 -m T5
-#-b 1 -l 2 -H 2 -S 8 -s 20 -f 1024 -P 1 -m bert
 #-b 1 -l 12 -H 12 -S 768 -s 128 -m bert
 #-b 8 -l 12 -H 12 -S 768 -s 128 -m bert
 
@@ -144,3 +148,62 @@
 #-b 1 -l 6 -s 512 -t 512 -H 8 -S 512 -f 2048 -m transformer
 #-b 1 -l 6 -s 128 -t 128 -H 12 -S 768 -f 3072 -m transformer
 #-b 1 -l 6 -s 512 -t 512 -H 12 -S 768 -f 3072 -m transformer
+
+#my-tests
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 1 -m transformer_encoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer_t5
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 1 -m transformer_decoder_layer_t5
+#-b 1 -l 66 -s 128 -H 12 -S 768 -p 0 -m mha_x1
+#-b 1 -l 66 -s 20 -t 20 -H 3 -S 15 -p 0 -m mha_cross
+#-b 1 -l 66 -s 20 -H 4 -S 768 -p 0 -m mha_T5
+#-b 1 -l 66 -s 20 -t 40 -H 4 -S 768 -p 0 -m mha_T5_cross
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_decoder_layer
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_decoder_layer
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 0 -x 0 -m transformer_encoder_layer
+#-b 1 -l 12 -H 2 -S 8 -s 20 -f 32 -P 1 -x 0 -m transformer_encoder_layer
+-b 6 -l 12 -H 12 -S 768 -s 128 -f 3072 -P 0 -c 75 -m bert
+#-b 6 -l 12 -s 128 -t 256 -H 8 -S 512 -f 2048 -c 75 -m T5
+
+#-b 1 -s 128 -t 256 -H 8 -S 512 -m mha_T5_cross
+
+#fusion encoder + decoder:
+#0.854797% LAYERNORM+128+256
+#0.861399% LAYERNORM+128+128
+#0.21935% NO LAYERNORM+128+256 :(trt = 0.256139%)
+#0.399801% NO LAYERNORM+128+128
+
+#0.223942% NO LAYERNORM + MHA_FUSION + 128+256
+
+#0.766314% NO LAYERNORM +128+256 + VSL
+#0.57975% NO LAYERNORM +256+128 + VSL
+#1.04311% NO LAYERNORM +128+128 + VSL
+#1.04311% NO LAYERNORM +256+256 + VSL
+
+#vsl+memory_mask row+col : 0.496992% + tgt!=src : 0.766314% + pos : 0.686856%
+#+ all col+row : 0.742878% + not pos = 0.67844%
+# +tgt!=src : 0.734175% + not pos = 0.687988%
+#vsl+memory_mask col : 0.496992% + tgt!=src : 0.766314%
+#vsl+memory_mask row : 88.2373%
+
+#1 with norm 0.315469% without 0.168434%
+
+#fp32 no-vsl no-layer-norm = 0.427564%
+#fp32 no-vsl layer-norm = 0.427583%
+#fp32 vsl no-layer-norm = 0.427564%
+#fp32 vsl layer-norm = 0.427583%
+
+#full model + vsl = 0.151727% + no-tf = 0.205553% + tgt!=src = 0.204537%
+
+#T5 6 batch +vsl : 0.66678%
+#           no vsl : 0.324536%
+#T5 1 batch vsl : 0.224442%
+#no vsl : 0.199856%
+
+#without layernorm encoder and decoder
+#T5 6 batch +vsl : 0.226612% without decoder_layernorm 0.226612%
+#           no vsl : 0.333789% without decoder_layernorm : 0.333875% trt : 0.314171%
+
+#with encoder + decoder convert layer-norm 0.666895%
+#without 0.326567%
+#with mix 1.82171%
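The percentage comments appended to models.txt record accuracy gaps between the fused lite model and its reference output for the various layer-norm/VSL combinations. The diff does not show the metric itself; a common choice consistent with figures in this range is an L1 relative error expressed as a percentage. A sketch follows, where relative_error_percent is a hypothetical helper, not code from the repo.

    import numpy as np

    # Assumed metric: L1 relative error between lite output and reference,
    # as a percentage; the actual metric is not visible in this diff.
    def relative_error_percent(actual: np.ndarray, expected: np.ndarray) -> float:
        return 100.0 * np.abs(actual - expected).sum() / np.abs(expected).sum()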
diff --git a/trc/transformer/t.config b/trc/transformer/t.config
index 0fad133d432b210e3d49d70c6a36f480ff877951..9b6909c5b952013619917aab943a2d0c5e27e6ff 100755
--- a/trc/transformer/t.config
+++ b/trc/transformer/t.config
@@ -3,3 +3,4 @@
 #fusion_blacklists="EncoderLayerFusion","DecoderLayerFusion"
 #fusion_blacklists="DecoderLayerFusion"
 #fusion_blacklists="EncoderLayerFusion"
+
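t.config keeps all fusion_blacklists entries commented out, so the EncoderLayerFusion and DecoderLayerFusion passes presumably stay enabled by default; uncommenting one line is the way to exclude that fusion pass when the tools read the file through --configFile. A small sketch for toggling an entry from a script, where enable_blacklist is a hypothetical helper:

    # Hypothetical helper: uncomment one blacklist entry in t.config, e.g. to
    # disable DecoderLayerFusion while measuring the encoder fusion alone.
    def enable_blacklist(path="t.config",
                         line='fusion_blacklists="DecoderLayerFusion"'):
        with open(path) as f:
            text = f.read()
        text = text.replace("#" + line, line)  # no-op if already uncommented
        with open(path, "w") as f:
            f.write(text)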
diff --git a/trc/transformer/train_transformer_export.py b/trc/transformer/train_transformer_export.py
index 6356559f794df884b1b8bf8721bae7a55eadc87b..cc99185ad65bceed5bcff2a46c5cda978d84f6a4 100755
--- a/trc/transformer/train_transformer_export.py
+++ b/trc/transformer/train_transformer_export.py
@@ -354,6 +354,8 @@ def transformer_encoder_layer_create():
 
 #            saveCalib('Default/Add-op267', np.array(y), f_y)#2 dims
     elif app=="trc":
+        if compress:
+            y = pruneTensor(y,seq_len,1)
         saveT(y, name + "_output1.fp" + suffix)
 
 def transformer_encoder_layer_t5_create():
@@ -368,12 +370,17 @@ def transformer_encoder_layer_t5_create():
     encoder_input_value = M.Tensor(np.random.normal(0., 0.5, (batch, seq, hid_size)), M.float32)
     encoder_input_mask = M.Tensor(np.random.normal(0., 0.5, (batch, seq, seq)), M.float32)
     pos = M.Tensor(np.random.normal(0., 0.5, (batch, head_num, seq, tgt_seq_len)), M.float32)
+    source_ids = M.Tensor(np.ones((batch,seq)), M.int32)
     # encoder_input_value = M.Tensor(np.zeros((batch, seq, hid_size)), M.float32)
     # encoder_input_mask = M.Tensor(np.zeros((batch, seq, seq)), M.float32)
     q = model.attention.dense1.weight.asnumpy()#.transpose() # hid_size x hid_size
     k = model.attention.dense2.weight.asnumpy()#.transpose()
     v = model.attention.dense3.weight.asnumpy()#.transpose()
-
+    actual_seq = seq // 2
+    if compress:
+        input_value = source_ids.asnumpy()
+        input_value[:,actual_seq:] = 0
+        source_ids = M.Tensor.from_numpy(input_value)
     w = np.concatenate((q, k, v)) # 3xhid_size x hid_size
     w = w.transpose() # hid_size x 3xhid_size
     wt = M.Tensor(w, w_compute_type)
@@ -388,6 +395,7 @@ def transformer_encoder_layer_t5_create():
     saveT(encoder_input_value, name + "_input1.fp" + suffix)
     saveT(encoder_input_mask, name + "_input2.fp" + suffix)
     saveT(pos, name + "_input3.fp" + suffix)
+    saveT(source_ids, name + "_input4.fp" + suffix)
     saveT(gl1, name + "_weight1.fp" + suffix)
     saveT(wt, name + "_weight2.fp" + suffix)
     saveT(wp, name + "_weight3.fp" + suffix)
@@ -404,6 +412,10 @@ def transformer_encoder_layer_t5_create():
     y = model(encoder_input_value, encoder_input_mask,pos)
     print('name=',name)
     export(model, encoder_input_value, encoder_input_mask,pos, file_name= name + "_fwd", file_format='MINDIR')
+    if compress:
+        y_num = y.asnumpy()
+        y_num[:,actual_seq:,:] = 0
+        y = M.Tensor.from_numpy(y_num)
 #    if app=="ch":
     f_y=open(f'./{name}_output.txt','w')
     out_name='output1'
@@ -926,14 +938,15 @@ def bert_create():
     base = repo.working_tree_dir
     name = "bert"
     str=" "
-    os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " )
+    os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} > bert.txt" )
 
 def T5_create():
+    M.context.set_context(mode=M.context.PYNATIVE_MODE)
     repo = git.Repo('.', search_parent_directories=True)
     base = repo.working_tree_dir
     name = "T5"
     str=" "
-    os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} " )
+    os.system(f"python {base}/../transformer_repo/pretrain_{name}.py {str.join(sys.argv[1:-4])} > T5.txt" )
 
 def vit_create():
     repo = git.Repo('.', search_parent_directories=True)
     base = repo.working_tree_dir
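The compress path added to train_transformer_export.py (presumably driven by the -c option that now appears in models.txt) emulates variable sequence length: token ids past actual_seq = seq // 2 are zeroed on the input side, and the exported reference output is zeroed over the same region, so the benchmark comparison only scores valid positions. pruneTensor is existing repo code whose exact semantics are not visible in this diff. A standalone sketch of the zeroing convention, with hypothetical names:

    import numpy as np

    # Sketch of the masking convention used by the compress path above;
    # mask_past is a hypothetical name, not a function from the repo.
    def mask_past(actual_seq: int, ids: np.ndarray, out: np.ndarray):
        ids = ids.copy()   # (batch, seq) int32 source ids
        out = out.copy()   # (batch, seq, hidden) float reference output
        ids[:, actual_seq:] = 0
        out[:, actual_seq:, :] = 0.0
        return ids, out

    # Example: batch=1, seq=20, only the first 10 positions remain scored.
    ids, out = mask_past(10, np.ones((1, 20), np.int32), np.random.rand(1, 20, 8))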