diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8caa766c80fdeb4ba5df1e4b3b89fc98d7ee7e80 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#include "reduce_max_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +constexpr int32_t ELE_NUM_PER_BANK = 16; // half +constexpr int32_t BATCH_PER_CORE = 32; + +namespace optiling { +static ge::graphStatus TilingFunc(gert::TilingContext* context) +{ + ReduceMaxTilingData tiling; + const gert::StorageShape* xShapePtr = context->GetInputShape(0); + const gert::Shape& xShape = xShapePtr->GetStorageShape(); + const gert::RuntimeAttrs* attrs = context->GetAttrs(); + const uint32_t* reduceDim = attrs->GetAttrPointer(0); + const uint32_t xEleNum = context->GetInputTensor(0)->GetShapeSize(); + const uint32_t colNum = xShape.GetDim(*reduceDim); + const uint32_t rowNum = xEleNum / colNum; + + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + const uint32_t actCoreNum = ascendcPlatform.GetCoreNumAiv(); + uint32_t oneRepeatCalcount = BATCH_PER_CORE; + uint32_t allTasks = (rowNum + oneRepeatCalcount - 1) / oneRepeatCalcount; + uint32_t usedCoreNum = allTasks; + if (usedCoreNum > actCoreNum) { + usedCoreNum = actCoreNum; + } + uint32_t calTaskPerCore = allTasks / usedCoreNum; + uint32_t theSplitCore = allTasks % usedCoreNum; + uint32_t lastTaskTail = rowNum % oneRepeatCalcount; + if (lastTaskTail == 0) { + lastTaskTail = oneRepeatCalcount; + } + uint32_t colCalcount = (colNum + ELE_NUM_PER_BANK - 1) / 
ELE_NUM_PER_BANK * ELE_NUM_PER_BANK; + + tiling.set_coreNum(usedCoreNum); + tiling.set_realColVal(colNum); + tiling.set_oneCalNum(oneRepeatCalcount); + tiling.set_calTaskPerCore(calTaskPerCore); + tiling.set_theSplitCore(theSplitCore); + tiling.set_roundColVal(colCalcount); + tiling.set_lastTaskTail(lastTaskTail); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + context->SetTilingKey(1); + context->SetBlockDim(usedCoreNum); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* xShape = context->GetInputShape(0); + const gert::RuntimeAttrs* attrs = context->GetAttrs(); + const uint32_t* reduceDim = attrs->GetAttrPointer(0); + const uint32_t* isKeepDim = attrs->GetAttrPointer(1); + gert::Shape* yShape = context->GetOutputShape(0); + gert::Shape* idxShape = context->GetOutputShape(1); + + for (size_t i = 0; i < xShape->GetDimNum(); i++) { + if (i == *reduceDim) { + if (*isKeepDim) { + yShape->AppendDim(1); + } else { + continue; + } + } else { + yShape->AppendDim(xShape->GetDim(i)); + } + } + *idxShape = *yShape; + + return GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class ReduceMaxCustom : public OpDef { +public: + explicit ReduceMaxCustom(const char* name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT16 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Output("y") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT16 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Output("idx") + .ParamType(REQUIRED) + .DataType({ ge::DT_INT32 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + + this->Attr("reduceDim") + 
.AttrType(REQUIRED) + .Int(); + this->Attr("isKeepDim") + .AttrType(OPTIONAL) + .Int(1); + + this->SetInferShape(ge::InferShape); + + this->AICore() + .SetTiling(optiling::TilingFunc); + + this->AICore().AddConfig("ascend910"); + } +}; + +OP_ADD(ReduceMaxCustom); +} // namespace ops diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..05f0f88708cca3b971c118e56badfd368bc3a140 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#ifndef REDUCE_MAX_CUSTOM_TILING_H +#define REDUCE_MAX_CUSTOM_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(ReduceMaxTilingData) + TILING_DATA_FIELD_DEF(uint32_t, coreNum); + TILING_DATA_FIELD_DEF(uint32_t, realColVal); + TILING_DATA_FIELD_DEF(uint32_t, oneCalNum); + TILING_DATA_FIELD_DEF(uint32_t, calTaskPerCore); + TILING_DATA_FIELD_DEF(uint32_t, theSplitCore); + TILING_DATA_FIELD_DEF(uint32_t, roundColVal); + TILING_DATA_FIELD_DEF(uint32_t, lastTaskTail); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(ReduceMaxCustom, ReduceMaxTilingData) +} +#endif // REDUCE_MAX_CUSTOM_TILING_H diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05d53edbe694bbede84223ffe1d70b0b9bb03236 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp 
@@ -0,0 +1,127 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#include "kernel_operator.h" +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelReduceMax { +public: + __aicore__ inline KernelReduceMax() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR idx, uint32_t coreNum, uint32_t realColVal, + uint32_t oneCalNum, uint32_t calTaskPerCore, uint32_t theSplitCore, + uint32_t roundColVal, uint32_t lastTaskTail) + { + this->isLastCore = GetBlockIdx() == GetBlockNum() - 1 ? true : false; + this->colCalcount = roundColVal; + this->oneRepeatCalcount = oneCalNum; + this->colNum = realColVal; + this->tailSize = roundColVal - realColVal; + this->lastTaskTail = lastTaskTail; + + int32_t offsetX = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum * realColVal; + int32_t offsetY = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum; + if (theSplitCore <= GetBlockIdx()) { + offsetX = (calTaskPerCore + 1) * theSplitCore * oneCalNum * realColVal + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore * realColVal; + offsetY = (calTaskPerCore + 1) * theSplitCore * oneCalNum + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore; + } + + xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, this->colCalcount); + yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, this->oneRepeatCalcount); + idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, this->oneRepeatCalcount); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T)); + pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t)); + pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T)); + pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T)); + yTmpLocal = yTmp.Get(); + workTmpLocal = workTmp.Get(); + } + __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask) + { + for (int32_t i = 0; i < loopCount; 
i++) { + if (this->isLastCore && i == loopCount - 1) { + innerTask = this->lastTaskTail; + } + yLocal = outQueueY.AllocTensor(); + idxLocal = outQueueIdx.AllocTensor(); + for (int32_t j = 0; j < innerTask; j++) { + CopyIn(i, j); + Compute(j); + } + outQueueY.EnQue(yLocal); + outQueueIdx.EnQue(idxLocal); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t i, int32_t j) + { + xLocal = inQueueX.AllocTensor(); + DataCopy(xLocal, xGm[(i * this->oneRepeatCalcount + j) * this->colNum], this->colCalcount); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t j) + { + xLocal = inQueueX.DeQue(); + ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true); + yLocal.SetValue(j, yTmpLocal.GetValue(0)); + T indexVal = yTmpLocal.GetValue(1); + uint32_t index = 0; + index = *reinterpret_cast(&indexVal); + idxLocal.SetValue(j, index); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t i) + { + yLocal = outQueueY.DeQue(); + idxLocal = outQueueIdx.DeQue(); + DataCopy(yGm[i * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount); + DataCopy(idxGm[i * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount); + outQueueY.FreeTensor(yLocal); + outQueueIdx.FreeTensor(idxLocal); + } + +private: + TPipe pipe; + TQue inQueueX; + TQue outQueueY, outQueueIdx; + + TBuf yTmp, workTmp; + GlobalTensor xGm, yGm; + GlobalTensor idxGm; + + LocalTensor yTmpLocal, workTmpLocal; + LocalTensor xLocal, yLocal; + LocalTensor idxLocal; + + int32_t colCalcount; + int32_t oneRepeatCalcount; + int32_t colNum; + int32_t tailSize; + int32_t lastTaskTail; + bool isLastCore; +}; + +extern "C" __global__ __aicore__ void reduce_max_custom(GM_ADDR x, GM_ADDR y, GM_ADDR idx, GM_ADDR workspace, + GM_ADDR tiling) +{ + GET_TILING_DATA(tilingData, tiling); + KernelReduceMax op; + op.Init(x, y, idx, tilingData.coreNum, tilingData.realColVal, tilingData.oneCalNum, + tilingData.calTaskPerCore, tilingData.theSplitCore, tilingData.roundColVal, 
+ tilingData.lastTaskTail); + int32_t loopCount = tilingData.calTaskPerCore + 1; + if (tilingData.theSplitCore <= GetBlockIdx()) { + loopCount = tilingData.calTaskPerCore; + } + if (TILING_KEY_IS(1)) { + op.Process(loopCount, tilingData.oneCalNum); + } +} diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f20f309c5ac58d213c2c9adb7934943bd29c6e25 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp @@ -0,0 +1,32 @@ +#include + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/aten/ops/op_api/op_api_common.h" + +namespace at_npu { +namespace native { +using torch::autograd::Function; +using torch::autograd::AutogradContext; + +tuple NPUNativeFunctions::npu_reduce_max_custom(const at::Tensor& x, + int64_t reduceDim, + int64_t isKeepDim) { + at::IntArrayRef dim = reduceDim; + bool keepDimBool = true ? 
isKeepDim : false; + at::Tensor y = OpPreparation::ApplyTensorWithoutFormat(x, reduce_ops_npu_output_size(x, dim, true)); + at::Tensor idx = NPUNativeFunctions::npu_dtype_cast(y, at::kInt); + int64_t lastDimVal = x.sizes().size() - 1; + at::Tensor xTrans = x; + for (int64_t i = reduceDim; i < lastDimVal; i++) { + xTrans = xTrans.transpose(i, i + 1).contiguous(); + } + EXEC_NPU_CMD(aclnnReduceMaxCustom, xTrans, lastDimVal, isKeepDim, y, idx); + auto outputShape = reduce_ops_npu_output_size(x, dim, keepDimBool); + y = y.reshape(outputShape); + idx = idx.reshape(outputShape); + return tuple(y, idx); +} +} // namespace native +} // namespace at_npu diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml index 1ad90dbcf92109350e13c240c91cd478d6ef9f95..ac6766375bcb24413ec3018ad55d645a73370e11 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml @@ -1 +1,2 @@ -- func: npu_add_custom(Tensor x, Tensor y) -> Tensor \ No newline at end of file +- func: npu_add_custom(Tensor x, Tensor y) -> Tensor +- func: npu_reduce_max_custom(Tensor x, int reduceDim, int isKeepDim=1) -> (Tensor, Tensor) diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md index 104a863eff330f97b54f6639139a5bdc591a79f2..846beba3407d9b8ffdf10ff8b4eab73bbbd2151c 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md @@ -1,5 +1,2 @@ -# acl samples -bash run.sh 
${is_dynamic}(0/1) ${replay_mode}(/batch/iterator) - -# run static op (depend on chip version) -bash run.sh 0 +# run pytorch samples +bash run.sh diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh index e2da06738b483b8eddd875f9186e9c5ea2f0e078..97d52d2a455f03f94ae330f90c146b8dc06459bf 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh @@ -10,9 +10,7 @@ CURRENT_DIR=$( ); cd $CURRENT_DIR # 导出环境变量 -IS_DYNAMIC=$1 -REPLAY_MODE=$2 -PYTORCH_VERSION=1.11.0 +PYTORCH_VERSION=5.0.rc2-pytorch1.11.0 PTA_DIR=pytorch-v${PYTORCH_VERSION} if [ ! $ASCEND_HOME_DIR ]; then @@ -58,19 +56,52 @@ print(get_soc_version()) SOC_SHORT_VERSION=`echo $SOC_VERSION_CONCAT | cut -d ',' -f 2` } -function main() { - if [[ ${IS_DYNAMIC}"x" = "x" ]]; then - echo "ERROR: IS_DYNAMIC is invalid!" 
+# 校验summary文件夹 +function check_summary() { + summary_filename_list=( + api_statistic_0_1 + op_statistic_0_1 + op_summary_0_1 + prof_rule_0 + task_time_0_1 + ) + files=$(ls ./device_*/summary/) + if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_filename_list[@]} ];then + for filename in ${summary_filename_list[@]}; do + if [[ $files != *$filename* ]];then + echo "ERROR: summary files not exist" + return 1 + fi + done + echo "INFO: All summary result exist" + else + echo "ERROR: check summary result fail" return 1 fi +} - if [[ ${REPLAY_MODE}"x" = "x" || ${REPLAY_MODE} = "batch" || ${REPLAY_MODE} = "iterator" ]]; then - echo "INFO: REPLAY_MODE valid : ${REPLAY_MODE}" +# 校验timeline文件夹 +function check_timeline() { + timeline_filename_list=( + msprof_0_1 + task_time_0_1 + ) + files=$(ls ./device_*/timeline/) + if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_filename_list[@]} ];then + for filename in ${timeline_filename_list[@]}; do + if [[ $files != *$filename* ]];then + echo "ERROR: timeline files not exist" + return 1 + fi + done + echo "INFO: All timeline result exist" else - echo "ERROR: REPLAY_MODE is invalid!" + echo "ERROR: check timeline result fail" return 1 fi +} +function main() { # 清除遗留生成文件和日志文件 rm -rf $HOME/ascend/log/* rm -rf $ASCEND_OPP_PATH/vendors/* @@ -79,7 +110,7 @@ function main() { # 生成自定义算子工程样例 JSON_NAME=add_custom CAMEL_JSON_NAME=`echo $JSON_NAME | sed -r 's/(^|-|_)(\w)/\U\2/g'` - msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_SHORT_VERSION} -lan cpp -out ./custom_op + msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_FULL_VERSION} -lan cpp -out ./custom_op if [ $? -ne 0 ]; then echo "ERROR: msopgen custom op sample failed!" return 1 fi @@ -91,13 +122,6 @@ function main() { echo "ERROR: copy custom op files failed!"
return 1 fi - if [[ $IS_DYNAMIC != 1 ]]; then - if [[ $REPLAY_MODE = "batch" ]]; then - sed -i "s/set(BATCH_MODE_REPLAY_LIST/set(BATCH_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(BATCH_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt` - elif [[ $REPLAY_MODE = "iterator" ]]; then - sed -i "s/set(ITERATOR_MODE_REPLAY_LIST/set(ITERATOR_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(ITERATOR_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt` - fi - fi sed -i "s#/usr/local/Ascend/latest#$ASCEND_HOME_DIR#g" `grep "/usr/local/Ascend/latest" -rl custom_op/CMakePresets.json` # 构建自定义算子包并安装 @@ -109,17 +133,22 @@ function main() { echo "INFO: build and install custom op run package success!" # PTA源码仓,可以自行放置zip包 - if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then - wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate + if [ ! -d "${PTA_DIR}" ]; then + if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then + wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate + fi + unzip -o -q v${PYTORCH_VERSION}.zip fi - rm -rf ${PTA_DIR}; unzip -o -q v${PYTORCH_VERSION}.zip # PTA自定义算子注册 - FUNCTION_REGISTE_FIELD=`cat pytorch_patch/npu_native_functions.yaml` + FUNCTION_REGISTE_FIELD="pytorch_patch/npu_native_functions.yaml" FUNCTION_REGISTE_FILE="${PTA_DIR}/torch_npu/csrc/aten/npu_native_functions.yaml" - if ! grep -q "\ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE; then - sed -i "/custom:/a \ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE - fi + cat $FUNCTION_REGISTE_FIELD | while read line + do + if ! 
grep -q "\ $line" $FUNCTION_REGISTE_FILE; then + sed -i "/custom:/a \ $line" $FUNCTION_REGISTE_FILE + fi + done # PTA自定义算子适配文件 cp -rf pytorch_patch/*.cpp ${PTA_DIR}/torch_npu/csrc/aten/ops/op_api @@ -136,61 +165,22 @@ function main() { # 解析dump文件为numpy文件 files=$(ls ./prof_total) - cd $CURRENT_DIR/prof_total/$files - msprof --export=on --output=$CURRENT_DIR/prof_total/$files - if [[ $? -eq 0 ]];then - echo "INFO: parse success" - else - echo "ERROR: pasrse failed" - return 1 - fi + for line in $files; + do + cd $CURRENT_DIR/prof_total/$line + msprof --export=on --output=$CURRENT_DIR/prof_total/$line + if [[ $? -eq 0 ]];then + echo "INFO: parse success" + else + echo "ERROR: pasrse failed" + return 1 + fi - # 校验summary文件夹 - summary_list=( - acl_0_1.csv - acl_statistic_0_1.csv - ge_op_execute_0_1.csv - op_statistic_0_1.csv - op_summary_0_1.csv - prof_rule_0.json - runtime_api_0_1.csv - task_time_0_1.csv - ) - if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_list[@]} ];then - for summary in ${summary_list[@]}; do - if [ ! -f $(pwd)/device_0/summary/$summary ];then - echo "ERROR: summary files not exist" - return 1 - fi - done - echo "INFO: All summary result exist" - else - echo "ERROR: check summary result fail" - return 1 - fi + check_summary + check_timeline + done - # 校验timeline文件夹 - timeline_list=( - acl_0_1.json - ge_op_execute_0_1.json - msprof_0_1.json - runtime_api_0_1.json - task_time_0_1.json - thread_group_0_1.json - ) - if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_list[@]} ];then - for timeline in ${timeline_list[@]}; do - if [ ! 
-f $(pwd)/device_0/timeline/$timeline ];then - echo "ERROR: timeline files not exist" - return 1 - fi - done - echo "INFO: timeline files exist" - else - echo "ERROR: timeline files not exist" - return 1 - fi - echo "INFO: Ascend C Add Custom SUCCESS" + echo "INFO: Test Ascend C Custom Op SUCCESS" } check_soc_version diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py index 5f7a07ca4fca7c7f192f1744a58c35d7a3c2db55..ac716c674745fa0059a84b5474bf71ae98ab561a 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py @@ -24,5 +24,25 @@ class TestCustomAdd(TestCase): self.assertRtolEqual(output, x + y) +class TestReduceMaxCustom(TestCase): + def test_reduce_max_custom(self): + x_shape = [8, 255, 64, 13] + reduce_dim = 2 + assert x_shape[reduce_dim] % 16 == 0 + is_keep_dim = 1 + keep_dim = True if is_keep_dim else False + x = torch.rand(x_shape, dtype=torch.float16) + + prof_path = "./prof_total" + with torch.npu.profile(prof_path) as prof: + torch.npu.synchronize() + npu_out = torch_npu.npu_reduce_max_custom(x.npu(), reduce_dim, is_keep_dim) + torch.npu.synchronize() + cpu_out = torch.max(x, reduce_dim, keep_dim) + + assert torch.allclose(npu_out[0].cpu(), cpu_out[0], rtol=1e-3, atol=1e-3) + assert torch.allclose(npu_out[1].cpu().long(), cpu_out[1], rtol=1e-3, atol=1e-3) + + if __name__ == "__main__": run_tests()