diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8caa766c80fdeb4ba5df1e4b3b89fc98d7ee7e80 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#include "reduce_max_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +constexpr int32_t ELE_NUM_PER_BANK = 16; // half +constexpr int32_t BATCH_PER_CORE = 32; + +namespace optiling { +static ge::graphStatus TilingFunc(gert::TilingContext* context) +{ + ReduceMaxTilingData tiling; + const gert::StorageShape* xShapePtr = context->GetInputShape(0); + const gert::Shape& xShape = xShapePtr->GetStorageShape(); + const gert::RuntimeAttrs* attrs = context->GetAttrs(); + const uint32_t* reduceDim = attrs->GetAttrPointer(0); + const uint32_t xEleNum = context->GetInputTensor(0)->GetShapeSize(); + const uint32_t colNum = xShape.GetDim(*reduceDim); + const uint32_t rowNum = xEleNum / colNum; + + auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + const uint32_t actCoreNum = ascendcPlatform.GetCoreNumAiv(); + uint32_t oneRepeatCalcount = BATCH_PER_CORE; + uint32_t allTasks = (rowNum + oneRepeatCalcount - 1) / oneRepeatCalcount; + uint32_t usedCoreNum = allTasks; + if (usedCoreNum > actCoreNum) { + usedCoreNum = actCoreNum; + } + uint32_t calTaskPerCore = allTasks / usedCoreNum; + uint32_t theSplitCore = allTasks % usedCoreNum; + uint32_t lastTaskTail = rowNum % oneRepeatCalcount; + if (lastTaskTail == 0) { + lastTaskTail = oneRepeatCalcount; + } + uint32_t colCalcount = (colNum + ELE_NUM_PER_BANK - 1) / 
ELE_NUM_PER_BANK * ELE_NUM_PER_BANK; + + tiling.set_coreNum(usedCoreNum); + tiling.set_realColVal(colNum); + tiling.set_oneCalNum(oneRepeatCalcount); + tiling.set_calTaskPerCore(calTaskPerCore); + tiling.set_theSplitCore(theSplitCore); + tiling.set_roundColVal(colCalcount); + tiling.set_lastTaskTail(lastTaskTail); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + context->SetTilingKey(1); + context->SetBlockDim(usedCoreNum); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* xShape = context->GetInputShape(0); + const gert::RuntimeAttrs* attrs = context->GetAttrs(); + const uint32_t* reduceDim = attrs->GetAttrPointer(0); + const uint32_t* isKeepDim = attrs->GetAttrPointer(1); + gert::Shape* yShape = context->GetOutputShape(0); + gert::Shape* idxShape = context->GetOutputShape(1); + + for (size_t i = 0; i < xShape->GetDimNum(); i++) { + if (i == *reduceDim) { + if (*isKeepDim) { + yShape->AppendDim(1); + } else { + continue; + } + } else { + yShape->AppendDim(xShape->GetDim(i)); + } + } + *idxShape = *yShape; + + return GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class ReduceMaxCustom : public OpDef { +public: + explicit ReduceMaxCustom(const char* name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT16 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Output("y") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT16 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Output("idx") + .ParamType(REQUIRED) + .DataType({ ge::DT_INT32 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + + this->Attr("reduceDim") + 
.AttrType(REQUIRED) + .Int(); + this->Attr("isKeepDim") + .AttrType(OPTIONAL) + .Int(1); + + this->SetInferShape(ge::InferShape); + + this->AICore() + .SetTiling(optiling::TilingFunc); + + this->AICore().AddConfig("ascend910"); + } +}; + +OP_ADD(ReduceMaxCustom); +} // namespace ops diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..05f0f88708cca3b971c118e56badfd368bc3a140 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_host/reduce_max_custom_tiling.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#ifndef REDUCE_MAX_CUSTOM_TILING_H +#define REDUCE_MAX_CUSTOM_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(ReduceMaxTilingData) + TILING_DATA_FIELD_DEF(uint32_t, coreNum); + TILING_DATA_FIELD_DEF(uint32_t, realColVal); + TILING_DATA_FIELD_DEF(uint32_t, oneCalNum); + TILING_DATA_FIELD_DEF(uint32_t, calTaskPerCore); + TILING_DATA_FIELD_DEF(uint32_t, theSplitCore); + TILING_DATA_FIELD_DEF(uint32_t, roundColVal); + TILING_DATA_FIELD_DEF(uint32_t, lastTaskTail); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(ReduceMaxCustom, ReduceMaxTilingData) +} +#endif // REDUCE_MAX_CUSTOM_TILING_H diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..05d53edbe694bbede84223ffe1d70b0b9bb03236 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/acl_invocation/op_dev/op_kernel/reduce_max_custom.cpp 
@@ -0,0 +1,127 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + */ +#include "kernel_operator.h" +using namespace AscendC; + +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelReduceMax { +public: + __aicore__ inline KernelReduceMax() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR idx, uint32_t coreNum, uint32_t realColVal, + uint32_t oneCalNum, uint32_t calTaskPerCore, uint32_t theSplitCore, + uint32_t roundColVal, uint32_t lastTaskTail) + { + this->isLastCore = GetBlockIdx() == GetBlockNum() - 1 ? true : false; + this->colCalcount = roundColVal; + this->oneRepeatCalcount = oneCalNum; + this->colNum = realColVal; + this->tailSize = roundColVal - realColVal; + this->lastTaskTail = lastTaskTail; + + int32_t offsetX = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum * realColVal; + int32_t offsetY = (calTaskPerCore + 1) * GetBlockIdx() * oneCalNum; + if (theSplitCore <= GetBlockIdx()) { + offsetX = (calTaskPerCore + 1) * theSplitCore * oneCalNum * realColVal + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore * realColVal; + offsetY = (calTaskPerCore + 1) * theSplitCore * oneCalNum + + (GetBlockIdx() - theSplitCore) * oneCalNum * calTaskPerCore; + } + + xGm.SetGlobalBuffer((__gm__ T*)x + offsetX, this->colCalcount); + yGm.SetGlobalBuffer((__gm__ T*)y + offsetY, this->oneRepeatCalcount); + idxGm.SetGlobalBuffer((__gm__ uint32_t*)idx + offsetY, this->oneRepeatCalcount); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->colCalcount * sizeof(T)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->oneRepeatCalcount * sizeof(T)); + pipe.InitBuffer(outQueueIdx, BUFFER_NUM, this->oneRepeatCalcount * sizeof(uint32_t)); + pipe.InitBuffer(yTmp, this->colCalcount * sizeof(T)); + pipe.InitBuffer(workTmp, this->colCalcount * sizeof(T)); + yTmpLocal = yTmp.Get(); + workTmpLocal = workTmp.Get(); + } + __aicore__ inline void Process(int32_t loopCount, uint32_t innerTask) + { + for (int32_t i = 0; i < loopCount; 
i++) { + if (this->isLastCore && i == loopCount - 1) { + innerTask = this->lastTaskTail; + } + yLocal = outQueueY.AllocTensor(); + idxLocal = outQueueIdx.AllocTensor(); + for (int32_t j = 0; j < innerTask; j++) { + CopyIn(i, j); + Compute(j); + } + outQueueY.EnQue(yLocal); + outQueueIdx.EnQue(idxLocal); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t i, int32_t j) + { + xLocal = inQueueX.AllocTensor(); + DataCopy(xLocal, xGm[(i * this->oneRepeatCalcount + j) * this->colNum], this->colCalcount); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t j) + { + xLocal = inQueueX.DeQue(); + ReduceMax(yTmpLocal, xLocal, workTmpLocal, this->colCalcount, true); + yLocal.SetValue(j, yTmpLocal.GetValue(0)); + T indexVal = yTmpLocal.GetValue(1); + uint32_t index = 0; + index = *reinterpret_cast(&indexVal); + idxLocal.SetValue(j, index); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t i) + { + yLocal = outQueueY.DeQue(); + idxLocal = outQueueIdx.DeQue(); + DataCopy(yGm[i * this->oneRepeatCalcount], yLocal, this->oneRepeatCalcount); + DataCopy(idxGm[i * this->oneRepeatCalcount], idxLocal, this->oneRepeatCalcount); + outQueueY.FreeTensor(yLocal); + outQueueIdx.FreeTensor(idxLocal); + } + +private: + TPipe pipe; + TQue inQueueX; + TQue outQueueY, outQueueIdx; + + TBuf yTmp, workTmp; + GlobalTensor xGm, yGm; + GlobalTensor idxGm; + + LocalTensor yTmpLocal, workTmpLocal; + LocalTensor xLocal, yLocal; + LocalTensor idxLocal; + + int32_t colCalcount; + int32_t oneRepeatCalcount; + int32_t colNum; + int32_t tailSize; + int32_t lastTaskTail; + bool isLastCore; +}; + +extern "C" __global__ __aicore__ void reduce_max_custom(GM_ADDR x, GM_ADDR y, GM_ADDR idx, GM_ADDR workspace, + GM_ADDR tiling) +{ + GET_TILING_DATA(tilingData, tiling); + KernelReduceMax op; + op.Init(x, y, idx, tilingData.coreNum, tilingData.realColVal, tilingData.oneCalNum, + tilingData.calTaskPerCore, tilingData.theSplitCore, tilingData.roundColVal, 
+ tilingData.lastTaskTail); + int32_t loopCount = tilingData.calTaskPerCore + 1; + if (tilingData.theSplitCore <= GetBlockIdx()) { + loopCount = tilingData.calTaskPerCore; + } + if (TILING_KEY_IS(1)) { + op.Process(loopCount, tilingData.oneCalNum); + } +} diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f20f309c5ac58d213c2c9adb7934943bd29c6e25 --- /dev/null +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/ReduceMaxCustomKernelNpu.cpp @@ -0,0 +1,32 @@ +#include + +#include "torch_npu/csrc/framework/utils/OpAdapter.h" +#include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" +#include "torch_npu/csrc/aten/NPUNativeFunctions.h" +#include "torch_npu/csrc/aten/ops/op_api/op_api_common.h" + +namespace at_npu { +namespace native { +using torch::autograd::Function; +using torch::autograd::AutogradContext; + +tuple NPUNativeFunctions::npu_reduce_max_custom(const at::Tensor& x, + int64_t reduceDim, + int64_t isKeepDim) { + at::IntArrayRef dim = reduceDim; + bool keepDimBool = true ? 
isKeepDim : false; + at::Tensor y = OpPreparation::ApplyTensorWithoutFormat(x, reduce_ops_npu_output_size(x, dim, true)); + at::Tensor idx = NPUNativeFunctions::npu_dtype_cast(y, at::kInt); + int64_t lastDimVal = x.sizes().size() - 1; + at::Tensor xTrans = x; + for (int64_t i = reduceDim; i < lastDimVal; i++) { + xTrans = xTrans.transpose(i, i + 1).contiguous(); + } + EXEC_NPU_CMD(aclnnReduceMaxCustom, xTrans, lastDimVal, isKeepDim, y, idx); + auto outputShape = reduce_ops_npu_output_size(x, dim, keepDimBool); + y = y.reshape(outputShape); + idx = idx.reshape(outputShape); + return tuple(y, idx); +} +} // namespace native +} // namespace at_npu diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml index 1ad90dbcf92109350e13c240c91cd478d6ef9f95..ac6766375bcb24413ec3018ad55d645a73370e11 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/pytorch_patch/npu_native_functions.yaml @@ -1 +1,2 @@ -- func: npu_add_custom(Tensor x, Tensor y) -> Tensor \ No newline at end of file +- func: npu_add_custom(Tensor x, Tensor y) -> Tensor +- func: npu_reduce_max_custom(Tensor x, int reduceDim, int isKeepDim=1) -> (Tensor, Tensor) diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md index 104a863eff330f97b54f6639139a5bdc591a79f2..846beba3407d9b8ffdf10ff8b4eab73bbbd2151c 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/readme.md @@ -1,5 +1,2 @@ -# acl samples -bash run.sh 
${is_dynamic}(0/1) ${replay_mode}(/batch/iterator) - -# run static op (depend on chip version) -bash run.sh 0 +# run pytorch samples +bash run.sh diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh index e2da06738b483b8eddd875f9186e9c5ea2f0e078..97d52d2a455f03f94ae330f90c146b8dc06459bf 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/run.sh @@ -10,9 +10,7 @@ CURRENT_DIR=$( ); cd $CURRENT_DIR # 导出环境变量 -IS_DYNAMIC=$1 -REPLAY_MODE=$2 -PYTORCH_VERSION=1.11.0 +PYTORCH_VERSION=5.0.rc2-pytorch1.11.0 PTA_DIR=pytorch-v${PYTORCH_VERSION} if [ ! $ASCEND_HOME_DIR ]; then @@ -58,19 +56,52 @@ print(get_soc_version()) SOC_SHORT_VERSION=`echo $SOC_VERSION_CONCAT | cut -d ',' -f 2` } -function main() { - if [[ ${IS_DYNAMIC}"x" = "x" ]]; then - echo "ERROR: IS_DYNAMIC is invalid!" 
+# 校验summary文件夹 +function check_summary() { + summary_filename_list=( + api_statistic_0_1 + op_statistic_0_1 + op_summary_0_1 + prof_rule_0 + task_time_0_1 + ) + files=$(ls ./device_*/summary/) + if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_filename_list[@]} ];then + for filename in ${summary_filename_list[@]}; do + if [[ $files != *$filename* ]];then + echo "ERROR: summary files not exist" + return 1 + fi + done + echo "INFO: All summary result exist" + else + echo "ERROR: check summary result fail" return 1 fi +} - if [[ ${REPLAY_MODE}"x" = "x" || ${REPLAY_MODE} = "batch" || ${REPLAY_MODE} = "iterator" ]]; then - echo "INFO: REPLAY_MODE valid : ${REPLAY_MODE}" +# 校验timeline文件夹 +function check_timeline() { + timeline_filename_list=( + msprof_0_1 + task_time_0_1 + ) + files=$(ls ./device_*/timeline/) + if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_filename_list[@]} ];then + for filename in ${timeline_filename_list[@]}; do + if [[ $files != *$filename* ]];then + echo "ERROR: timeline files not exist" + return 1 + fi + done + echo "INFO: All timeline result exist" else - echo "ERROR: REPLAY_MODE is invalid!" + echo "ERROR: check timeline result fail" return 1 fi +} +function main() { # 清除遗留生成文件和日志文件 rm -rf $HOME/ascend/log/* rm -rf $ASCEND_OPP_PATH/vendors/* @@ -79,7 +110,7 @@ function main() { # 生成自定义算子工程样例 JSON_NAME=add_custom CAMEL_JSON_NAME=`echo $JSON_NAME | sed -r 's/(^|-|_)(\w)/\U\2/g'` - msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_SHORT_VERSION} -lan cpp -out ./custom_op + msopgen gen -i op_dev/${JSON_NAME}.json -f tf -c ai_core-${SOC_FULL_VERSION} -lan cpp -out ./custom_op if [ $? -ne 0 ]; then echo "ERROR: msopgen custom op sample failed!" return 1 fi @@ -91,13 +122,6 @@ function main() { echo "ERROR: copy custom op files failed!"
return 1 fi - if [[ $IS_DYNAMIC != 1 ]]; then - if [[ $REPLAY_MODE = "batch" ]]; then - sed -i "s/set(BATCH_MODE_REPLAY_LIST/set(BATCH_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(BATCH_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt` - elif [[ $REPLAY_MODE = "iterator" ]]; then - sed -i "s/set(ITERATOR_MODE_REPLAY_LIST/set(ITERATOR_MODE_REPLAY_LIST ${CAMEL_JSON_NAME}/g" `grep "set(ITERATOR_MODE_REPLAY_LIST" -rl custom_op/op_kernel/CMakeLists.txt` - fi - fi sed -i "s#/usr/local/Ascend/latest#$ASCEND_HOME_DIR#g" `grep "/usr/local/Ascend/latest" -rl custom_op/CMakePresets.json` # 构建自定义算子包并安装 @@ -109,17 +133,22 @@ function main() { echo "INFO: build and install custom op run package success!" # PTA源码仓,可以自行放置zip包 - if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then - wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate + if [ ! -d "${PTA_DIR}" ]; then + if [ ! -f "v${PYTORCH_VERSION}.zip" ]; then + wget https://gitee.com/ascend/pytorch/repository/archive/v${PYTORCH_VERSION}.zip --no-check-certificate + fi + unzip -o -q v${PYTORCH_VERSION}.zip fi - rm -rf ${PTA_DIR}; unzip -o -q v${PYTORCH_VERSION}.zip # PTA自定义算子注册 - FUNCTION_REGISTE_FIELD=`cat pytorch_patch/npu_native_functions.yaml` + FUNCTION_REGISTE_FIELD="pytorch_patch/npu_native_functions.yaml" FUNCTION_REGISTE_FILE="${PTA_DIR}/torch_npu/csrc/aten/npu_native_functions.yaml" - if ! grep -q "\ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE; then - sed -i "/custom:/a \ $FUNCTION_REGISTE_FIELD" $FUNCTION_REGISTE_FILE - fi + cat $FUNCTION_REGISTE_FIELD | while read line + do + if ! 
grep -q "\ $line" $FUNCTION_REGISTE_FILE; then + sed -i "/custom:/a \ $line" $FUNCTION_REGISTE_FILE + fi + done # PTA自定义算子适配文件 cp -rf pytorch_patch/*.cpp ${PTA_DIR}/torch_npu/csrc/aten/ops/op_api @@ -136,61 +165,22 @@ function main() { # 解析dump文件为numpy文件 files=$(ls ./prof_total) - cd $CURRENT_DIR/prof_total/$files - msprof --export=on --output=$CURRENT_DIR/prof_total/$files - if [[ $? -eq 0 ]];then - echo "INFO: parse success" - else - echo "ERROR: pasrse failed" - return 1 - fi + for line in $files; + do + cd $CURRENT_DIR/prof_total/$line + msprof --export=on --output=$CURRENT_DIR/prof_total/$line + if [[ $? -eq 0 ]];then + echo "INFO: parse success" + else + echo "ERROR: pasrse failed" + return 1 + fi - # 校验summary文件夹 - summary_list=( - acl_0_1.csv - acl_statistic_0_1.csv - ge_op_execute_0_1.csv - op_statistic_0_1.csv - op_summary_0_1.csv - prof_rule_0.json - runtime_api_0_1.csv - task_time_0_1.csv - ) - if [ $(ls ./device_*/summary/ | wc -l) -eq ${#summary_list[@]} ];then - for summary in ${summary_list[@]}; do - if [ ! -f $(pwd)/device_0/summary/$summary ];then - echo "ERROR: summary files not exist" - return 1 - fi - done - echo "INFO: All summary result exist" - else - echo "ERROR: check summary result fail" - return 1 - fi + check_summary + check_timeline + done - # 校验timeline文件夹 - timeline_list=( - acl_0_1.json - ge_op_execute_0_1.json - msprof_0_1.json - runtime_api_0_1.json - task_time_0_1.json - thread_group_0_1.json - ) - if [ $(ls ./device_*/timeline/ | wc -l) -eq ${#timeline_list[@]} ];then - for timeline in ${timeline_list[@]}; do - if [ ! 
-f $(pwd)/device_0/timeline/$timeline ];then - echo "ERROR: timeline files not exist" - return 1 - fi - done - echo "INFO: timeline files exist" - else - echo "ERROR: timeline files not exist" - return 1 - fi - echo "INFO: Ascend C Add Custom SUCCESS" + echo "INFO: Test Ascend C Custom Op SUCCESS" } check_soc_version diff --git a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py index 5f7a07ca4fca7c7f192f1744a58c35d7a3c2db55..ac716c674745fa0059a84b5474bf71ae98ab561a 100644 --- a/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py +++ b/cplusplus/level1_single_api/4_op_dev/6_ascendc_custom_op/pytorch_invocation/test_ops_custom.py @@ -24,5 +24,25 @@ class TestCustomAdd(TestCase): self.assertRtolEqual(output, x + y) +class TestReduceMaxCustom(TestCase): + def test_reduce_max_custom(self): + x_shape = [8, 255, 64, 13] + reduce_dim = 2 + assert x_shape[reduce_dim] % 16 == 0 + is_keep_dim = 1 + keep_dim = True if is_keep_dim else False + x = torch.rand(x_shape, dtype=torch.float16) + + prof_path = "./prof_total" + with torch.npu.profile(prof_path) as prof: + torch.npu.synchronize() + npu_out = torch_npu.npu_reduce_max_custom(x.npu(), reduce_dim, is_keep_dim) + torch.npu.synchronize() + cpu_out = torch.max(x, reduce_dim, keep_dim) + + assert torch.allclose(npu_out[0].cpu(), cpu_out[0], rtol=1e-3, atol=1e-3) + assert torch.allclose(npu_out[1].cpu().long(), cpu_out[1], rtol=1e-3, atol=1e-3) + + if __name__ == "__main__": run_tests()