From 5aec59e330e4f28e9489fc36889c1d175e22c4c1 Mon Sep 17 00:00:00 2001
From: zgx950813
Date: Mon, 9 Sep 2024 17:16:03 +0800
Subject: [PATCH] add testcase for simplified aclnn invocation

---
 .../AclNNInvocationNaive/CMakeLists.txt       |  62 +++++++
 .../AclNNInvocationNaive/README.md            |  60 +++++++
 .../AclNNInvocationNaive/main.cpp             | 167 ++++++++++++++++++
 .../AclNNInvocationNaive/run.sh               |  82 +++++++++
 .../FrameworkLaunchOpGen/AddCustom.json       |  40 +++++
 .../AddCustom/op_host/add_custom.cpp          |  76 ++++++++
 .../AddCustom/op_host/add_custom_tiling.h     |  22 +++
 .../AddCustom/op_kernel/add_custom.cpp        |  93 ++++++++++
 .../FrameworkLaunchOpGen/README.md            |   6 +
 .../FrameworkLaunchOpGen/install.sh           |  51 ++++++
 10 files changed, 659 insertions(+)
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/CMakeLists.txt
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/README.md
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/main.cpp
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/run.sh
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom.json
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom.cpp
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom_tiling.h
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_kernel/add_custom.cpp
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/README.md
 create mode 100644 operator/AddCustomSample/FrameworkLaunchOpGen/install.sh

diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/CMakeLists.txt b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/CMakeLists.txt
new file mode 100644
index 000000000..7457b7c3e
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/CMakeLists.txt
@@ -0,0 +1,62 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved.
+
+# CMake lowest version requirement
+cmake_minimum_required(VERSION 3.5.1)
+
+# project information
+project(acl_execute_add)
+
+# Compile options
+add_compile_options(-std=c++11)
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "./")
+
+set(INC_PATH $ENV{DDK_PATH})
+
+if (NOT DEFINED ENV{DDK_PATH})
+    set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest")
+    message(STATUS "set default INC_PATH: ${INC_PATH}")
+else ()
+    message(STATUS "env INC_PATH: ${INC_PATH}")
+endif()
+
+set(CUST_PKG_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../CustomOp/build_out/op_api")
+
+set(LIB_PATH $ENV{NPU_HOST_LIB})
+
+# Dynamic libraries in the stub directory can only be used for compilation
+if (NOT DEFINED ENV{NPU_HOST_LIB})
+    set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/")
+    set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/")
+    message(STATUS "set default LIB_PATH: ${LIB_PATH}")
+else ()
+    message(STATUS "env LIB_PATH: ${LIB_PATH}")
+endif()
+
+# Header path
+include_directories(
+    ${INC_PATH}/runtime/include
+    ${INC_PATH}/atc/include
+    ${CUST_PKG_PATH}/include
+)
+
+# add host lib path
+link_directories(
+    ${LIB_PATH}
+    ${LIB_PATH1}
+    ${CUST_PKG_PATH}/lib
+)
+
+add_executable(execute_add_op
+    main.cpp
+)
+
+target_link_libraries(execute_add_op
+    ascendcl
+    cust_opapi
+    acl_op_compiler
+    nnopbase
+    stdc++
+)
+
+install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/README.md b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/README.md
new file mode 100644
index 000000000..75c02cebe
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/README.md
@@ -0,0 +1,60 @@
+## Overview
+Compared with the AclNNInvocation sample project, this sample simplifies the project configuration.
+## Directory Structure
+```
+├── AclNNInvocationNaive
+│   ├── CMakeLists.txt      // Build configuration file
+│   ├── main.cpp            // Entry point of the single-operator invocation application
+│   └── run.sh              // Script that builds and runs the operator
+```
+## Code Walkthrough
+After the custom operator has been developed and deployed, its function can be verified through single-operator invocation. main.cpp demonstrates the single-operator API execution mode: the operator is executed through C APIs, without providing a single-operator description file or converting an offline model; the single-operator API is called directly.
+
+After the custom operator is built and deployed, its single-operator API is generated automatically and can be called directly from an application. The operator API generally takes the form of a "two-stage interface":
+```cpp
+// Query the workspace size required by the operator
+aclnnStatus aclnnAddCustomGetWorkspaceSize(const aclTensor *x, const aclTensor *y, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor **executor);
+// Execute the operator
+aclnnStatus aclnnAddCustom(void *workspace, uint64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream);
+```
+aclnnAddCustomGetWorkspaceSize is the first-stage interface; it computes how much workspace memory the API call requires. After obtaining the workspace size, allocate device memory of workspaceSize bytes and then call the second-stage interface aclnnAddCustom to perform the computation. For details, see the "Single-operator API execution" section in [AscendCL Single-Operator Invocation](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp).
+## Running the Sample
+### 1. Build the operator project
+Before running this sample, complete the preparation described in [Build the operator project](../README.md#operatorcompile).
+### 2. Run the aclnn invocation sample
+
+  - Enter the sample directory
+    Download the sample code from the command line, taking the master branch as an example.
+    ```bash
+    cd ${git_clone_path}/samples/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive
+    ```
+  - Modify the sample build configuration
+
+    In CMakeLists.txt, replace "/usr/local/Ascend/ascend-toolkit/latest" with the actual installation path of the CANN package,
+    e.g. /home/HwHiAiUser/Ascend/ascend-toolkit/latest
+
+  - Configure environment variables
+
+    The NPU_HOST_LIB environment variable must be set; taking x86_64 as an example:
+    ```bash
+    export NPU_HOST_LIB=/home/HwHiAiUser/Ascend/ascend-toolkit/latest/x86_64-linux/lib64
+    ```
+  - Run the sample
+
+    The sample generates its test data automatically, then builds and runs the aclnn sample, and finally prints the result.
+    ```bash
+    mkdir -p build
+    cd build
+    cmake .. && make
+    ./execute_add_op
+    ```
+
+    Alternatively, the run.sh script can be used to build and run the sample (pass -r sim together with -v <SocVersion> for a simulator run).
+    ```bash
+    bash run.sh -r npu
+    ```
+
+## Change History
+| Date | Change |
+| ---------- | ------------ |
+| 2024/05/22 | Added this README |
\ No newline at end of file
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/main.cpp b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/main.cpp
new file mode 100644
index 000000000..4d7d91543
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/main.cpp
@@ -0,0 +1,167 @@
+/**
+ * @file main.cpp
+ *
+ * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+
+#include "acl/acl.h"
+#include "aclnn_add_custom.h"
+
+#define SUCCESS 0
+#define FAILED 1
+
+#define CHECK_RET(cond, return_expr) \
+    do {                             \
+        if (!(cond)) {               \
+            return_expr;             \
+        }                            \
+    } while (0)
+
+#define LOG_PRINT(message, ...)         \
+    do {                                \
+        printf(message, ##__VA_ARGS__); \
+    } while (0)
+
+int64_t GetShapeSize(const std::vector<int64_t> &shape)
+{
+    int64_t shapeSize = 1;
+    for (auto i : shape) {
+        shapeSize *= i;
+    }
+    return shapeSize;
+}
+
+int Init(int32_t deviceId, aclrtStream *stream)
+{
+    // Fixed pattern: ACL initialization
+    auto ret = aclInit(nullptr);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return FAILED);
+    ret = aclrtSetDevice(deviceId);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return FAILED);
+    ret = aclrtCreateStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return FAILED);
+
+    return SUCCESS;
+}
+
+template <typename T>
+int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                    aclDataType dataType, aclTensor **tensor)
+{
+    auto size = GetShapeSize(shape) * sizeof(T);
+    // Allocate device memory with aclrtMalloc
+    auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return FAILED);
+
+    // Copy the host data to the device memory with aclrtMemcpy
+    ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return FAILED);
+
+    // Create an aclTensor with aclCreateTensor
+    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, nullptr, 0, aclFormat::ACL_FORMAT_ND, shape.data(),
+                              shape.size(), *deviceAddr);
+    return SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+    // 1. (Fixed pattern) Initialize the device and stream; see the ACL API reference
+    // Set deviceId according to the actual device in use
+    int32_t deviceId = 0;
+    aclrtStream stream;
+    auto ret = Init(deviceId, &stream);
+    CHECK_RET(ret == 0, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return FAILED);
+
+    // 2. Construct the inputs and outputs; adapt this to the specific operator API
+    std::vector<int64_t> inputXShape = {8, 2048};
+    std::vector<int64_t> inputYShape = {8, 2048};
+    std::vector<int64_t> outputZShape = {8, 2048};
+    void *inputXDeviceAddr = nullptr;
+    void *inputYDeviceAddr = nullptr;
+    void *outputZDeviceAddr = nullptr;
+    aclTensor *inputX = nullptr;
+    aclTensor *inputY = nullptr;
+    aclTensor *outputZ = nullptr;
+    std::vector<aclFloat16> inputXHostData(inputXShape[0] * inputXShape[1]);
+    std::vector<aclFloat16> inputYHostData(inputYShape[0] * inputYShape[1]);
+    std::vector<aclFloat16> outputZHostData(outputZShape[0] * outputZShape[1]);
+    for (int i = 0; i < inputXShape[0] * inputXShape[1]; ++i) {
+        inputXHostData[i] = aclFloatToFloat16(1.0);
+        inputYHostData[i] = aclFloatToFloat16(2.0);
+        outputZHostData[i] = aclFloatToFloat16(0.0);
+    }
+    // Create the inputX aclTensor
+    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT16, &inputX);
+    CHECK_RET(ret == ACL_SUCCESS, return FAILED);
+    // Create the inputY aclTensor
+    ret = CreateAclTensor(inputYHostData, inputYShape, &inputYDeviceAddr, aclDataType::ACL_FLOAT16, &inputY);
+    CHECK_RET(ret == ACL_SUCCESS, return FAILED);
+    // Create the outputZ aclTensor
+    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT16, &outputZ);
+    CHECK_RET(ret == ACL_SUCCESS, return FAILED);
+
+    // 3. Call the API of the custom operator library
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor;
+    // Query the workspace size and allocate device memory for it
+    ret = aclnnAddCustomGetWorkspaceSize(inputX, inputY, outputZ, &workspaceSize, &executor);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddCustomGetWorkspaceSize failed. ERROR: %d\n", ret); return FAILED);
+    void *workspaceAddr = nullptr;
+    if (workspaceSize > 0) {
+        ret = aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
+        CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return FAILED);
+    }
+    // Execute the operator
+    ret = aclnnAddCustom(workspaceAddr, workspaceSize, executor, stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclnnAddCustom failed. ERROR: %d\n", ret); return FAILED);
+
+    // 4. (Fixed pattern) Synchronize and wait for the task to finish
+    ret = aclrtSynchronizeStream(stream);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return FAILED);
+
+    // 5. Copy the result from the device memory back to the host; adapt this to the specific API
+    auto size = GetShapeSize(outputZShape);
+    std::vector<aclFloat16> resultData(size, 0);
+    ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr,
+                      size * sizeof(aclFloat16), ACL_MEMCPY_DEVICE_TO_HOST);
+    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return FAILED);
+
+    // 6. Destroy the aclTensor objects; adapt this to the specific API
+    aclDestroyTensor(inputX);
+    aclDestroyTensor(inputY);
+    aclDestroyTensor(outputZ);
+
+    // 7. Release device resources; adapt this to the specific API
+    aclrtFree(inputXDeviceAddr);
+    aclrtFree(inputYDeviceAddr);
+    aclrtFree(outputZDeviceAddr);
+    if (workspaceSize > 0) {
+        aclrtFree(workspaceAddr);
+    }
+    aclrtDestroyStream(stream);
+    aclrtResetDevice(deviceId);
+    aclFinalize();
+
+    // Print and verify the output
+    std::vector<aclFloat16> goldenData(size, aclFloatToFloat16(3.0));
+
+    LOG_PRINT("result is:\n");
+    for (int64_t i = 0; i < 10; i++) {
+        LOG_PRINT("%.1f ", aclFloat16ToFloat(resultData[i]));
+    }
+    LOG_PRINT("\n");
+    if (std::equal(resultData.begin(), resultData.end(), goldenData.begin())) {
+        LOG_PRINT("test pass\n");
+    } else {
+        LOG_PRINT("test failed\n");
+    }
+    return SUCCESS;
+}
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/run.sh b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/run.sh
new file mode 100644
index 000000000..0cc50f4ee
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AclNNInvocationNaive/run.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+SHORT=r:,v:,i:,
+LONG=run-mode:,soc-version:,install-path:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+
+while :; do
+    case "$1" in
+    -r | --run-mode)
+        RUN_MODE="$2"
+        shift 2
+        ;;
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+RUN_MODE_LIST="sim npu"
+if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then
+    echo "ERROR: RUN_MODE error, this sample only supports sim or npu!"
+    exit -1
+fi
+
+if [ "${RUN_MODE}" = "npu" ] && [ "$SOC_VERSION" ]; then
+    echo "ERROR: cannot specify SOC_VERSION when running on npu!"
+    exit -1
+fi
+
+VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [ "${RUN_MODE}" = "sim" ] && [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash
+export DDK_PATH=${_ASCEND_INSTALL_PATH}
+export NPU_HOST_LIB=${_ASCEND_INSTALL_PATH}/lib64
+
+export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH}
+export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH}
+if [ "${RUN_MODE}" = "sim" ]; then
+    export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+fi
+
+set -e
+rm -rf build
+mkdir -p build
+cmake -B build
+cmake --build build -j
+(
+    cd build
+    if [ "${RUN_MODE}" = "npu" ]; then
+        msprof op --application=./execute_add_op
+    elif [ "${RUN_MODE}" = "sim" ]; then
+        msprof op simulator --application=./execute_add_op
+    fi
+)
\ No newline at end of file
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom.json b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom.json
new file mode 100644
index 000000000..dce1ed85f
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom.json
@@ -0,0 +1,40 @@
+[
+    {
+        "op": "AddCustom",
+        "language": "cpp",
+        "input_desc": [
+            {
+                "name": "x",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float16"
+                ]
+            },
+            {
+                "name": "y",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float16"
+                ]
+            }
+        ],
+        "output_desc": [
+            {
+                "name": "z",
+                "param_type": "required",
+                "format": [
+                    "ND"
+                ],
+                "type": [
+                    "float16"
+                ]
+            }
+        ]
+    }
+]
\ No newline at end of file
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom.cpp b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom.cpp
new file mode 100644
index 000000000..840494b1a
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom.cpp
@@ -0,0 +1,76 @@
+/**
+ * @file add_custom.cpp
+ *
+ * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+#include "add_custom_tiling.h"
+#include "register/op_def_registry.h"
+
+namespace optiling {
+const uint32_t BLOCK_DIM = 8;
+const uint32_t TILE_NUM = 8;
+static ge::graphStatus TilingFunc(gert::TilingContext *context)
+{
+    TilingData tiling;
+    uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize();
+    context->SetBlockDim(BLOCK_DIM);
+    tiling.set_totalLength(totalLength);
+    tiling.set_tileNum(TILE_NUM);
+    tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
+    context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
+    size_t *currentWorkspace = context->GetWorkspaceSizes(1);
+    currentWorkspace[0] = 0;
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace optiling
+
+namespace ge {
+static graphStatus InferShape(gert::InferShapeContext *context)
+{
+    const gert::Shape *x1_shape = context->GetInputShape(0);
+    gert::Shape *y_shape = context->GetOutputShape(0);
+    *y_shape = *x1_shape;
+    return GRAPH_SUCCESS;
+}
+
+static graphStatus InferDataType(gert::InferDataTypeContext *context)
+{
+    const auto inputDataType = context->GetInputDataType(0);
+    context->SetOutputDataType(0, inputDataType);
+    return ge::GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class AddCustom : public OpDef {
+public:
+    explicit AddCustom(const char *name) : OpDef(name)
+    {
+        this->Input("x")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND});
+        this->Input("y")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND});
+        this->Output("z")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT16})
+            .Format({ge::FORMAT_ND});
+
+        this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType);
+        this->AICore()
+            .SetTiling(optiling::TilingFunc)
+            .AddConfig("ascend910")
+            .AddConfig("ascend310p")
+            .AddConfig("ascend310b")
+            .AddConfig("ascend910b");
+    }
+};
+OP_ADD(AddCustom);
+} // namespace ops
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom_tiling.h b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom_tiling.h
new file mode 100644
index 000000000..323f3076f
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_host/add_custom_tiling.h
@@ -0,0 +1,22 @@
+/**
+ * @file add_custom_tiling.h
+ *
+ * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(TilingData) +TILING_DATA_FIELD_DEF(uint32_t, totalLength); +TILING_DATA_FIELD_DEF(uint32_t, tileNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(AddCustom, TilingData) +} // namespace optiling +#endif // ADD_CUSTOM_TILING_H diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_kernel/add_custom.cpp b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_kernel/add_custom.cpp new file mode 100644 index 000000000..f11dfe861 --- /dev/null +++ b/operator/AddCustomSample/FrameworkLaunchOpGen/AddCustom/op_kernel/add_custom.cpp @@ -0,0 +1,93 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR 
+{
+    GET_TILING_DATA(tiling_data, tiling);
+    KernelAdd op;
+    op.Init(x, y, z, tiling_data.totalLength, tiling_data.tileNum);
+    op.Process();
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+// call of kernel function
+void add_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *z,
+                   uint8_t *workspace, uint8_t *tiling)
+{
+    add_custom<<<blockDim, l2ctrl, stream>>>(x, y, z, workspace, tiling);
+}
+#endif
\ No newline at end of file
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/README.md b/operator/AddCustomSample/FrameworkLaunchOpGen/README.md
new file mode 100644
index 000000000..5b600852d
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/README.md
@@ -0,0 +1,6 @@
+# AddCustom operator sample with an msopgen-generated project
+Supports running the aclnn invocation either on the NPU or on the simulator, for example:
+```bash
+bash install.sh -v Ascend910B1
+(cd AclNNInvocationNaive; bash run.sh -v Ascend910B1 -r sim)
+```
\ No newline at end of file
diff --git a/operator/AddCustomSample/FrameworkLaunchOpGen/install.sh b/operator/AddCustomSample/FrameworkLaunchOpGen/install.sh
new file mode 100644
index 000000000..3225e7101
--- /dev/null
+++ b/operator/AddCustomSample/FrameworkLaunchOpGen/install.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+SHORT=v:,i:,
+LONG=soc-version:,install-path:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+
+while :; do
+    case "$1" in
+    -v | --soc-version)
+        SOC_VERSION="$2"
+        shift 2
+        ;;
+    -i | --install-path)
+        ASCEND_INSTALL_PATH="$2"
+        shift 2
+        ;;
+    --)
+        shift
+        break
+        ;;
+    *)
+        echo "[ERROR] Unexpected option: $1"
+        break
+        ;;
+    esac
+done
+
+VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4"
+if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then
+    echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]"
+    exit -1
+fi
+
+if [ -n "$ASCEND_INSTALL_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH
+elif [ -n "$ASCEND_HOME_PATH" ]; then
+    _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH
+else
+    if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then
+        _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest
+    else
+        _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest
+    fi
+fi
+source $_ASCEND_INSTALL_PATH/bin/setenv.bash
+export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH
+
+rm -rf CustomOp
+msopgen gen -i AddCustom.json -f aclnn -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp
+cp -rf AddCustom/* CustomOp
+(cd CustomOp && bash build.sh)
\ No newline at end of file
--
Gitee
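
main.cpp in this patch accepts the output only if it matches the golden data exactly via std::equal, which is fine here because 1.0 + 2.0 = 3.0 is exactly representable in float16. For operators whose expected values are not exact, a tolerance-based comparison is more robust. The helper below is a minimal sketch and is not part of the patch; it assumes only the aclFloat16 conversion helpers from the ACL headers already used in main.cpp, and the function name VerifyFp16 and the tolerance values are illustrative.

```cpp
// Tolerance-based verification sketch (hypothetical helper, not part of the patch).
#include <cmath>
#include <cstdio>
#include <vector>

#include "acl/acl.h"

// Returns true when every result element is within absTol + relTol * |golden| of the golden value.
inline bool VerifyFp16(const std::vector<aclFloat16> &result, const std::vector<aclFloat16> &golden,
                       float relTol = 1e-3F, float absTol = 1e-3F)
{
    if (result.size() != golden.size()) {
        return false;
    }
    for (size_t i = 0; i < result.size(); ++i) {
        float r = aclFloat16ToFloat(result[i]);
        float g = aclFloat16ToFloat(golden[i]);
        if (std::fabs(r - g) > absTol + relTol * std::fabs(g)) {
            printf("mismatch at index %zu: result=%f golden=%f\n", i, r, g);
            return false;
        }
    }
    return true;
}
```

In main.cpp this would take the place of the std::equal comparison, for example `if (VerifyFp16(resultData, goldenData)) { LOG_PRINT("test pass\n"); }`.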