diff --git a/IndexSelectForRank1Backward/README.md b/IndexSelectForRank1Backward/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e16c330ab6ba92f0a9ad142db7514be7422729df --- /dev/null +++ b/IndexSelectForRank1Backward/README.md @@ -0,0 +1,114 @@ +## 目录结构 +| 目录 | 描述 | +|---------------------|----------------------| +| matmul_leakyrelu_custom.json | MatmulLeakyReluCustom算子的原型定义json文件 | +| [MatmulLeakyReluCustom](./MatmulLeakyReluCustom) | MatmulLeakyReluCustom算子工程 | +| [AclNNInvocation](./AclNNInvocation) | 通过aclnn调用的方式调用MatmulLeakyReluCustom算子工程。 | + +## 编译算子工程部署算子包 + +### 1.获取源码包 + + 可以使用以下两种方式下载,请选择其中一种进行源码准备。 + + - 命令行方式下载(下载时间较长,但步骤简单)。 + + ``` + # 开发环境,非root用户命令行中执行以下命令下载源码仓。 + cd ${HOME} + git clone https://gitee.com/ascend/samples.git + ``` + **注:如果需要切换到其它tag版本,以v0.5.0为例,可执行以下命令。** + ``` + git checkout v0.5.0 + ``` + - 压缩包方式下载(下载时间较短,但步骤稍微复杂)。 + **注:如果需要下载其它版本代码,请先根据前置条件说明进行samples仓分支切换。** + ``` + # 1. samples仓右上角选择 【克隆/下载】 下拉框并选择 【下载ZIP】。 + # 2. 将ZIP包上传到开发环境中的普通用户家目录中,【例如:${HOME}/ascend-samples-master.zip】。 + # 3.
开发环境中,执行以下命令,解压zip包。 + cd ${HOME} + unzip ascend-samples-master.zip + ``` + +### 2.编译算子工程 + + 编译自定义算子工程,构建生成自定义算子包 + + - 执行如下命令,切换到算子工程MatmulLeakyReluCustom目录 + + ``` + cd $HOME/samples/operator/MatMulLeakyReluCustomSample/FrameworkLaunch/MatmulLeakyReluCustom + ``` + + - 修改CMakePresets.json中ASCEND_CANN_PACKAGE_PATH为CANN软件包安装后的实际路径。 + + + ``` + { + …… + "configurePresets": [ + { + …… + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" //请替换为CANN软件包安装后的实际路径。eg:/home/HwHiAiUser/Ascend/ascend-toolkit/latest + }, + …… + } + ] + } + ``` + - 在算子工程MatmulLeakyReluCustom目录下执行如下命令,进行算子工程编译。 + + ``` + ./build.sh + ``` + 编译成功后,会在当前目录下创建build_out目录,并在build_out目录下生成自定义算子安装包custom_opp_<target os>_<target architecture>.run,例如“custom_opp_ubuntu_x86_64.run”。 + + +### 3.部署算子包 + + - 执行如下命令,在自定义算子安装包所在路径下,安装自定义算子包。 + + ``` + cd build_out + ./custom_opp_<target os>_<target architecture>.run + ``` + + 命令执行成功后,自定义算子包中的相关文件将部署至当前环境的OPP算子库的vendors/customize目录中。 + +## 配置环境变量 + + 这里的\$HOME需要替换为CANN包的安装路径。 + ``` + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + ``` + +## 通过aclnn调用的方式调用MatmulLeakyReluCustom算子工程 + +### 样例运行 + + - 进入到样例目录 + + ``` + cd $HOME/samples/operator/MatMulLeakyReluCustomSample/FrameworkLaunch/AclNNInvocation + ``` + + - 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + ``` + bash run.sh + ``` + +## 更新说明 + | 时间 | 更新事项 | +|----|------| +| 2023/11/9 | 新增AclNNInvocation样例 | + + +## 已知issue + + 暂无 \ No newline at end of file diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/common.h b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/common.h new file mode 100644 index 0000000000000000000000000000000000000000..7036813a972219f046e079072fe2e1ed94670e5f --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/common.h @@ -0,0 +1,45 @@ +/** +* @file common.h +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved.
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/op_runner.h b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/op_runner.h new file mode 100644 index 0000000000000000000000000000000000000000..bf2a9ef43d12950dae6f19b98d48f5a6a36ac02f --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/op_runner.h @@ -0,0 +1,182 @@ +/** +* @file op_runner.h +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input 
buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/operator_desc.h b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/operator_desc.h new file 
mode 100644 index 0000000000000000000000000000000000000000..4cbdf07e081232fc3165d1fbc205272ff9ccd2a5 --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/inc/operator_desc.h @@ -0,0 +1,57 @@ +/** +* @file operator_desc.h +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/run.sh b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..eee3116b76e640dc796eb967245fb6048ae9a907 --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/run.sh @@ -0,0 +1,101 @@ +#!/bin/bash +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=0 
+ +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +# 导出环境变量 +SHORT=v:, +LONG=dtype:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + # float16, float, int32 + (-v | --dtype) + DTYPE="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +if [ ! $ASCEND_HOME_DIR ]; then + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + else + export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +#source $ASCEND_HOME_DIR/bin/setenv.bash + +export DDK_PATH=$ASCEND_HOME_DIR +arch=$(uname -m) +export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译acl可执行文件 + cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build + cmake ../src + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + cd $CURRENT_DIR/output + echo "INFO: execute op!" + ./execute_index_select_for_rank1_backward_op + + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + # grad_query.tofile("./output/golden_grad_query.bin") + # grad_key.tofile("./output/golden_grad_key.bin") + # grad_value.tofile("./output/golden_grad_value.bin") + # 5. 
比较真值文件 + cd $CURRENT_DIR + ret=`python3 scripts/verify_result.py output/grad_x.bin output/grad_index.bin output/golden_grad_x.bin output/golden_grad_index.bin ` + echo $ret + if [ "x$ret" == "xtest pass" ]; then + echo "" + echo "#####################################" + echo "INFO: you have passed the Precision!" + echo "#####################################" + echo "" + fi +} + +main diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/gen_data.py b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f9e37b3d35b14bc693c68b10bc811cd7f03e0ac3 --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/gen_data.py @@ -0,0 +1,34 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2022-2023 Huawei Technologies Co., Ltd +import numpy as np +import os +import math + +def index_select_for_rank1_backward(grad, x, index): + grad_x = np.zeros(x.shape).astype(np.float32) + grad_index = np.zeros(index.shape) + for i in range(x.shape[0]): + out = grad[index == i] + out = out.sum() + grad_x[i] = out + return grad_x.astype(np.float32), grad_index + +def gen_golden_data_simple(): + grad = np.ones([128, 211, 211]).astype(np.float32) + x = np.ones([129]).astype(np.float32) + index = np.arange(128)[:, np.newaxis, np.newaxis].repeat(211, axis=1).repeat(211, axis=2) + + grad_x, grad_index = index_select_for_rank1_backward(grad, x, index) + + os.system("mkdir -p input") + os.system("mkdir -p output") + grad.tofile("./input/grad.bin") + x.tofile("./input/x.bin") + index.tofile("./input/index.bin") + + grad_x.tofile("./output/golden_grad_x.bin") + grad_index.tofile("./output/golden_grad_index.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/grad.bin 
b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/grad.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a350b214f6a65eed4833f718904045a1cd933f1 Binary files /dev/null and b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/grad.bin differ diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/index.bin b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/index.bin new file mode 100644 index 0000000000000000000000000000000000000000..102bec0f640226127819de95653bdb1e0404793a Binary files /dev/null and b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/index.bin differ diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/x.bin b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/x.bin new file mode 100644 index 0000000000000000000000000000000000000000..93cdd17701c1e0f955e22f94b6602256537b4f9d Binary files /dev/null and b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/input/x.bin differ diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/output/golden_grad_index.bin b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/output/golden_grad_index.bin new file mode 100644 index 0000000000000000000000000000000000000000..bb5206be57de1c5664795da3e527fcbe5c109a2a Binary files /dev/null and b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/output/golden_grad_index.bin differ diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/output/golden_grad_x.bin b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/output/golden_grad_x.bin new file mode 100644 index 0000000000000000000000000000000000000000..da740800dfb9db521088f0a693f841a16bacbb49 Binary files /dev/null and 
b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/output/golden_grad_x.bin differ diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/verify_result.py b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..99396e11ed8bf8b1c17761b2d0a823063d9367b3 --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/scripts/verify_result.py @@ -0,0 +1,31 @@ +import os +import sys +import numpy as np + +loss = 1e-3 +minimum = 10e-10 + +def verify_result(real_result, golden): + real_result = np.fromfile(real_result, dtype=np.float32) + print("================================") + print(real_result.shape) + golden = np.fromfile(golden, dtype=np.float32) + print("================================") + print(golden.shape) + + print(real_result[:32]) + print(golden[:32]) + result = np.abs(real_result - golden) + deno = np.maximum(np.abs(real_result), np.abs(golden)) + result_atol = np.less_equal(result, loss) + result_rtol = np.less_equal(result / np.add(deno, minimum), loss) + if not result_rtol.all() and not result_atol.all(): + if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss: + print("[ERROR] result error") + return False + print("test pass") + return True + +if __name__ == '__main__': + print("=============================grad x============") + verify_result(sys.argv[1], sys.argv[3]) \ No newline at end of file diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/CMakeLists.txt b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..24016bc57c00639998a0498fa87d341fd74c92d2 --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/CMakeLists.txt @@ -0,0 +1,68 @@ +# 
Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_index_select_for_rank1_backward) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/index_select_for_rank1_backward/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_index_select_for_rank1_backward_op + operator_desc.cpp + op_runner.cpp + main.cpp + op_runner.cpp + common.cpp +) + +target_link_libraries(execute_index_select_for_rank1_backward_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_index_select_for_rank1_backward_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/common.cpp b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/common.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..47a39cd5f190b0472542a2037146d54fc3b5eb0b --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/common.cpp @@ -0,0 +1,79 @@ +/** +* @file common.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" + +#include +#include +#include +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/main.cpp b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..436029328d86ee3d60facc6f0d032f10a2ba58fa --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/main.cpp @@ -0,0 +1,173 @@ +/** +* @file main.cpp +* +* Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "op_runner.h" + +#include "common.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector grad { 128, 211, 211 }; + std::vector x { 129 }; + std::vector index { 128, 211, 211 }; + + std::vector grad_x { 129 }; + std::vector grad_index { 128, 211, 211 }; + + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(ACL_FLOAT, grad.size(), grad.data(), format); + opDesc.AddInputTensorDesc(ACL_FLOAT, x.size(), x.data(), format); + opDesc.AddInputTensorDesc(ACL_INT64, index.size(), index.data(), format); + + opDesc.AddOutputTensorDesc(ACL_FLOAT, grad_x.size(), grad_x.data(), format); + opDesc.AddOutputTensorDesc(ACL_INT64, grad_index.size(), grad_index.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/grad.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/x.bin", fileSize, 
runner.GetInputBuffer(1), runner.GetInputSize(1)); + ReadFile("../input/index.bin", fileSize, runner.GetInputBuffer(2), runner.GetInputSize(2)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + WriteFile("../output/grad_x.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + WriteFile("../output/grad_index.bin", runner.GetOutputBuffer(1), runner.GetOutputSize(1)); + INFO_LOG("Write output success"); + return true; +} + +void DestoryResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destory resource failed"); + } else { + INFO_LOG("Destory resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } + else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. 
deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestoryResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestoryResource(); + return FAILED; + } + + DestoryResource(); + + return SUCCESS; +} diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/op_runner.cpp b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/op_runner.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a53d404a2482cea57ebeeb5f97ffab1871a5d809 --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/op_runner.cpp @@ -0,0 +1,464 @@ +/** +* @file op_runner.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. 
+* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "op_runner.h" +#include "aclnn_index_select_for_rank1_backward.h" +#include +#include +#include +#include "acl/acl_op_compiler.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] 
failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), + nullptr, 0, GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnIndexSelectForRank1BackwardGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], outputTensor_[0], outputTensor_[1], + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnIndexSelectForRank1BackwardGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnIndexSelectForRank1Backward(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnIndexSelectForRank1Backward success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + auto beforeTime = std::chrono::steady_clock::now(); + for (int i = 0; i<100; i++) { + ret = aclnnIndexSelectForRank1BackwardGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], outputTensor_[0], outputTensor_[1], + &workspaceSize, &handle); + ret = aclnnIndexSelectForRank1Backward(workspace, workspaceSize, handle, stream); + } + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + auto afterTime = std::chrono::steady_clock::now(); + double duration_microsecond = std::chrono::duration(afterTime - beforeTime).count(); + std::cout << "time cost " << duration_microsecond/100 << " us" << std::endl; + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print 
data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/operator_desc.cpp b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/operator_desc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a7a1ed3bd9f6b33a7a697c9aa67045112093081a --- /dev/null +++ b/IndexSelectForRank1Backward/aclnn_index_select_for_rank1_backward/src/operator_desc.cpp @@ -0,0 +1,56 @@ +/** +* @file operator_desc.cpp +* +* Copyright (C) 2020. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" +#include "operator_desc.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } + +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, + int numDims, + const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git a/IndexSelectForRank1Backward/clear_all.sh 
b/IndexSelectForRank1Backward/clear_all.sh new file mode 100644 index 0000000000000000000000000000000000000000..0c1c12ec4f0c359ba0c8e7df1d955fc7d542a72b --- /dev/null +++ b/IndexSelectForRank1Backward/clear_all.sh @@ -0,0 +1,6 @@ +rm -rf index_select_for_rank1_backward +rm -rf cpu/IndexSelectForRank1Backward +rm -rf cpu/*.log +rm -rf aclnn_index_select_for_rank1_backward/build +rm -rf aclnn_index_select_for_rank1_backward/input +rm -rf aclnn_index_select_for_rank1_backward/output \ No newline at end of file diff --git a/IndexSelectForRank1Backward/cpu/aclnn_index_select_for_rank1_backward_debug.json b/IndexSelectForRank1Backward/cpu/aclnn_index_select_for_rank1_backward_debug.json new file mode 100644 index 0000000000000000000000000000000000000000..2f6d1edeffbb0afe9d83893dc968346f950e66ad --- /dev/null +++ b/IndexSelectForRank1Backward/cpu/aclnn_index_select_for_rank1_backward_debug.json @@ -0,0 +1,55 @@ +{ + "op_type": "IndexSelectForRank1Backward", + "data_script": "", + "gen_data": false, + "inputs": [ + { + "name": "grad", + "dtype": "float32", + "format": "ND", + "ignore": false, + "shape": [128, 211, 211], + "param_type": "required", + "data_file": "grad.bin" + }, + { + "name": "x", + "dtype": "float32", + "format": "ND", + "ignore": false, + "shape": [129], + "param_type": "required", + "data_file": "x.bin" + }, + { + "name": "index", + "dtype": "int64", + "format": "ND", + "ignore": false, + "shape": [128, 211, 211], + "param_type": "required", + "data_file": "index.bin" + } + ], + "outputs": [ + { + "name": "grad_x", + "dtype": "float32", + "format": "ND", + "ignore": false, + "shape": [129], + "param_type": "required", + "data_file": "grad_x.bin" + }, + { + "name": "grad_index", + "dtype": "int64", + "format": "ND", + "ignore": false, + "shape": [128, 211, 211], + "param_type": "required", + "data_file": "grad_index.bin" + } + ], + "attrs": [] +} diff --git a/IndexSelectForRank1Backward/cpu/build.sh b/IndexSelectForRank1Backward/cpu/build.sh new file mode 
100644 index 0000000000000000000000000000000000000000..d9466ec4821f55005d3992dac6651032cd356c68 --- /dev/null +++ b/IndexSelectForRank1Backward/cpu/build.sh @@ -0,0 +1,3 @@ +rm -rf build +cmake -S . -B build -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=on +cmake --build build \ No newline at end of file diff --git a/IndexSelectForRank1Backward/cpu/generate_cpu.sh b/IndexSelectForRank1Backward/cpu/generate_cpu.sh new file mode 100644 index 0000000000000000000000000000000000000000..0bb417b10726360f6bbc8382834f6e9dcb147139 --- /dev/null +++ b/IndexSelectForRank1Backward/cpu/generate_cpu.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +mkdir -p IndexSelectForRank1Backward/data +cp -r ../aclnn_index_select_for_rank1_backward/input/* IndexSelectForRank1Backward/data +cp -r ../aclnn_index_select_for_rank1_backward/output/* IndexSelectForRank1Backward/data + + +/usr/local/Ascend/ascend-toolkit/latest/tools/ascendc_tools/ascendebug kernel --backend cpu --json-file aclnn_index_select_for_rank1_backward_debug.json --repo-type customize --customize-path \ + /usr/local/Ascend/ascend-toolkit/latest/opp/vendors/index_select_for_rank1_backward/ --chip-version Ascend910B2 --core-type MixCore --install-path /usr/local/Ascend/ascend-toolkit/ --work-dir . 
\ No newline at end of file diff --git a/IndexSelectForRank1Backward/creat.sh b/IndexSelectForRank1Backward/creat.sh new file mode 100755 index 0000000000000000000000000000000000000000..519836e7ebfae8f16868ac3c96b998d36ca4ea53 --- /dev/null +++ b/IndexSelectForRank1Backward/creat.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + + +# 查找msopgen的路径,加入到环境变量PATH中 +msopgen_path=$(find /usr/local/Ascend/ -name msopgen | grep bin) +parent_dir=$(dirname "$msopgen_path") +export PATH=$parent_dir:$PATH + +# 利用msopgen生成可编译文件 +rm -rf ./index_select_for_rank1_backward +python3 /usr/local/Ascend/ascend-toolkit/latest/python/site-packages/bin/msopgen gen -i index_select_for_rank1_backward.json -f tf -c ai_core-Ascend910 -lan cpp -out ./index_select_for_rank1_backward -m 0 -op IndexSelectForRank1Backward +rm -rf index_select_for_rank1_backward/op_kernel +rm -rf index_select_for_rank1_backward/host +cp -rf op_kernel index_select_for_rank1_backward/ +cp -rf op_host index_select_for_rank1_backward/ + +cd index_select_for_rank1_backward + +# 判断当前目录下是否存在CMakePresets.json文件 +if [ ! -f "CMakePresets.json" ]; then + echo "ERROR, CMakePresets.json file not exist." 
+ exit 1 +fi + +# 禁止生成CRC校验和 +sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake + +# 修改cann安装路径 +sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json +# 修改vendor_name 防止覆盖之前vendor_name为customize的算子; +# vendor_name需要和aclnn中的CMakeLists.txt中的CUST_PKG_PATH值同步,不同步aclnn会调用失败; +# vendor_name字段值不能包含customize;包含会导致多算子部署场景CANN的vendors路径下config.ini文件内容截取错误 +sed -i 's:"customize":"index_select_for_rank1_backward":g' CMakePresets.json + +bash build.sh + +# # 安装编译成功的算子包 +bash ./build_out/custom_opp*.run +cd ../aclnn_index_select_for_rank1_backward +bash run.sh + +# cpu工程 +# cd ../cpu +# bash generate_cpu.sh +# cd .. \ No newline at end of file diff --git a/IndexSelectForRank1Backward/index_select_for_rank1_backward.json b/IndexSelectForRank1Backward/index_select_for_rank1_backward.json new file mode 100644 index 0000000000000000000000000000000000000000..15ae9de6c43d2e56d42eaf1dbae352890d0126a2 --- /dev/null +++ b/IndexSelectForRank1Backward/index_select_for_rank1_backward.json @@ -0,0 +1,60 @@ +[ + { + "op": "IndexSelectForRank1Backward", + "language": "cpp", + "input_desc": [ + { + "name": "grad_y", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "x", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "index", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "int64" + ] + } + ], + "output_desc": [ + { + "name": "grad_x", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + }, + { + "name": "grad_index", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "int64" + ] + } + ] + } +] \ No newline at end of file diff --git a/IndexSelectForRank1Backward/op_host/CMakeLists.txt b/IndexSelectForRank1Backward/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..40dd51cfac524b0a9607b7d8b2813edd2210c509 --- /dev/null +++ 
b/IndexSelectForRank1Backward/op_host/CMakeLists.txt @@ -0,0 +1,82 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION 
packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/IndexSelectForRank1Backward/op_host/index_select_for_rank1_backward.cpp b/IndexSelectForRank1Backward/op_host/index_select_for_rank1_backward.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8f9030c204e36d817e1f95d3bf15d47402f03f69 --- /dev/null +++ b/IndexSelectForRank1Backward/op_host/index_select_for_rank1_backward.cpp @@ -0,0 +1,80 @@ +#include "tiling/platform/platform_ascendc.h" +#include "index_select_for_rank1_backward_tiling.h" +#include "register/op_def_registry.h" +constexpr int RESERVED_WORKSPACE = 20 * 1024; +namespace optiling { +static ge::graphStatus TilingFunc(gert::TilingContext* context) +{ + auto ascnedPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + size_t systemWorkspacesSize = ascnedPlatform.GetLibApiWorkSpaceSize(); + currentWorkspace[0] = systemWorkspacesSize; + + IndexSelectForRank1BackwardTilingData tiling; + const gert::StorageShape* x1_shape = context->GetInputShape(0); + int32_t data_sz = 1; + for (int i = 0; i < x1_shape->GetStorageShape().GetDimNum(); i++) + data_sz *= x1_shape->GetStorageShape().GetDim(i); + tiling.set_size(data_sz); + context->SetBlockDim(8); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); 
+ context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + return ge::GRAPH_SUCCESS; +} +} + + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext* context) +{ + const gert::Shape* x1_shape = context->GetInputShape(0); + gert::Shape* y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} +} + + +namespace ops { +class IndexSelectForRank1Backward : public OpDef { +public: + explicit IndexSelectForRank1Backward(const char* name) : OpDef(name) + { + this->Input("grad_y") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("index") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("grad_x") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("grad_index") + .ParamType(REQUIRED) + .DataType({ge::DT_INT64}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + + this->SetInferShape(ge::InferShape); + + this->AICore() + .SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910"); + } +}; + +OP_ADD(IndexSelectForRank1Backward); +} diff --git a/IndexSelectForRank1Backward/op_host/index_select_for_rank1_backward_tiling.h b/IndexSelectForRank1Backward/op_host/index_select_for_rank1_backward_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..18748457d3822acd19a8c0456ca3d6425986c02d --- /dev/null +++ b/IndexSelectForRank1Backward/op_host/index_select_for_rank1_backward_tiling.h @@ -0,0 +1,10 @@ + +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(IndexSelectForRank1BackwardTilingData) + 
TILING_DATA_FIELD_DEF(uint32_t, size); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(IndexSelectForRank1Backward, IndexSelectForRank1BackwardTilingData) +} diff --git a/IndexSelectForRank1Backward/op_kernel/CMakeLists.txt b/IndexSelectForRank1Backward/op_kernel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c94a952da2058b785affa6784d78b87dfe9b3d7 --- /dev/null +++ b/IndexSelectForRank1Backward/op_kernel/CMakeLists.txt @@ -0,0 +1,68 @@ +# set custom compile options +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() + +foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if (NOT TARGET ascendc_impl_gen) + add_ops_impl_target(TARGET ascendc_impl_gen + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + endif() + + # dynamic shape binary compile + if (${ENABLE_BINARY_PACKAGE} AND NOT ${ENABLE_CROSS_COMPILE}) + add_bin_compile_target(TARGET ascendc_bin_${compute_unit} + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT ${compute_unit} + ) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen) + endif() + + if (${ENABLE_CROSS_COMPILE} 
AND ${ENABLE_BINARY_PACKAGE}) + add_cross_compile_target( + TARGET bin_${compute_unit} + OUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../kernel + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/ + ) + endif() +endforeach() + +# generate npu_supported_ops.json +add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/IndexSelectForRank1Backward/op_kernel/index_select_for_rank1_backward.cpp b/IndexSelectForRank1Backward/op_kernel/index_select_for_rank1_backward.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9123442e71b24cdc609522c485282ee88706bce0 --- /dev/null +++ b/IndexSelectForRank1Backward/op_kernel/index_select_for_rank1_backward.cpp @@ -0,0 +1,12 @@ +#include "kernel_operator.h" +#include "utils.h" +#ifdef __CCE_KT_TEST__ +#include "index_select_for_rank1_backward_tiling.h" +#endif + +using namespace AscendC; +extern "C" __global__ __aicore__ void index_select_for_rank1_backward(GM_ADDR grad_y, GM_ADDR x, GM_ADDR index, GM_ADDR grad_x, GM_ADDR grad_index, GM_ADDR workspace, GM_ADDR tiling) { + GET_TILING_DATA(tiling_data, tiling); + // TODO: user kernel impl + LOG("kernel run sucessful"); +} \ No newline at end of file diff --git a/IndexSelectForRank1Backward/op_kernel/utils.h b/IndexSelectForRank1Backward/op_kernel/utils.h new file mode 100644 index 
0000000000000000000000000000000000000000..17592066cfba17d46a9fe8ac0d14a4b77883e21d --- /dev/null +++ b/IndexSelectForRank1Backward/op_kernel/utils.h @@ -0,0 +1,40 @@ +#ifndef ATTENTION_FUSION_GRAD_UTILS_H +#define ATTENTION_FUSION_GRAD_UTILS_H +#include "kernel_operator.h" + +template +__aicore__ inline T1 CeilDiv(T1 a, T2 b) { + if (b == 0) { + return 0; + } + return (a + b -1) / b; +} + + + +#ifdef __CCE_KT_TEST__ + +#define LOG(X...) Log(X) +__global__ __aicore__ void printArgs() {} + +template +__global__ __aicore__ void printArgs(T t, Args &&... args) { + + std::cout << t << " "; + printArgs(args...); +} + +template +__global__ __aicore__ void Log(Args &&... args) { +// don't log when using npu + std::cout << "[AttentionFusion LOG][" << AscendC::GetBlockIdx() << "] "; + printArgs(args...); + std::cout << std::endl; +} + + + +#else + #define LOG(X...) +#endif +#endif \ No newline at end of file