From bfb52987803597911532edce2d757f50276bb292 Mon Sep 17 00:00:00 2001 From: z30065766 Date: Wed, 4 Dec 2024 07:58:45 +0000 Subject: [PATCH 1/6] update the front-end command for dumping statistics --- msit/components/llm/msit_llm/__main__.py | 2 +- msit/components/llm/msit_llm/common/constant.py | 1 + msit/components/llm/msit_llm/dump/initial.py | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/msit/components/llm/msit_llm/__main__.py b/msit/components/llm/msit_llm/__main__.py index 71c080263..1c38c8522 100644 --- a/msit/components/llm/msit_llm/__main__.py +++ b/msit/components/llm/msit_llm/__main__.py @@ -133,7 +133,7 @@ class DumpCommand(BaseCommand): required=False, nargs='+', default=['tensor', 'model'], - choices=['model', 'layer', 'op', 'kernel', 'tensor', 'cpu_profiling', 'onnx'], + choices=['model', 'layer', 'op', 'kernel', 'tensor', 'cpu_profiling', 'onnx', 'stats'], help='dump type.') parser.add_argument( diff --git a/msit/components/llm/msit_llm/common/constant.py b/msit/components/llm/msit_llm/common/constant.py index f491ed57d..e49633672 100644 --- a/msit/components/llm/msit_llm/common/constant.py +++ b/msit/components/llm/msit_llm/common/constant.py @@ -44,6 +44,7 @@ ATB_SAVE_TENSOR_IDS = "ATB_SAVE_TENSOR_IDS" ATB_SAVE_TENSOR_RUNNER = "ATB_SAVE_TENSOR_RUNNER" ATB_SAVE_TENSOR = "ATB_SAVE_TENSOR" ATB_SAVE_TENSOR_RANGE = "ATB_SAVE_TENSOR_RANGE" +ATB_SAVE_TENSOR_STATISTICS = "ATB_SAVE_TENSOR_STATISTICS" ATB_SAVE_TILING = "ATB_SAVE_TILING" ATB_OUTPUT_DIR = "ATB_OUTPUT_DIR" ATB_SAVE_CHILD = "ATB_SAVE_CHILD" diff --git a/msit/components/llm/msit_llm/dump/initial.py b/msit/components/llm/msit_llm/dump/initial.py index 7b69eacec..a0e5211de 100644 --- a/msit/components/llm/msit_llm/dump/initial.py +++ b/msit/components/llm/msit_llm/dump/initial.py @@ -90,6 +90,11 @@ def init_dump_task(args): else: os.environ.pop(ATB_DUMP_TYPE, None) # Ensure none is set + if "stats" in args.type and "tensor" in args.type: + 
os.environ[ATB_SAVE_TENSOR_STATISTICS] = "1" + else: + os.environ.pop(ATB_SAVE_TENSOR_STATISTICS, None) # Ensure none is set + if "onnx" in args.type and ("model" in args.type or "layer" in args.type): os.environ[ATB_DUMP_SUB_PROC_INFO_SAVE_PATH] = os.path.join(str(args.output), str(os.getpid())) subprocess_info_path = os.path.join(args.output, str(os.getpid())) -- Gitee From c8483495a2a56d427d9aaf62694c9aa821bc5eb9 Mon Sep 17 00:00:00 2001 From: z30065766 Date: Thu, 5 Dec 2024 12:24:32 +0000 Subject: [PATCH 2/6] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=9C=AA=E5=BC=95?= =?UTF-8?q?=E5=85=A5=E6=89=80=E9=9C=80=E5=B8=B8=E9=87=8F=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- msit/components/llm/msit_llm/dump/initial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/msit/components/llm/msit_llm/dump/initial.py b/msit/components/llm/msit_llm/dump/initial.py index cf2b0d827..d78b21ab0 100644 --- a/msit/components/llm/msit_llm/dump/initial.py +++ b/msit/components/llm/msit_llm/dump/initial.py @@ -30,7 +30,7 @@ from msit_llm.common.constant import ATB_HOME_PATH, ATB_SAVE_TENSOR_TIME, ATB_SA ATB_SAVE_TILING, LD_PRELOAD, ATB_OUTPUT_DIR, ATB_SAVE_CHILD, ATB_SAVE_TENSOR_PART, \ ASCEND_TOOLKIT_HOME, ATB_PROB_LIB_WITH_ABI, ATB_PROB_LIB_WITHOUT_ABI, ATB_SAVE_CPU_PROFILING, \ ATB_CUR_PID, ATB_DUMP_SUB_PROC_INFO_SAVE_PATH, ATB_DEVICE_ID, ATB_AIT_LOG_LEVEL, ATB_DUMP_TYPE, get_ait_dump_path, \ - ATB_TIMESTAMP, GLOBAL_HISTORY_AIT_DUMP_PATH_LIST, ATB_SAVE_TENSOR_IN_BEFORE_OUT_AFTER + ATB_TIMESTAMP, GLOBAL_HISTORY_AIT_DUMP_PATH_LIST, ATB_SAVE_TENSOR_IN_BEFORE_OUT_AFTER, ATB_SAVE_TENSOR_STATISTICS def is_use_cxx11(): -- Gitee From 2bd487d03154f1a0295a27ac6ea6a115fb8dc675 Mon Sep 17 00:00:00 2001 From: z30065766 Date: Fri, 6 Dec 2024 02:08:07 +0000 Subject: [PATCH 3/6] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E8=B5=84=E6=96=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
...\256\344\275\277\347\224\250\350\257\264\346\230\216.md" | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git "a/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" "b/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" index 39f5b333c..49d38ad14 100644 --- "a/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -18,6 +18,9 @@ msit llm dump --exec "<任意包含ATB的程序执行命令>" [可选参数] msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tensor # 常用用于自动比对 msit llm dump --exec "<任意包含ATB的程序执行命令>" --type onnx # 常用于导出onnx查看网络结构 +# 仅dump统计量 +msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tensor stats # 查看模型tensor的统计量, 相比全量落盘,可节省磁盘空间 + # 仅dump layer 层的算子输出,常用于精度比对先找到存在问题的 layer 层。相比全量dump,可以节省磁盘空间和定位时间 msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tensor -child False @@ -35,7 +38,7 @@ msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tenso | 参数名 | 描述 | 必选 | |-------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ---- | | --exec | 指定包含 ATB 的程序执行命令,使用示例: --exec "bash run.sh patches/models/modeling_xxx.py"。**注:命令中不支持重定向字符,如果需要重定向输出,建议将执行命令写入 shell 脚本,然后启动 shell 脚本。** | 是 | -| --type | dump 类型,默认为['tensor', 'model']。使用方式:--type layer tensor。可选项有:
model: 模型拓扑信息(默认),当dump model的时候,layer会跟着model一起dump下来
layer: Operation 维度拓扑信息
op: ATB Operation 信息
kernel: kernel Operation 信息
tensor: tensor 数据(默认)
cpu_profiling: cpu profiling 数据
onnx: onnx 模型。仅用于模型结构可视化 | 否 | +| --type | dump 类型,默认为['tensor', 'model']。使用方式:--type layer tensor。可选项有:
model: 模型拓扑信息(默认),当dump model的时候,layer会跟着model一起dump下来
layer: Operation 维度拓扑信息
op: ATB Operation 信息
kernel: kernel Operation 信息
tensor: tensor 数据(默认)
stats: tensor 统计量。必须与 tensor 同时指定(例如 --type model tensor stats),单独指定 stats 不生效。工具会根据 tensor 数据(仅支持数值数据类型,不支持 bool/string 等不可计算的数据类型)计算统计量,最终仅落盘以下 7 项信息:format、type、dims、max、min、mean、l2norm
cpu_profiling: cpu profiling 数据
onnx: onnx 模型。仅用于模型结构可视化 | 否 | | -sd,--only-save-desc | 只保存 tensor 描述信息开关,默认为否,开启开关时将 dump tensor 的描述信息,使用方式:-sd | 否 | | -ids,--save-operation-ids | 设置 dump 指定 id 的算子的 tensor,默认为空,全量 dump。使用方式:-ids 2, 3_1 表示只 dump 第 2 个 operation 和第 3 个 operation 的第 1 个算子的数据,id 从 0 开始。若不确定算子 id,可以先执行 msit llm dump --exec xx --type model 命令,将 model 信息 dump 下来,即可获得模型中所有的算子 id 信息。 | 否 | | -er,--execute-range | 指定 dump 的 token 轮次范围,区间左右全闭,可以支持多个区间序列,默认为第 0 次,使用方式:-er 1,3 或 -er 3,5,7,7(代表区间[3,5],[7,7],也就是第 3,4,5,7 次token)。此外,请确保输入多区间时的总输入长度不超过500个字符。 | 否 | @@ -60,6 +63,7 @@ Dump 默认落盘路径 `{DUMP_DIR}`在当前目录下,如果指定 output 目 注:`{device_id}`为设备号;`{PID}`为进程号;`{TID}`为 `token_id`;`{TIMESTAMP}`为时间戳;`{executeCount}`为 `operation`运行次数。 - tensor 信息,具体路径是 `{DUMP_DIR}/msit_dump_{TIMESTAMP}/tensors/{device_id}_{PID}/{TID}`目录下(使用老版本的 cann 包可能导致 tensor 落盘路径不同)。 +- stats 统计量信息,具体路径是 `{DUMP_DIR}/msit_dump_{TIMESTAMP}/tensors/{device_id}_{PID}/{TID}`目录下(同`tensor 信息`落盘位置)。 - layer 信息,具体路径是 `{DUMP_DIR}/msit_dump_{TIMESTAMP}/layer/{PID}`目录下。 - model 信息,具体路径是 `{DUMP_DIR}/msit_dump_{TIMESTAMP}/model/{PID}`目录下。注:由于 model 由 layer 组合而成,因此使用 model 时,默认同时会落盘 layer 信息。 - onnx 落盘位置和 model、layer 相同的目录。(落盘onnx文件格式为 xxx.onnx) -- Gitee From bc6e39360f74ea1d9a255e65bd619a92f543223e Mon Sep 17 00:00:00 2001 From: z30065766 Date: Fri, 6 Dec 2024 09:04:38 +0000 Subject: [PATCH 4/6] =?UTF-8?q?update=20msit/docs/llm/=E5=B7=A5=E5=85=B7-D?= =?UTF-8?q?UMP=E5=8A=A0=E9=80=9F=E5=BA=93=E6=95=B0=E6=8D=AE=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E8=AF=B4=E6=98=8E.md.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: z30065766 --- ...\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" 
"b/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" index 49d38ad14..11f13858c 100644 --- "a/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" +++ "b/msit/docs/llm/\345\267\245\345\205\267-DUMP\345\212\240\351\200\237\345\272\223\346\225\260\346\215\256\344\275\277\347\224\250\350\257\264\346\230\216.md" @@ -19,7 +19,7 @@ msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tenso msit llm dump --exec "<任意包含ATB的程序执行命令>" --type onnx # 常用于导出onnx查看网络结构 # 仅dump统计量 -msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tensor stats # 查看模型tensor的统计量, 相比全量落盘,可节省磁盘空间 +msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tensor stats # 查看模型tensor的统计量, 相比全量落盘,可节省磁盘空间,但需花费额外时间进行统计量的计算 # 仅dump layer 层的算子输出,常用于精度比对先找到存在问题的 layer 层。相比全量dump,可以节省磁盘空间和定位时间 msit llm dump --exec "<任意包含ATB的程序执行命令>" --type model tensor -child False -- Gitee From 1e157c39aab95fc6e68a08cbdc4a28ab3e9cd18d Mon Sep 17 00:00:00 2001 From: z30065766 Date: Fri, 6 Dec 2024 12:43:35 +0000 Subject: [PATCH 5/6] =?UTF-8?q?add=20msit/components/debug/compare/tests/u?= =?UTF-8?q?t/test=5Fnet=5Fcompare.py.=20=E3=80=90UT=E3=80=91=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0net=5Fcompare.py=E7=9A=84UT=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: z30065766 --- .../debug/compare/tests/ut/test_net_compare.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 msit/components/debug/compare/tests/ut/test_net_compare.py diff --git a/msit/components/debug/compare/tests/ut/test_net_compare.py b/msit/components/debug/compare/tests/ut/test_net_compare.py new file mode 100644 index 000000000..a83ca0aa1 --- /dev/null +++ b/msit/components/debug/compare/tests/ut/test_net_compare.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023-2024 Huawei 
Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file -- Gitee From d74d8d614b8550ffef1c84ea2b4df162f919be96 Mon Sep 17 00:00:00 2001 From: z30065766 Date: Thu, 12 Dec 2024 09:11:03 +0000 Subject: [PATCH 6/6] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=96=87=E4=BB=B6=20msit?= =?UTF-8?q?/components/debug/compare/tests/ut/test=5Fnet=5Fcompare.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../debug/compare/tests/ut/test_net_compare.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 msit/components/debug/compare/tests/ut/test_net_compare.py diff --git a/msit/components/debug/compare/tests/ut/test_net_compare.py b/msit/components/debug/compare/tests/ut/test_net_compare.py deleted file mode 100644 index a83ca0aa1..000000000 --- a/msit/components/debug/compare/tests/ut/test_net_compare.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2023-2024 Huawei Technologies Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file -- Gitee