From 5e9d12bd7a96d3ebab6a4706f23311417dcf5e81 Mon Sep 17 00:00:00 2001 From: yanhe13 Date: Tue, 23 Apr 2024 11:01:59 +0800 Subject: [PATCH 1/2] add mindspore wkld --- huawei/mindspore/mindformers/README.md | 27 ++ huawei/mindspore/mindformers/build.sh | 69 +++++ .../mindspore/mindformers/code/benchmark.sh | 65 +++++ .../mindformers/code/evaluate_run.sh | 40 +++ .../mindformers/code/multi_nodes_run.sh | 44 ++++ .../mindformers/code/single_node_run.sh | 57 ++++ .../mindformers/models/glm2_6b/README.md | 187 +++++++++++++ .../models/glm2_6b/evaluate_scripts.sh | 14 + .../models/glm2_6b/launch_config.sh | 13 + .../run_glm2_6b_finetune_800T_A2_64G.yaml | 248 +++++++++++++++++ .../run_glm2_6b_finetune_800_32G.yaml | 249 ++++++++++++++++++ .../run_glm2_6b_finetune_eval.yaml | 230 ++++++++++++++++ .../models/glm2_6b/registed_tasks.sh | 8 + .../mindformers/models/llama2_13b/README.md | 204 ++++++++++++++ .../models/llama2_13b/evaluate_scripts.sh | 15 ++ .../models/llama2_13b/launch_config.sh | 13 + .../launch_yamls/run_llama2_13b.yaml | 213 +++++++++++++++ .../launch_yamls/run_llama2_13b_910b.yaml | 209 +++++++++++++++ .../run_llama2_13b_910b_finetune.yaml | 209 +++++++++++++++ .../launch_yamls/run_llama2_13b_finetune.yaml | 214 +++++++++++++++ .../models/llama2_13b/registed_tasks.sh | 11 + .../mindformers/models/llama2_70b/README.md | 197 ++++++++++++++ .../models/llama2_70b/evaluate_scripts.sh | 11 + .../models/llama2_70b/launch_config.sh | 13 + .../launch_yamls/predict_llama2_70b_910b.yaml | 154 +++++++++++ .../launch_yamls/run_llama2_70b_910b.yaml | 214 +++++++++++++++ .../run_llama2_70b_910b_finetune.yaml | 214 +++++++++++++++ .../models/llama2_70b/registed_tasks.sh | 8 + .../mindformers/models/llama2_7b/README.md | 185 +++++++++++++ .../models/llama2_7b/evaluate_scripts.sh | 15 ++ .../models/llama2_7b/launch_config.sh | 13 + .../llama2_7b/launch_yamls/run_llama2_7b.yaml | 213 +++++++++++++++ .../launch_yamls/run_llama2_7b_910b.yaml | 210 +++++++++++++++ .../run_llama2_7b_910b_finetune.yaml | 209 +++++++++++++++ .../launch_yamls/run_llama2_7b_finetune.yaml | 214 +++++++++++++++ .../models/llama2_7b/registed_tasks.sh | 10 + .../patch_files/r1.1.rc1/patch_config.sh | 15 ++ .../patch_files/r1.1.rc1/r1.1.rc1.patch | 209 +++++++++++++++ huawei/mindspore/mindformers/update_patch.sh | 29 ++ 39 files changed, 4472 insertions(+) create mode 100644 huawei/mindspore/mindformers/README.md create mode 100644 huawei/mindspore/mindformers/build.sh create mode 100644 huawei/mindspore/mindformers/code/benchmark.sh create mode 100644 huawei/mindspore/mindformers/code/evaluate_run.sh create mode 100644 huawei/mindspore/mindformers/code/multi_nodes_run.sh create mode 100644 huawei/mindspore/mindformers/code/single_node_run.sh create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/README.md create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/README.md create mode 100644 
huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/README.md create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/README.md create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh create mode 100644 huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch create mode 100644 huawei/mindspore/mindformers/update_patch.sh diff --git a/huawei/mindspore/mindformers/README.md b/huawei/mindspore/mindformers/README.md new file mode 100644 index 0000000..a1c8643 --- /dev/null +++ b/huawei/mindspore/mindformers/README.md @@ -0,0 +1,27 @@ +# Mindformers 负载导航 +## MindFormers 训练负载包版本归档 +### r1.1.rc1 版本 +#### mindspore版本 +```bash +mindspore >= 2.3 +``` +#### 取包链接 +|模型|负载包链接| +| ----- | ------------------------------- | +|LLaMA2 7B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-llama2_7b-r1.1.rc1.tar.gz)
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-llama2_7b-r1.1.rc1.tar.gz)| +|LLaMA2 13B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-llama2_13b-r1.1.rc1.tar.gz)
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-llama2_13b-r1.1.rc1.tar.gz)| +|LLaMA2 70B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-llama2_70b-r1.1.rc1.tar.gz)&#13;
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-llama2_70b-r1.1.rc1.tar.gz)| +|GLM2 6B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-glm2_6b-r1.1.rc1.tar.gz)
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-glm2_6b-r1.1.rc1.tar.gz)| + +## 贡献指南 +### 使用build.sh出负载包 +```bash +bash build.sh <训练任务类型> +``` +在`./output`路径下生成构建好的负载包,请自行打包成压缩包 +### 使用update_patch.sh更新mindformers打点的patch版本 +``` +bash update_patch.sh <传入修改好的完成打点的ModelLink代码路径> +``` + + diff --git a/huawei/mindspore/mindformers/build.sh b/huawei/mindspore/mindformers/build.sh new file mode 100644 index 0000000..f96060e --- /dev/null +++ b/huawei/mindspore/mindformers/build.sh @@ -0,0 +1,69 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_DIR=$(dirname $(readlink -f $0)) +LLM_FRAMEWORK_NAME="mindformers" +WORKLOAD_PACKAGE_NAME="" +WORKLOAD_PACKAGE_DIR="" +PATCH_TOOLS_PATH="$CUR_DIR/../../../tools/patch_tool/patch_tool.sh" +PACKAGE_OUTPUT_PATH="$CUR_DIR/output" +stubs_package_path="" +model_type="" # 模型的文件夹目录,如llama2_7b +patch_version="" # patch版本,在patch_files文件夹下的目录名,如v0 + +load_stubs_package(){ + if [ ! -f $stubs_package_path ];then + echo "stubs package: ${stubs_package_path} not exist!" + return $ret_error + fi + stubs_basename=$(basename "$stubs_package_path") + stubs_subname=${stubs_basename%.tar.gz} + cp -f "$stubs_package_path" "$PACKAGE_OUTPUT_PATH" + cd "${PACKAGE_OUTPUT_PATH}" + tar -xzf "${PACKAGE_OUTPUT_PATH}/${stubs_basename}" + rm -f "${PACKAGE_OUTPUT_PATH}/${stubs_basename}" + stubs_content_dir=$(find ./ -name "Ais-Benchmark-Stubs*" -type d) + cd $CUR_DIR + WORKLOAD_PACKAGE_NAME="${stubs_subname}-training-${LLM_FRAMEWORK_NAME}-${model_type}-${patch_version}" + WORKLOAD_PACKAGE_DIR=${PACKAGE_OUTPUT_PATH}/${WORKLOAD_PACKAGE_NAME} + mv "${PACKAGE_OUTPUT_PATH}/${stubs_content_dir}" "${WORKLOAD_PACKAGE_DIR}" + return $ret_ok +} + +add_workload_code(){ + code_path="${WORKLOAD_PACKAGE_DIR}/code" + rm -r "${code_path}" + cp -rf "${CUR_DIR}/code" "${WORKLOAD_PACKAGE_DIR}" || { echo "$model_type not found";return $ret_error; } + patch_config_path="${CUR_DIR}/patch_files/${patch_version}/patch_config.sh" + . $patch_config_path # 导入patch配置文件 + result_code_dir="${code_path}/${LLM_FRAMEWORK_NAME}" # 改变patch配置文件export的变量 + patch_file_path="${CUR_DIR}/patch_files/${patch_version}/${patch_version}.patch" # 改变patch配置文件export的变量 + bash $PATCH_TOOLS_PATH "applypatch" || { echo "apply changes to mindformers code failed!";return $ret_error; } # 调用patch_tool + cp -rf ${CUR_DIR}/models/${model_type}/launch_yamls/ ${code_path} || { echo "launch yaml file not found";return $ret_error; } + cp -f ${CUR_DIR}/models/${model_type}/*.sh ${code_path} || { echo "registed task not found";return $ret_error; } # launch_config.sh和registed task放入code + cp -f ${CUR_DIR}/models/${model_type}/README.md ${WORKLOAD_PACKAGE_DIR} || { echo "copy readme failed";return $ret_error; } + return $ret_ok +} + + +main(){ + stubs_package_path=$1 + model_type=$2 # 任务的文件夹目录,如llama2_7b + patch_version=$3 # patch版本,在patch_files文件夹下的目录名,如v0 + + # 清空原来的出包路径内容,新建出包的路径 + if [ -d $PACKAGE_OUTPUT_PATH ];then + rm -rf $PACKAGE_OUTPUT_PATH + fi + mkdir -p $PACKAGE_OUTPUT_PATH + + load_stubs_package || { echo "ERROR: load stubs package failed!";return $ret_error; } + add_workload_code || { echo "ERROR: add workload code failed!";return $ret_error; } + rm -rf $CUR_DIR/buildtmp # 清空patch的临时数据 + + return $ret_ok +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/mindspore/mindformers/code/benchmark.sh b/huawei/mindspore/mindformers/code/benchmark.sh new file mode 100644 index 0000000..0c1aaea --- /dev/null +++ b/huawei/mindspore/mindformers/code/benchmark.sh @@ -0,0 +1,65 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_DIR=$(cd "$(dirname "$0")";pwd) +WORK_DIR="${CUR_DIR}/../work" +RESULT_DIR="${CUR_DIR}/../result" +RUN_MODE="single_node" + +source ${CUR_DIR}/registed_tasks.sh # 导入注册了的启动任务 + +function contains() { + local list=("$@") + for element in "${list[@]}"; do + if [ "$element" == "$AIS_TRAIN_YAML" ]; then + return $ret_ok # 找到,返回真 + fi + done + return $ret_error # 没有找到,返回假 +} + +function train_yaml_select(){ # from registed_tasks.sh + if contains "${SINGLE_NODE_LAUNCH[@]}"; then + RUN_MODE="single_node" + echo "launch with ${AIS_TRAIN_YAML}.yaml using single node mode." + elif contains "${MULTI_NODES_LAUNCH[@]}"; then + RUN_MODE="multi_nodes" + echo "launch with ${AIS_TRAIN_YAML}.yaml using multi nodes mode." + else + echo "invalid yaml name ${AIS_TRAIN_YAML}" + return $ret_error + fi + return $ret_ok +} + +function prepare() { + export PYTHONPATH=$PYTHONPATH:${CUR_DIR} + rm -f ${RESULT_DIR}/* + if [ -d ${WORK_DIR} ];then + rm -rf ${WORK_DIR} + fi + mkdir -p ${WORK_DIR} + cp -r ${CUR_DIR}/* ${WORK_DIR} +} + +function collect_result() { + python3 -c "from ais_bench.logging import collect_report; collect_report('training', ['$CUR_DIR/../result'])" +} + +function main() { + source "${CUR_DIR}/launch_config.sh" + train_yaml_select || { echo "train task select failed!";return $ret_error; } + prepare + if [ "${RUN_MODE}" == "single_node" ]; then + bash "${WORK_DIR}/single_node_run.sh" + else + bash "${WORK_DIR}/multi_nodes_run.sh" + fi + bash "${WORK_DIR}/evaluate_run.sh" || { echo "evaluate run failed!";return $ret_error; } + collect_result || { echo "collect train result failed!";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/mindspore/mindformers/code/evaluate_run.sh b/huawei/mindspore/mindformers/code/evaluate_run.sh new file mode 100644 index 0000000..a3b7ec6 --- /dev/null +++ b/huawei/mindspore/mindformers/code/evaluate_run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(cd "$(dirname "$0")";pwd) +LAUNCH_SCRIPT_PATH=${CUR_DIR}/mindformers/scripts/ +TOOLS_SCRIPT_PATH=${CUR_DIR}/mindformers/mindformers/tools/ +RESULT_PATH=${CUR_DIR}/result/ +OUTPUT_PATH=${CUR_DIR}/mindformers/output/ + +function merge_ckpt() { + cmd="${AIS_PYTHON} ${TOOLS_SCRIPT_PATH}/transform_ckpt.py \ + --src_ckpt_strategy ${OUTPUT_PATH}/strategy/ \ + --src_ckpt_dir ${OUTPUT_PATH}/checkpoint/ \ + --dst_ckpt_dir ${OUTPUT_PATH}/target_checkpoint/ \ + --prefix ${AIS_MODEL_NAME}" + eval $cmd || { echo "exec merge ckpt script ${cmd} failed!";return $ret_error; } + return $ret_ok +} + +function launch_evaluate() { + source ${CUR_DIR}/evaluate_scripts.sh + export AIS_WORK_RESULT_PATH=${RESULT_PATH} + cmd=${mindformers_workload_eval_cmd} # from evaluate_scripts.sh + if [ "${cmd}" == "" ]; then + echo "evaluate cmd not given, skip" + return $ret_ok + fi + eval $cmd || { echo "launch eval cmd: ${cmd} failed!";return $ret_error; } + return $ret_ok +} + +function main() { + merge_ckpt || { echo "merge ckpt failed!";return $ret_error; } + launch_evaluate || { echo "launch evaluate failed!";return $ret_error; } + cp -r ${RESULT_PATH} ${CUR_DIR}/../ || { echo "cp work result to base result failed!";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/mindspore/mindformers/code/multi_nodes_run.sh b/huawei/mindspore/mindformers/code/multi_nodes_run.sh new file mode 100644 index 0000000..68b794e --- /dev/null +++ b/huawei/mindspore/mindformers/code/multi_nodes_run.sh @@ -0,0 +1,44 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(cd "$(dirname "$0")";pwd) +WORK_DIR="${CUR_DIR}/../work" + +function cluster_exist_check() { + $AIS_PYTHON -c "import ais_bench.cluster" || { echo "can't find cluster_tools be installed";return $ret_error; } +} + +function cluster_deploy_code() { + cmd="rm -rf ./work" + $AIS_PYTHON -m ais_bench.cluster multi_exec -n "${AIS_NODE_FILE_PATH}" -c "${cmd}" || { echo "clear work dir failed";return $ret_error; } + $AIS_PYTHON -m ais_bench.cluster multi_put -n "${AIS_NODE_FILE_PATH}" -s "${WORK_DIR}" -d "./" || { echo "cluster deploy code failed";return $ret_error; } +} + +function cluster_parallel_run() { + cmd="source /etc/profile; \ + source ./work/launch_config.sh; \ + export AIS_WORK_PATH=\$PWD/work/; \ + bash ./work/single_node_run.sh" + $AIS_PYTHON -m ais_bench.cluster multi_exec -n "${AIS_NODE_FILE_PATH}" -c "${cmd}" || { echo "cluster parallel run failed";return $ret_error; } +} + +function cluster_collect_result() { + # collect outputs of nodes + $AIS_PYTHON -m ais_bench.cluster multi_get -n "${AIS_NODE_FILE_PATH}" -s "./work/mindformers/output" \ + -d "${CUR_DIR}/mindformers/" || { echo "cluster collect output failed";return $ret_error; } + # collect logging dump file from nodes + $AIS_PYTHON -m ais_bench.cluster multi_get -n "${AIS_NODE_FILE_PATH}" -s "./work/result/" \ + -d "${CUR_DIR}/" || { echo "cluster collect result failed";return $ret_error; } +} + +function main() { + source "${CUR_DIR}/launch_config.sh" + cluster_exist_check || { echo "cluster_exist_check failed";return $ret_error; } + cluster_deploy_code || { echo "cluster_deploy_code failed";return $ret_error; } + 
cluster_parallel_run || { echo "cluster_parallel_run failed";return $ret_error; } + cluster_collect_result || { echo "cluster_collect_result failed";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? \ No newline at end of file diff --git a/huawei/mindspore/mindformers/code/single_node_run.sh b/huawei/mindspore/mindformers/code/single_node_run.sh new file mode 100644 index 0000000..bff542c --- /dev/null +++ b/huawei/mindspore/mindformers/code/single_node_run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(cd "$(dirname "$0")";pwd) +LAUNCH_SCRIPT_PATH=${CUR_DIR}/mindformers/scripts/ +RESULT_PATH=${CUR_DIR}/result/ +NODE_DEVICE_INFO="[0,8]" + +function get_node_info() { + if [ "${NODE_ID}" == "" ]; then + NODE_DEVICE_INFO="[0,${AIS_DEVICE_NUM}]" + else + RANK_START=`expr ${NODE_ID} \* $AIS_DEVICE_NUM` # NODE_ID get from cluster_tools export + RANK_ID_MAX=$[AIS_DEVICE_NUM+RANK_START] + NODE_DEVICE_INFO="[$RANK_START,$RANK_ID_MAX]" + fi +} + +function prepare_and_clear() { + export AIS_WORK_PATH=${CUR_DIR} + source "${CUR_DIR}/launch_config.sh" + export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" # 推荐开启INFNAN模式,llama2_7b和70b 不用设置该项 + echo "single node run..." + mkdir -p ${RESULT_PATH} || { echo "mkdir work result dir:${RESULT_PATH} failed!";return $ret_error; } # logging的落盘文件在里面 + return $ret_ok +} + +function install_mindformers() { + pip3 install ${CUR_DIR}/mindformers/ --force-reinstall || { return $ret_error; } +} + +function launch_train() { + get_node_info + cd ${LAUNCH_SCRIPT_PATH} + export AIS_WORK_RESULT_PATH=${RESULT_PATH} + cmd="bash run_distribute.sh \ + ${CUR_DIR}/${AIS_RANK_TABLE_FILE} \ + ${CUR_DIR}/launch_yamls/${AIS_TRAIN_YAML}.yaml \ + ${NODE_DEVICE_INFO} \ + ${AIS_TRAIN_TASK_TYPE} \ + ${AIS_RANK_NUM}" + eval $cmd || { echo "exec launch train scripts ${cmd} failed!";return $ret_error; } + cd ${CUR_DIR} + return $ret_ok +} + + +function main() { + prepare_and_clear || { echo "prepare_and_clear failed!";return $ret_error; } + install_mindformers || { echo "install mindformers failed";return $ret_error; } + launch_train || { echo "launch_train failed!";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? + diff --git a/huawei/mindspore/mindformers/models/glm2_6b/README.md b/huawei/mindspore/mindformers/models/glm2_6b/README.md new file mode 100644 index 0000000..b2b214e --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/README.md @@ -0,0 +1,187 @@ +# 基于Mindspore/mindformers框架的glm2大模型训练负载使用指南 +本文主要介绍使用基于mindformers LLaMA2 7b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-glm2_6b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机的GLM2 6b模型的微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +## 查看GLM2 6b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-glm2_6b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-glm2_6b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_glm2_6b_finetune_800T_A2_64G.yaml +| │   ├──...... 
+│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
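+开始资源准备前,可以先确认 logging 打点工具已经安装成功(仅为示意:导入路径取自本负载包`code/benchmark.sh`中的用法,`python3`请替换为实际使用的解释器):
+```bash
+# 能正常导入 collect_report 即说明 logging 打点工具安装成功
+python3 -c "from ais_bench.logging import collect_report; print('logging ok')"
+```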
+### rank_table_file准备 +确保`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。
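+可以先用类似下面的命令确认网络配置并生成 rank_table_file(仅为示意:`hccn_tool`的设备编号请按实际卡数调整;`hccl_tools.py`为假设的脚本路径,实际路径与参数请以下文引用的官方章节为准):
+```bash
+# 确认 /etc/hccn.conf 中已为每张卡写入 address_N 等条目
+cat /etc/hccn.conf
+# 逐卡查询已配置的 IP(假设单机 8 卡,设备编号 0~7)
+for i in {0..7}; do hccn_tool -i $i -ip -g; done
+# 生成单机 8 卡的 rank_table_file(假设使用 mindformers 自带的 hccl_tools.py,从负载包一级目录执行)
+python3 code/mindformers/mindformers/tools/hccl_tools.py --device_num "[0,8)"
+```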
+ +参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“生成RANK_TABLE_FILE”(单机多卡情况)章节。 + +### 模型权重下载与转换 +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“模型权重下载与转换”章节准备GLM2 6b的权重和词表文件; +- 资源链接: + - [glm2_6b.ckpt](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/XFormer_for_mindspore/glm2/glm2_6b.ckpt)(点击直接下载) + - [tokenizer](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/XFormer_for_mindspore/glm2/tokenizer.model)(点击直接下载) +### 数据集准备 +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“微调--数据集准备”章节准备微调和评测所需的数据集; +- 资源链接: + - [ADGEN数据集](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1)(下载后需解压) +- 下载解压后目录结构为: + ``` + AdvertiseGen + ├── train.json + └── dev.json + ``` +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_glm2_6b_finetune_800_32G" \ + "run_glm2_6b_finetune_800T_A2_64G" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=() +``` + +```bash +"run_glm2_6b_finetune_800T_A2_64G" # 8卡 910B1,910B2,910B3 微调任务 +"run_glm2_6b_finetune_800_32G" # 8卡 910A,910B4 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=5 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="finetune" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="glm2_6b" +export AIS_TRAIN_YAML="run_glm2_6b_finetune_800_32G" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 + +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +```yaml +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/glm2_6b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +``` +`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" # 需要填入train.json数据集的路径 + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" # 需要填入词表文件tokenizer.model的实际路径 +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`
+`vocab_file`支持绝对路径以及相对路径。`vocab_file`填入的相对路径`../../`实际对应负载包路径`code/mindformers/ + +### 修改评测启动脚本 +`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_glm2_6b_finetune_eval.yaml \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/glm2_6b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" +``` +按默认启动脚本执行,需要修改`code/launch_yamls/run_glm2_6b_finetune_eval.yaml`配置文件的内容: +```yaml +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" # 需要填入评测数据集dev.json的实际路径 + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" # 需要填入词表文件tokenizer.model的实际路径 +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`
+`vocab_file`支持绝对路径以及相对路径。`vocab_file`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + + +## 3 负载启动 +### 3.1 在线测试 +执行命令 +```bash +./ais-bench-stubs +``` +### 3.2 轻量化离线测试 +执行命令 +```bash +./ais-bench-stubs test +``` \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh new file mode 100644 index 0000000..af81df3 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh @@ -0,0 +1,14 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_glm2_6b_finetune_eval.yaml \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/glm2_6b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh b/huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh new file mode 100644 index 0000000..b0e71f7 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=5 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="finetune" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="glm2_6b" +export AIS_TRAIN_YAML="run_glm2_6b_finetune_800_32G" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml new file mode 100644 index 0000000..d991a63 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml @@ -0,0 +1,248 @@ +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: ''path/to/glm2_6b.ckpt'' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "59GB" # 59GB for Atlas 800T A2 + save_graphs: False + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 192 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: False + use_flash_attention: True # when use FlashAttention, seq_length should be multiple of 16 + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: 1788 +eval_epoch_interval: -1 + +metric: + type: PerplexityMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + type: GLMProcessor + +# ==== dataset config ==== +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# optimizer +optimizer: + type: 
FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True # optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 100000000 # big enough + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml new file mode 100644 index 0000000..8cc03b1 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml @@ -0,0 +1,249 @@ +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/glm2_6b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" # 59GB for Atlas 800T A2 + save_graphs: False + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 192 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: False + use_flash_attention: False # when use FlashAttention, seq_length should be multiple of 16 + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: 1788 +eval_epoch_interval: -1 + +metric: + type: PerplexityMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + # vocab_file: "/path/to/tokenizer.model" + type: GLMProcessor + +# ==== dataset config ==== +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + max_source_length: 256 + max_target_length: 256 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# 
optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True # optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 100000000 # big enough + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml new file mode 100644 index 0000000..1817d6b --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml @@ -0,0 +1,230 @@ +seed: 0 +run_mode: 'eval' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: './output/target_checkpoint/rank_0/glm2_6b0.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" # 59GB for Atlas 800T A2 + save_graphs: False + save_graphs_path: "./ir-graphs" + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + # only for incremental infer, + # when parallel_mode=1 and full_batch=True, + # batch_size should be set to runner_config.batch_size * data_parallel; + # when single card, batch_size should be set to runner_config.batch_size + batch_size: 8 + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 256 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: True + use_prompt_flash_attention: False + use_incre_flash_attention: False + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: 500 +eval_epoch_interval: -1 + +metric: + type: ADGENMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + # vocab_file: "/path/to/tokenizer.model" + type: GLMProcessor + +# ==== dataset config ==== + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "eval" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + max_source_length: 256 + max_target_length: 256 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: False # 
optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" + only_trainable_params: False # 设置成 False,才能在策略文件中保存所有参数 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh b/huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh new file mode 100644 index 0000000..a28fb04 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_glm2_6b_finetune_800_32G" \ + "run_glm2_6b_finetune_800T_A2_64G" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=() \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_13b/README.md b/huawei/mindspore/mindformers/models/llama2_13b/README.md new file mode 100644 index 0000000..0c019ab --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/README.md @@ -0,0 +1,204 @@ +# AISBench 基于Mindspore/mindformers框架的LLaMA2 13b 训练负载包使用指南 +本文主要介绍使用基于mindformers LLaMA2 13b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_13b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机或多机的LLaMA2 13b模型的预训练或微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +|管理节点|运行ais-bench-stubs的环境,只有一个| +|计算节点|执行训练任务的环境,可以有多个;计算节点中有一个作为管理节点| + +## 查看llama2 13b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_13b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_13b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_llama2_13b_910b.yaml +| │   ├──...... 
+│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** + +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +**所有计算节点需要准备**+ +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +**所有计算节点需要安装** +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 +### cluster_tools准备(多机运行需要) +**仅管理节点需要安装** +从[cluster_tools分布式运行工具发行版](https://gitee.com/aisbench/cluster_tools/releases)获取最新的发行版。 +参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“安装与卸载/安装cluster_tools”章节安装cluster_tools分布式运行工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
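+开始资源准备前,可以先在相应节点上自检依赖是否就绪(仅为示意:导入路径取自本负载包`code/`下脚本中的调用,`python3`请替换为实际使用的解释器):
+```bash
+# 所有计算节点:验证 MindSpore 能在 NPU 上运行
+python3 -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()"
+# 所有计算节点:验证 logging 打点工具已安装
+python3 -c "from ais_bench.logging import collect_report; print('logging ok')"
+# 仅管理节点(多机运行时需要):验证 cluster_tools 已安装
+python3 -c "import ais_bench.cluster; print('cluster_tools ok')"
+```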
+### rank_table_file准备 +**部署在管理节点上** +rank_table_file是一个包含集群节点和加速卡ip信息的json文件。 +准备rank_table_file前确保计算节点的`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”章节。 + +### node_file准备(多机运行需要) +**部署在管理节点上** +node_file是需要给cluster_tools传入的文件,它包含了计算节点的具体信息。 +node_file需要自建,格式参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“集群节点信息文件内容格式”章节自行创建。**注意,node_file中计算节点的顺序需要与rank_table_file中的计算节点顺序相同。** + +### 模型权重下载与转换 +微调任务需要,预训练任务不涉及,**部署在所有计算节点上**。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节准备LLaMA2 13b的ckpt文件。 + +### 数据集准备 +#### 预训练数据集准备 +**部署在所有计算节点上**,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节准备转换好的预训练数据集。 +#### 微调数据集准备 +部署在所有计算节点上,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节准备转换好的微调数据集。 +#### 评测数据集准备 +依据实际的评测需求准备数据集,**部署在管理节点上**。 +**wikitext** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节准备评测数据集。 +**SQuAD** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测阅读理解/获取数据集”章节准备评测数据集。 + + +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_13b_910b" \ + "run_llama2_13b_910b_finetune" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_13b" \ + "run_llama2_13b_finetune" +) +``` + +```bash +"run_llama2_13b_910b_finetune" # 单机 8卡 910B1,910B2,910B3 微调任务 +"run_llama2_13b_910b" # 单机 8卡 910B1,910B2,910B3 预训练任务 +"run_llama2_13b_finetune" # 多机 每机8卡 910A,910B4 微调任务 +"run_llama2_13b" # 多机 每机8卡 910A,910B4 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_13b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_13b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=16 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 + +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +#### 预训练任务 +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" # 需要填入.mindrecord后缀的wikitext-2数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`。 + +#### 微调任务 +```yaml +seed: 0 +output_dir: './output' # path to save checkpoint/strategy,维持默认 +load_checkpoint: '{path}/llama2_13b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +src_strategy_path_or_dir: '' +``` 
+`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" # 需要填入.mindrecord后缀的alpaca数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +### 修改评测启动脚本 +`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_13b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_13b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" +``` +其中`--eval_dataset_dir`需要用户自行传入wikitext-2的.mindrecord后缀的评测数据集的绝对路径。 + +## 启动测试 +### 在线测试 +在线测试的前置准备请参考`STUBS_PACKAGE_INTRO.md`文档。启动命令: +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +启动命令: +```bash +./ais-bench-stubs test +``` + diff --git a/huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh new file mode 100644 index 0000000..58c6d5d --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh @@ -0,0 +1,15 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_13b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_13b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh b/huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh new file mode 100644 index 0000000..5107c7d --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_13b" +export AIS_TRAIN_YAML="run_llama2_13b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml new file mode 100644 index 0000000..15d63b2 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml @@ -0,0 +1,213 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy 
+load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 3.e-4 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 3.e-4 + lr_end: 3.e-5 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "${AIS_WORK_PATH}/path/to/wiki4096.mindrecord" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + use_seq_parallel: False + micro_batch_num: 16 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "31GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml new file mode 100644 index 0000000..5da1a20 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml @@ -0,0 +1,209 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False + +# runner config +runner_config: + epochs: 2 + batch_size: 2 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-4 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-4 + lr_end: 1.e-5 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml new file mode 100644 index 0000000..bd05b96 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml @@ -0,0 +1,209 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_13b.ckpt' # 必填 +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False + +# runner config +runner_config: + epochs: 2 + batch_size: 4 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml new file mode 100644 index 0000000..f0521c0 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_13b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. 
+eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.999 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "../../path/to/alpaca-fastchat2048.mindrecord" # abs path or relative path(../../ mean) + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + use_seq_parallel: False + micro_batch_num: 16 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "31GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 2048 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
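上面 run_llama2_13b_finetune.yaml 中的 `load_checkpoint` 与 `dataset_dir` 既支持绝对路径,也支持以 `code/mindformers/` 为基准的相对路径(`../../` 即负载包一级目录)。下面给出一个仅供参考的启动前自检脚本草稿,其中的路径均为占位假设,需替换为实际准备好的权重与 alpaca 数据集路径:

```bash
#!/bin/bash
# 示意脚本:启动微调前确认权重与数据集可读(假设从负载包一级目录执行)
MINDFORMERS_DIR=./code/mindformers                          # yaml 中相对路径的解析基准
CKPT_PATH="/{path}/llama2_13b.ckpt"                         # 占位:与 yaml 的 load_checkpoint 保持一致
DATASET_DIR="../../path/to/alpaca-fastchat2048.mindrecord"  # 占位:与 yaml 的 dataset_dir 保持一致

# 相对路径按 code/mindformers/ 解析,绝对路径原样使用
case "${DATASET_DIR}" in
    /*) RESOLVED_DATASET="${DATASET_DIR}" ;;
    *)  RESOLVED_DATASET="${MINDFORMERS_DIR}/${DATASET_DIR}" ;;
esac

for f in "${CKPT_PATH}" "${RESOLVED_DATASET}"; do
    [ -f "${f}" ] || echo "[WARN] 文件不存在: ${f},请先修改 yaml 中对应字段" >&2
done
```
若检查到缺失,请先回到“资源准备”章节补齐对应文件,再拉起训练。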
diff --git a/huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh b/huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh new file mode 100644 index 0000000..399b9fc --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_13b_910b" \ + "run_llama2_13b_910b_finetune" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_13b" \ + "run_llama2_13b_finetune" +) \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_70b/README.md b/huawei/mindspore/mindformers/models/llama2_70b/README.md new file mode 100644 index 0000000..2bcedae --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/README.md @@ -0,0 +1,197 @@ +# AISBench 基于Mindspore/mindformers框架的LLaMA2 70b 训练负载包使用指南 +本文主要介绍使用基于mindformers LLaMA2 70b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_70b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机或多机的LLaMA2 70b模型的预训练或微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +|管理节点|运行ais-bench-stubs的环境,只有一个| +|计算节点|执行训练任务的环境,可以有多个;计算节点中有一个作为管理节点| + +## 查看llama2 13b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_70b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_70b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_llama2_70b_910b.yaml +| │   ├──...... +│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** + +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +**所有计算节点需要准备**+ +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +**所有计算节点需要安装** +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 +### cluster_tools准备(多机运行需要) +**仅管理节点需要安装** +从[cluster_tools分布式运行工具发行版](https://gitee.com/aisbench/cluster_tools/releases)获取最新的发行版。 +参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“安装与卸载/安装cluster_tools”章节安装cluster_tools分布式运行工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
+### rank_table_file准备 +**部署在管理节点上** +rank_table_file是一个包含集群节点和加速卡ip信息的json文件。 +准备rank_table_file前确保计算节点的`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”章节。 + +### node_file准备(多机运行需要) +**部署在管理节点上** +node_file是需要给cluster_tools传入的文件,它包含了计算节点的具体信息。 +node_file需要自建,格式参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“集群节点信息文件内容格式”章节自行创建。**注意,node_file中计算节点的顺序需要与rank_table_file中的计算节点顺序相同。** + +### 模型权重下载与转换 +微调任务需要,预训练任务不涉及,**部署在所有计算节点上**。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节准备LLaMA2 70b的ckpt文件。 + +### 数据集准备 +#### 预训练数据集准备 +**部署在所有计算节点上**,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节准备转换好的预训练数据集。 +#### 微调数据集准备 +部署在所有计算节点上,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节准备转换好的微调数据集。 +#### 评测数据集准备 +依据实际的评测需求准备数据集,**部署在管理节点上**。 +**wikitext** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节准备评测数据集。 +**SQuAD** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测阅读理解/获取数据集”章节准备评测数据集。 + + +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=() +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_70b_910b" \ + "run_llama2_70b_910b_finetune" +) +``` + +```bash +"run_llama2_70b_910b_finetune" # 多机 每机8卡 910B1,910B2,910B3 微调任务 +"run_llama2_70b_910b" # 多机 每机8卡 910B1,910B2,910B3 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_70b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_70b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=64 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 + +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +#### 预训练任务 +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" # 需要填入.mindrecord后缀的wikitext-2数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`。 + +#### 微调任务 +```yaml +seed: 0 +output_dir: './output' # path to save checkpoint/strategy,维持默认 +load_checkpoint: '{path}/llama2_70b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +src_strategy_path_or_dir: '' +``` +`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: 
MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" # 需要填入.mindrecord后缀的alpaca数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +### 修改评测启动脚本 +`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改配置文件 +export mindformers_workload_eval_cmd="bash ${LAUNCH_SCRIPT_PATH}/run_distribute.sh \ +--config ${CUR_DIR}/launch_yamls/predict_llama2_70b_910b.yaml \ +[0,8] \ +eval" +``` +具体运行`predict_llama2_70b_910b.yaml`评测脚本需要做的准备,请参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)中的“评测/分布式评测”章节。 + + +## 启动测试 +### 在线测试 +在线测试的前置准备请参考`STUBS_PACKAGE_INTRO.md`文档。启动命令: +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +启动命令: +```bash +./ais-bench-stubs test +``` + diff --git a/huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh new file mode 100644 index 0000000..6fa35c5 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh @@ -0,0 +1,11 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改配置文件 +export mindformers_workload_eval_cmd="bash ${LAUNCH_SCRIPT_PATH}/run_distribute.sh \ +--config ${CUR_DIR}/launch_yamls/run_llama2_7b.yaml \ +[0,8] \ +eval" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh b/huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh new file mode 100644 index 0000000..6b03a19 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_70b" +export AIS_TRAIN_YAML="run_llama2_70b_910b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml new file mode 100644 index 0000000..14bb735 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml @@ -0,0 +1,154 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'predict' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama_70b' +# if True, do evaluate during the training process. 
if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + data_parallel: 1 + model_parallel: 8 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. +micro_batch_interleave_num: 1 + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_70b" + save_checkpoint_steps: 1000 + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + ascend_config: + precision_mode: "must_keep_origin_dtype" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 8192 + num_layers: 80 + num_heads: 64 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + n_kv_heads: 8 + ffn_dim_multiplier: 1.3 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: True + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: False + use_paged_attention: False # PA only supported in inference + block_size: 16 + num_blocks: 512 + is_dynamic: False + use_kvcache_op: False + is_flexible_shape: False + offset: 0 + use_rope_slice: False + checkpoint_name_or_path: "llama2_70b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + vocab_file: "" + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
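上面 predict_llama2_70b_910b.yaml 采用 model_parallel: 8 的单机 8 卡切分做推理评测,直观原因可以用一笔粗略算术说明(仅为量级估算,不是精确的显存模型):70b 参数按 float16 存放约需 130 GiB,远超单卡 max_device_memory 的 "58GB",因此权重必须切分到多卡。

```bash
# 量级估算示意(仅做一次算术,数值为近似值)
python3 -c "
params = 70e9                  # LLaMA2 70b 参数量(近似)
bytes_per_param = 2            # float16
total_gib = params * bytes_per_param / 2**30
print('仅权重约需 %.0f GiB,按 8 卡切分后每卡约 %.0f GiB' % (total_gib, total_gib / 8))
"
```
另外,分布式评测前一般还需在 processor.tokenizer 下的 `vocab_file` 填入词表文件路径(通常为下载权重时附带的 tokenizer.model),具体以“评测/分布式评测”章节的准备为准。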
diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml new file mode 100644 index 0000000..33babe9 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_70b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 3 + batch_size: 1 + sink_mode: True + sink_size: 2 +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 1 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 8 + use_seq_parallel: True + micro_batch_num: 128 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_70b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "54GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 8192 + num_layers: 80 + num_heads: 64 + vocab_size: 32000 + multiple_of: 256 + n_kv_heads: 8 + ffn_dim_multiplier: 1.3 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True + fine_grain_interleave: 2 + qkv_concat: False + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml new file mode 100644 index 0000000..d193748 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_13b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_70b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.99 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 1 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 8 + use_seq_parallel: True + micro_batch_num: 128 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_70b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "54GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 8192 + num_layers: 80 + num_heads: 64 + vocab_size: 32000 + multiple_of: 256 + n_kv_heads: 8 + ffn_dim_multiplier: 1.3 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + fine_grain_interleave: 2 + qkv_concat: false + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
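上面 run_llama2_70b_910b_finetune.yaml 的 `load_checkpoint` 需要指向“模型权重下载与转换”章节准备好的 LLaMA2 70b 权重;同时 parallel_config 中 data_parallel × model_parallel × pipeline_stage 一般应等于 launch_config.sh 里的集群总卡数 AIS_RANK_NUM。下面是一个仅供参考的自检片段(假设在组好的负载包 code/ 目录结构下执行,字段名取自上文配置文件):

```bash
#!/bin/bash
# 示意脚本:检查并行切分与集群总卡数是否匹配
cd ./code && source ./launch_config.sh      # 读取 AIS_TRAIN_YAML、AIS_RANK_NUM 等导出变量
python3 - <<'EOF'
import os
import re

yaml_path = os.path.join("launch_yamls", os.environ["AIS_TRAIN_YAML"] + ".yaml")
text = open(yaml_path, encoding="utf-8").read()

def field(key):
    # 从 yaml 文本中取出 parallel_config 的整数字段
    return int(re.search(rf"^\s*{key}:\s*(\d+)", text, re.M).group(1))

product = field("data_parallel") * field("model_parallel") * field("pipeline_stage")
rank_num = int(os.environ["AIS_RANK_NUM"])
print(f"dp*mp*pp = {product}, AIS_RANK_NUM = {rank_num}")
if product != rank_num:
    print("[WARN] 并行切分与总卡数不一致,请检查 parallel_config 或 AIS_RANK_NUM")
EOF
```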
diff --git a/huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh b/huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh new file mode 100644 index 0000000..c2153a9 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=() +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_70b_910b" \ + "run_llama2_70b_910b_finetune" +) \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_7b/README.md b/huawei/mindspore/mindformers/models/llama2_7b/README.md new file mode 100644 index 0000000..7915f21 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/README.md @@ -0,0 +1,185 @@ +# AISBench 基于Mindspore/mindformers框架的LLaMA2 7b 训练负载包使用指南 +本文主要介绍使用基于mindformers LLaMA2 7b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_7b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机的LLaMA2 7b模型的预训练或微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +## 查看llama2 7b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_7b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_7b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_llama2_7b_910b.yaml +| │   ├──...... +│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** + +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
+### rank_table_file准备 +rank_table_file是一个包含集群节点和加速卡ip信息的json文件。 +准备rank_table_file前确保计算节点的`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”章节。 + +### 模型权重下载与转换 +微调任务需要,预训练任务不涉及。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节准备LLaMA2 7b的ckpt文件。 + +### 数据集准备 +#### 预训练数据集准备 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节准备转换好的预训练数据集。 +#### 微调数据集准备 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节准备转换好的微调数据集。 +#### 评测数据集准备 +依据实际的评测需求准备数据集 +**wikitext** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节准备评测数据集。 +**SQuAD** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测阅读理解/获取数据集”章节准备评测数据集。 + + +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_7b_910b_finetune" \ + "run_llama2_7b_910b" \ + "run_llama2_7b_finetune" \ + "run_llama2_7b" +) +# 多机运行的任务,LLaMA2 7b不涉及 +MULTI_NODES_LAUNCH=() +``` + +```bash +"run_llama2_7b_910b_finetune" # 8卡 910B1,910B2,910B3 微调任务 +"run_llama2_7b_910b" # 8卡 910B1,910B2,910B3 预训练任务 +"run_llama2_7b_finetune" # 8卡 910A,910B4 微调任务 +"run_llama2_7b" # 8卡 910A,910B4 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_7b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_7b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +#### 预训练任务 +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" # 需要填入.mindrecord后缀的wikitext-2数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +#### 微调任务 +```yaml +seed: 0 +output_dir: './output' # path to save checkpoint/strategy,维持默认 +load_checkpoint: '{path}/llama2_7b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +src_strategy_path_or_dir: '' +``` +`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" # 需要填入.mindrecord后缀的alpaca数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +### 修改评测启动脚本 
+`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_7b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_7b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" +``` +其中`--eval_dataset_dir`需要用户自行传入wikitext-2的.mindrecord后缀的评测数据集的绝对路径。 + +## 启动测试 +### 在线测试 +在线测试的前置准备请参考`STUBS_PACKAGE_INTRO.md`文档。启动命令: +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +启动命令: +```bash +./ais-bench-stubs test +``` + diff --git a/huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh new file mode 100644 index 0000000..6ea1691 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh @@ -0,0 +1,15 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_7b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_7b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh b/huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh new file mode 100644 index 0000000..1b6e5d8 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_7b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_7b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml new file mode 100644 index 0000000..7bf7e66 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml @@ -0,0 +1,213 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + 
sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 3.e-4 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 3.e-4 + lr_end: 3.e-5 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 8 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 2 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_7b" + save_checkpoint_steps: 10000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
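The evaluation section of the README above only *exports* the launch command as `mindformers_workload_eval_cmd`; nothing is executed at that point. A wrapper inside the package (such as `code/evaluate_run.sh`) presumably consumes this variable later. The sketch below is only an illustration of that consumption pattern under that assumption, not the shipped script:

```bash
#!/bin/bash
# Minimal, hypothetical consumer of the exported eval command.
# The actual logic lives in the stubs package (e.g. code/evaluate_run.sh) and may differ.
# AIS_PYTHON is expected to be exported beforehand (see launch_config.sh).
CUR_DIR=$(cd "$(dirname "$0")"; pwd)
source "${CUR_DIR}/evaluate_scripts.sh"   # exports mindformers_workload_eval_cmd

if [ -z "${mindformers_workload_eval_cmd}" ]; then
    echo "ERROR: mindformers_workload_eval_cmd is not set" >&2
    exit 1
fi

echo "Launching evaluation: ${mindformers_workload_eval_cmd}"
eval "${mindformers_workload_eval_cmd}" 2>&1 | tee "${CUR_DIR}/eval.log"
```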
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml new file mode 100644 index 0000000..01fb8a0 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml @@ -0,0 +1,210 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + gradient_accumulation_steps: 8 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + learning_rate: 5.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 5.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 6 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_7b" + save_checkpoint_steps: 10000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
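The `run_llama2_7b_910b.yaml` above is the Atlas 800T A2 pretrain configuration (pure data parallel, dp=8 / mp=1 / pp=1, flash attention enabled). To drive it through the benchmark, the `launch_config.sh` shown earlier would be filled in roughly as follows for a single-node, 8-card run; all values below are illustrative and the rank-table file name is only an example:

```bash
#!/bin/bash
# Example values only; adapt paths and card counts to your environment.
export AIS_PYTHON=python3
export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3   # skip the first steps when computing step performance

export AIS_TRAIN_TASK_TYPE="train"             # pretrain ("finetune" for the *_finetune yamls)
export AIS_MODEL_NAME="llama2_7b"
export AIS_TRAIN_YAML="run_llama2_7b_910b"     # must be one of the names registered in registed_tasks.sh

export AIS_RANK_NUM=8                          # total accelerator cards in the cluster
export AIS_DEVICE_NUM=8                        # cards per server
export AIS_RANK_TABLE_FILE="./hccl_8p.json"    # example rank table path, relative to this script
# AIS_NODE_FILE_PATH is only required for multi-node runs and is left unset here.
```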
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml new file mode 100644 index 0000000..8431b1d --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml @@ -0,0 +1,209 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_7b.ckpt' # 必填 +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.999 + eps: 1.e-8 + learning_rate: 1.e-6 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-6 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_7b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float32" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
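`run_llama2_7b_910b_finetune.yaml` above leaves two placeholders that must be filled before launch: `load_checkpoint` (marked as required, 必填) and the train `dataset_dir`. They can simply be edited by hand; the GNU-sed sketch below is one hypothetical way to fill them in, with example paths. Note that the surrounding quotes must be preserved exactly — the follow-up commit at the end of this patch series fixes precisely such a quoting mistake in the glm2_6b yaml:

```bash
#!/bin/bash
# Hypothetical helper (GNU sed): fill in the checkpoint and dataset placeholders.
# CKPT and DATA are example paths; keep the surrounding quotes exactly as shown.
YAML=./launch_yamls/run_llama2_7b_910b_finetune.yaml
CKPT=/data/ckpt/llama2_7b.ckpt
DATA=/data/datasets/alpaca-fastchat4096.mindrecord

sed -i "s#load_checkpoint: '{path}/llama2_7b.ckpt'#load_checkpoint: '${CKPT}'#" "$YAML"
# Only the first (train_dataset) empty dataset_dir is replaced; eval_dataset stays empty.
sed -i "0,/dataset_dir: \"\"/{s#dataset_dir: \"\"#dataset_dir: \"${DATA}\"#}" "$YAML"
```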
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml new file mode 100644 index 0000000..c8286bd --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_7b.ckpt' # 必填 +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.999 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 1 + pipeline_stage: 4 + use_seq_parallel: False + micro_batch_num: 8 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 2 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_7b" + save_checkpoint_steps: 10000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 2048 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
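Unlike the 910B variants, `run_llama2_7b_finetune.yaml` above targets the 32 GB Atlas 800 (dp=2 / pp=4, recompute enabled, 28 GB device memory) and pairs `seq_length: 2048` with an `alpaca-fastchat2048.mindrecord` dataset. The sequence length the mindrecord was preprocessed with generally has to match `model_config.seq_length`, otherwise training tends to fail when the dataset is built. A rough sanity check, assuming the conventional file naming used here, could be:

```bash
#!/bin/bash
# Rough, hypothetical check: infer the preprocessing length from the dataset
# file name and compare it with the seq_length configured in the yaml.
YAML=./launch_yamls/run_llama2_7b_finetune.yaml
DATA=$(grep -m1 'dataset_dir:' "$YAML" | awk -F'"' '{print $2}')
SEQ=$(grep -m1 'seq_length:' "$YAML" | awk '{print $2}')

case "$DATA" in
    *"$SEQ"*) echo "OK: ${DATA} looks consistent with seq_length=${SEQ}" ;;
    *)        echo "WARNING: verify that ${DATA} was preprocessed with seq_length=${SEQ}" ;;
esac
```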
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh b/huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh new file mode 100644 index 0000000..9f1cf51 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_7b_910b_finetune" \ + "run_llama2_7b_910b" \ + "run_llama2_7b_finetune" \ + "run_llama2_7b" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=() \ No newline at end of file diff --git a/huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh new file mode 100644 index 0000000..f54d518 --- /dev/null +++ b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# 代码的git远程仓库信息 +export git_url="https://gitee.com/mindspore/mindformers.git" +export branch="r1.1.rc1" +export commit_id="9504c4663e2d842e46bdc6f8b1bf773b0a99acc8" +export base_code_subdir="mindformers" # git远程仓库中的代码路径,如果要用仓库的全部代码,就直接填repo名 + +# 生成 .patch文件所需信息(makepatch.sh) +export changed_code_dir="" # 基于git远程仓库原始代码做过进一步修改(嵌入AISBench的打点接口)的代码, update_patch.sh脚本中填入 +export dir_to_save_patch_file="" # 保存生成的.patch文件的文件夹路径, update_patch.sh脚本中填入 +export patch_file_name="r1.1.rc1" # 生成的patch文件名(不带文件后缀) + +# 由 .patch文件修改git远程仓库拉取的原始代码所需信息(applypatch) +export result_code_dir="" # 基于.patch文件将git远程仓库原始代码修改后保存的文件夹路径,在build.sh脚本中填入 +export patch_file_path="" # 传入的.patch文件的路径,在build.sh脚本中填入 \ No newline at end of file diff --git a/huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch new file mode 100644 index 0000000..209d7b9 --- /dev/null +++ b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch @@ -0,0 +1,209 @@ +diff -Nur '--exclude=*.git*' origin/mindformers/core/callback/callback.py code/mindformers/core/callback/callback.py +--- origin/mindformers/core/callback/callback.py 2024-04-18 11:04:36.836000000 +0800 ++++ code/mindformers/core/callback/callback.py 2024-04-18 11:04:36.960000000 +0800 +@@ -39,6 +39,8 @@ + from mindformers.tools.utils import get_output_root_path, get_output_subpath, get_remote_save_url, check_in_modelarts,\ + get_real_rank, get_real_group_size + ++import ais_bench.logging as aislog ++ + __all__ = ['ObsMonitor', 'MFLossMonitor', 'CheckpointMointor', 'SummaryMonitor', 'ProfileMonitor', 'EvalCallBack'] + + _cur_dir = os.getcwd() +@@ -182,6 +184,13 @@ + """ + self.step_time = time.time() + self.run_context = run_context ++ cb_params = run_context.original_args() ++ if cb_params.dataset_sink_mode: ++ sample_per_step = int(self.global_batch_size * cb_params.batch_num / self.device_num) ++ else: ++ sample_per_step = int(self.global_batch_size / self.device_num) ++ aislog.start("train_per_step", sample_per_step) ++ + + def step_end(self, run_context): + """ +@@ -215,12 +224,15 @@ + steps_per_epoch = self.steps_per_epoch + cur_epoch_num = (cb_params.cur_step_num + self.initial_step - 1) // steps_per_epoch + 1 + cur_step_num = (cb_params.cur_step_num + self.initial_step - 1) % steps_per_epoch + 1 ++ sample_per_step = int(self.global_batch_size * cb_params.batch_num / self.device_num) + else: + origin_epochs = self.origin_epochs + per_step_seconds = step_seconds + steps_per_epoch = cb_params.batch_num + cur_epoch_num = cb_params.cur_epoch_num + cur_step_num = (cb_params.cur_step_num + self.initial_step - 1) % cb_params.batch_num + 1 ++ sample_per_step = int(self.global_batch_size / 
self.device_num) ++ aislog.end("train_per_step", sample_per_step) + + # compute time remaining + step_remain = (origin_epochs - cur_epoch_num + 1) * steps_per_epoch - cur_step_num +diff -Nur '--exclude=*.git*' origin/mindformers/core/metric/metric.py code/mindformers/core/metric/metric.py +--- origin/mindformers/core/metric/metric.py 2024-04-18 11:04:36.836000000 +0800 ++++ code/mindformers/core/metric/metric.py 2024-04-18 11:04:36.960000000 +0800 +@@ -39,6 +39,7 @@ + + from .utils import PerplexityCell + from ...dataset.labels import cluener_labels ++import ais_bench.logging as aislog + + __all__ = ['EntityScore', 'SQuADMetric', 'PerplexityMetric', 'ADGENMetric', 'PromptAccMetric', 'EmF1Metric'] + +@@ -541,6 +542,11 @@ + return None + avg_loss = float(self.total_loss / self.num_data) + result = {"loss": avg_loss, "PPL": math.exp(avg_loss)} ++ result_log="loss: {}, Perplexity: {}".format(avg_loss, math.exp(avg_loss)) ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + if self.pipeline_parallel: + print("Average Loss and PPL Metric:", result) + return result +@@ -602,6 +608,15 @@ + f'rouge-2: {self.score_dict["rouge-2"]:.4f}\n' + + f'rouge-l: {self.score_dict["rouge-l"]:.4f}\n' + + f'bleu-4: {self.score_dict["bleu-4"]:.4f}') ++ result_log = 'metric: ADGENMetric\n' + \ ++ f'rouge-1: {self.score_dict["rouge-1"]:.4f}\n' + \ ++ f'rouge-2: {self.score_dict["rouge-2"]:.4f}\n' + \ ++ f'rouge-l: {self.score_dict["rouge-l"]:.4f}\n' + \ ++ f'bleu-4: {self.score_dict["bleu-4"]:.4f}' ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + return self.score_dict + + +@@ -715,6 +730,12 @@ + result = {"Acc": acc_rate} + print(f"Acc: {('%.3f' % result.get('Acc', 0))}, total_acc_num: {self.total_acc_num}, " + f"total_num: {self.num_data}") ++ result_log = f"Acc: {('%.3f' % result.get('Acc', 0))}, total_acc_num: {self.total_acc_num}, " + \ ++ f"total_num: {self.num_data}" ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + return result + + +@@ -776,6 +797,11 @@ + """Compute final result""" + result, total_count = self.evaluate_pairs(self.gens, self.labels) + print(f"F1 score: {result.get('F1', 0)}, Em score: {result.get('Em', 0)}, total_count: {total_count}") ++ result_log = f"F1 score: {result.get('F1', 0)}, Em score: {result.get('Em', 0)}, total_count: {total_count}" ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + return result + + def mixed_segmentation(self, in_str, rm_punc=False): +diff -Nur '--exclude=*.git*' origin/mindformers/trainer/base_trainer.py code/mindformers/trainer/base_trainer.py +--- origin/mindformers/trainer/base_trainer.py 2024-04-18 11:04:36.848000000 +0800 ++++ code/mindformers/trainer/base_trainer.py 2024-04-18 11:04:36.972000000 +0800 +@@ -60,6 +60,7 @@ + from .optimizer_grouped_parameters import get_optimizer_grouped_parameters + from .utils import set_seed, check_train_data_loader_type, \ + check_eval_data_loader_type, check_optimizer_and_lr_type, check_wrapper_config ++import ais_bench.logging as aislog + + SUPPORT_TASKS = MindFormerBook().get_trainer_support_task_list() + SUPPORT_MODEL_NAMES = MindFormerBook().get_model_name_support_list() +@@ -622,6 +623,7 
@@ + compute_metrics: Optional[Union[dict, set]] = None, + **kwargs): + """Train or Fine-tune for BaseTrainer in MindFormers.""" ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) + self.kwargs = kwargs + self.train_dataset = dataset if dataset else self.train_dataset + self.eval_dataset = kwargs.get('eval_dataset', None) +@@ -632,6 +634,7 @@ + + # build dataset + logger.info(".........Build Dataset For Train..........") ++ aislog.start("dataload") # logging api + dataset = self.create_train_dataset() + logger.info("Create train dataset finish, dataset size:%d", dataset.get_dataset_size()) + +@@ -661,9 +664,11 @@ + + # check rules + check_rules(config, mode='train', network=network, dataset=dataset) ++ aislog.end("dataload") # logging api + + # build network + logger.info(".........Build Net For Train..........") ++ aislog.start("train_launch") # logging api + if network is None and self.network is None: + network = self.create_network( + default_args={"parallel_config": config.parallel_config, +@@ -776,17 +781,23 @@ + save_checkpoint_steps=save_checkpoint_steps) + # ColdHotExpertMointor needs to be placed before CheckpointMointor + callbacks.insert(1, cold_hot_mointor) +- ++ aislog.end("train_launch") + logger.info(".........Starting Training Model..........") + if get_real_rank() % 8 == 0: + pprint(config) + logger.info(".........Model Compiling, Please Wait a Moment...........") ++ all_data_sum = int(dataset.get_dataset_size() * config.train_dataset.batch_size / int(os.getenv("RANK_SIZE", '8'))) * \ +++ config.runner_config.origin_epochs * config.model.model_config.seq_length ++ aislog.start("train", all_data_sum) + model.train(config.runner_config.epochs, dataset, + callbacks=callbacks, + dataset_sink_mode=config.runner_config.sink_mode, + sink_size=config.runner_config.sink_size, + initial_epoch=config.runner_config.initial_epoch) + logger.info(".........Training Over!.............") ++ aislog.end("train", all_data_sum) ++ aislog.event("result", "OK") ++ aislog.finish() + + def evaluate_process( + self, +diff -Nur '--exclude=*.git*' origin/scripts/run_distribute.sh code/scripts/run_distribute.sh +--- origin/scripts/run_distribute.sh 2024-04-18 11:04:36.888000000 +0800 ++++ code/scripts/run_distribute.sh 2024-04-18 11:04:37.012000000 +0800 +@@ -154,7 +154,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. +- done ++ done;wait + else + for((i=${START_DEVICE}; i<${END_DEVICE}; i++)) + do +@@ -174,7 +174,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. +- done ++ done;wait + fi + else + if [ $# == 5 ] +@@ -202,7 +202,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. +- done ++ done;wait + else + for((i=${START_DEVICE}; i<${END_DEVICE}; i++)) + do +@@ -227,7 +227,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. 
+- done ++ done;wait + fi + fi + shopt -u extglob diff --git a/huawei/mindspore/mindformers/update_patch.sh b/huawei/mindspore/mindformers/update_patch.sh new file mode 100644 index 0000000..30b46fb --- /dev/null +++ b/huawei/mindspore/mindformers/update_patch.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(dirname $(readlink -f $0)) +PATCH_TOOLS_PATH="$CUR_DIR/../../../tools/patch_tool/patch_tool.sh" + +main(){ + patch_version=$1 + patch_version_path="${CUR_DIR}/patch_files/${patch_version}" + patch_config_path="${patch_version_path}/patch_config.sh" + if [ ! -f $patch_config_path ];then + echo "ERROR: can not find patch config file of ${patch_version}!" + return $ret_error + fi + . $patch_config_path # 导入patch配置文件 + changed_code_dir=$2 # 改变patch配置文件export的变量 + if [ ! -d $changed_code_dir ];then + echo "ERROR: changed_code_dir ${patch_version} not exist!" + return $ret_error + fi + dir_to_save_patch_file="${patch_version_path}" # 改变patch配置文件export的变量 + bash $PATCH_TOOLS_PATH "makepatch" || { echo "makepatch failed!";return $ret_error; } + rm -r $CUR_DIR/buildtmp + return $ret_ok +} + +main "$@" +exit $? \ No newline at end of file -- Gitee From 5f10f0451ff126590732522b6d2ee7bd992bfd8c Mon Sep 17 00:00:00 2001 From: yanhe13 Date: Tue, 7 May 2024 16:49:16 +0800 Subject: [PATCH 2/2] yaml fix --- .../glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml index d991a63..5408dfd 100644 --- a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml @@ -1,7 +1,7 @@ seed: 0 run_mode: 'train' output_dir: './output' # path to save checkpoint/strategy -load_checkpoint: ''path/to/glm2_6b.ckpt'' +load_checkpoint: '{path}/glm2_6b.ckpt' src_strategy_path_or_dir: '' auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model only_save_strategy: False -- Gitee