From 5e9d12bd7a96d3ebab6a4706f23311417dcf5e81 Mon Sep 17 00:00:00 2001 From: yanhe13 Date: Tue, 23 Apr 2024 11:01:59 +0800 Subject: [PATCH 1/2] add mindspore wkld --- huawei/mindspore/mindformers/README.md | 27 ++ huawei/mindspore/mindformers/build.sh | 69 +++++ .../mindspore/mindformers/code/benchmark.sh | 65 +++++ .../mindformers/code/evaluate_run.sh | 40 +++ .../mindformers/code/multi_nodes_run.sh | 44 ++++ .../mindformers/code/single_node_run.sh | 57 ++++ .../mindformers/models/glm2_6b/README.md | 187 +++++++++++++ .../models/glm2_6b/evaluate_scripts.sh | 14 + .../models/glm2_6b/launch_config.sh | 13 + .../run_glm2_6b_finetune_800T_A2_64G.yaml | 248 +++++++++++++++++ .../run_glm2_6b_finetune_800_32G.yaml | 249 ++++++++++++++++++ .../run_glm2_6b_finetune_eval.yaml | 230 ++++++++++++++++ .../models/glm2_6b/registed_tasks.sh | 8 + .../mindformers/models/llama2_13b/README.md | 204 ++++++++++++++ .../models/llama2_13b/evaluate_scripts.sh | 15 ++ .../models/llama2_13b/launch_config.sh | 13 + .../launch_yamls/run_llama2_13b.yaml | 213 +++++++++++++++ .../launch_yamls/run_llama2_13b_910b.yaml | 209 +++++++++++++++ .../run_llama2_13b_910b_finetune.yaml | 209 +++++++++++++++ .../launch_yamls/run_llama2_13b_finetune.yaml | 214 +++++++++++++++ .../models/llama2_13b/registed_tasks.sh | 11 + .../mindformers/models/llama2_70b/README.md | 197 ++++++++++++++ .../models/llama2_70b/evaluate_scripts.sh | 11 + .../models/llama2_70b/launch_config.sh | 13 + .../launch_yamls/predict_llama2_70b_910b.yaml | 154 +++++++++++ .../launch_yamls/run_llama2_70b_910b.yaml | 214 +++++++++++++++ .../run_llama2_70b_910b_finetune.yaml | 214 +++++++++++++++ .../models/llama2_70b/registed_tasks.sh | 8 + .../mindformers/models/llama2_7b/README.md | 185 +++++++++++++ .../models/llama2_7b/evaluate_scripts.sh | 15 ++ .../models/llama2_7b/launch_config.sh | 13 + .../llama2_7b/launch_yamls/run_llama2_7b.yaml | 213 +++++++++++++++ .../launch_yamls/run_llama2_7b_910b.yaml | 210 +++++++++++++++ .../run_llama2_7b_910b_finetune.yaml | 209 +++++++++++++++ .../launch_yamls/run_llama2_7b_finetune.yaml | 214 +++++++++++++++ .../models/llama2_7b/registed_tasks.sh | 10 + .../patch_files/r1.1.rc1/patch_config.sh | 15 ++ .../patch_files/r1.1.rc1/r1.1.rc1.patch | 209 +++++++++++++++ huawei/mindspore/mindformers/update_patch.sh | 29 ++ 39 files changed, 4472 insertions(+) create mode 100644 huawei/mindspore/mindformers/README.md create mode 100644 huawei/mindspore/mindformers/build.sh create mode 100644 huawei/mindspore/mindformers/code/benchmark.sh create mode 100644 huawei/mindspore/mindformers/code/evaluate_run.sh create mode 100644 huawei/mindspore/mindformers/code/multi_nodes_run.sh create mode 100644 huawei/mindspore/mindformers/code/single_node_run.sh create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/README.md create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml create mode 100644 huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/README.md create mode 100644 
huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/README.md create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/README.md create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml create mode 100644 huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh create mode 100644 huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh create mode 100644 huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch create mode 100644 huawei/mindspore/mindformers/update_patch.sh diff --git a/huawei/mindspore/mindformers/README.md b/huawei/mindspore/mindformers/README.md new file mode 100644 index 0000000..a1c8643 --- /dev/null +++ b/huawei/mindspore/mindformers/README.md @@ -0,0 +1,27 @@ +# Mindformers 负载导航 +## MindFormers 训练负载包版本归档 +### r1.1.rc1 版本 +#### mindspore版本 +```bash +mindspore >= 2.3 +``` +#### 取包链接 +|模型|负载包链接| +| ----- | ------------------------------- | +|LLaMA2 7B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-llama2_7b-r1.1.rc1.tar.gz)
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-llama2_7b-r1.1.rc1.tar.gz)| +|LLaMA2 13B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-llama2_13b-r1.1.rc1.tar.gz)
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-llama2_13b-r1.1.rc1.tar.gz)| +|LLaMA2 70B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-llama2_70b-r1.1.rc1.tar.gz)&#13;
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-llama2_70b-r1.1.rc1.tar.gz)| +|GLM2 6B|[x86_64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-x86_64-2.0-training-mindformers-glm2_6b-r1.1.rc1.tar.gz)
[aarch64](https://aisbench.obs.cn-north-4.myhuaweicloud.com/workload_packages/train/mindformers/r1.1.rc1/Ais-Benchmark-Stubs-aarch64-2.0-training-mindformers-glm2_6b-r1.1.rc1.tar.gz)| + +## 贡献指南 +### 使用build.sh出负载包 +```bash +bash build.sh <训练任务类型> +``` +在`./output`路径下生成构建好的负载包,请自行打包成压缩包 +### 使用update_patch.sh更新mindformers打点的patch版本 +``` +bash update_patch.sh <传入修改好的完成打点的ModelLink代码路径> +``` + + diff --git a/huawei/mindspore/mindformers/build.sh b/huawei/mindspore/mindformers/build.sh new file mode 100644 index 0000000..f96060e --- /dev/null +++ b/huawei/mindspore/mindformers/build.sh @@ -0,0 +1,69 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_DIR=$(dirname $(readlink -f $0)) +LLM_FRAMEWORK_NAME="mindformers" +WORKLOAD_PACKAGE_NAME="" +WORKLOAD_PACKAGE_DIR="" +PATCH_TOOLS_PATH="$CUR_DIR/../../../tools/patch_tool/patch_tool.sh" +PACKAGE_OUTPUT_PATH="$CUR_DIR/output" +stubs_package_path="" +model_type="" # 模型的文件夹目录,如llama2_7b +patch_version="" # patch版本,在patch_files文件夹下的目录名,如v0 + +load_stubs_package(){ + if [ ! -f $stubs_package_path ];then + echo "stubs package: ${stubs_package_path} not exist!" + return $ret_error + fi + stubs_basename=$(basename "$stubs_package_path") + stubs_subname=${stubs_basename%.tar.gz} + cp -f "$stubs_package_path" "$PACKAGE_OUTPUT_PATH" + cd "${PACKAGE_OUTPUT_PATH}" + tar -xzf "${PACKAGE_OUTPUT_PATH}/${stubs_basename}" + rm -f "${PACKAGE_OUTPUT_PATH}/${stubs_basename}" + stubs_content_dir=$(find ./ -name "Ais-Benchmark-Stubs*" -type d) + cd $CUR_DIR + WORKLOAD_PACKAGE_NAME="${stubs_subname}-training-${LLM_FRAMEWORK_NAME}-${model_type}-${patch_version}" + WORKLOAD_PACKAGE_DIR=${PACKAGE_OUTPUT_PATH}/${WORKLOAD_PACKAGE_NAME} + mv "${PACKAGE_OUTPUT_PATH}/${stubs_content_dir}" "${WORKLOAD_PACKAGE_DIR}" + return $ret_ok +} + +add_workload_code(){ + code_path="${WORKLOAD_PACKAGE_DIR}/code" + rm -r "${code_path}" + cp -rf "${CUR_DIR}/code" "${WORKLOAD_PACKAGE_DIR}" || { echo "$model_type not found";return $ret_error; } + patch_config_path="${CUR_DIR}/patch_files/${patch_version}/patch_config.sh" + . $patch_config_path # 导入patch配置文件 + result_code_dir="${code_path}/${LLM_FRAMEWORK_NAME}" # 改变patch配置文件export的变量 + patch_file_path="${CUR_DIR}/patch_files/${patch_version}/${patch_version}.patch" # 改变patch配置文件export的变量 + bash $PATCH_TOOLS_PATH "applypatch" || { echo "apply changes to mindformers code failed!";return $ret_error; } # 调用patch_tool + cp -rf ${CUR_DIR}/models/${model_type}/launch_yamls/ ${code_path} || { echo "launch yaml file not found";return $ret_error; } + cp -f ${CUR_DIR}/models/${model_type}/*.sh ${code_path} || { echo "registed task not found";return $ret_error; } # launch_config.sh和registed task放入code + cp -f ${CUR_DIR}/models/${model_type}/README.md ${WORKLOAD_PACKAGE_DIR} || { echo "copy readme failed";return $ret_error; } + return $ret_ok +} + + +main(){ + stubs_package_path=$1 + model_type=$2 # 任务的文件夹目录,如llama2_7b + patch_version=$3 # patch版本,在patch_files文件夹下的目录名,如v0 + + # 清空原来的出包路径内容,新建出包的路径 + if [ -d $PACKAGE_OUTPUT_PATH ];then + rm -rf $PACKAGE_OUTPUT_PATH + fi + mkdir -p $PACKAGE_OUTPUT_PATH + + load_stubs_package || { echo "ERROR: load stubs package failed!";return $ret_error; } + add_workload_code || { echo "ERROR: add workload code failed!";return $ret_error; } + rm -rf $CUR_DIR/buildtmp # 清空patch的临时数据 + + return $ret_ok +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/mindspore/mindformers/code/benchmark.sh b/huawei/mindspore/mindformers/code/benchmark.sh new file mode 100644 index 0000000..0c1aaea --- /dev/null +++ b/huawei/mindspore/mindformers/code/benchmark.sh @@ -0,0 +1,65 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 + +CUR_DIR=$(cd "$(dirname "$0")";pwd) +WORK_DIR="${CUR_DIR}/../work" +RESULT_DIR="${CUR_DIR}/../result" +RUN_MODE="single_node" + +source ${CUR_DIR}/registed_tasks.sh # 导入注册了的启动任务 + +function contains() { + local list=("$@") + for element in "${list[@]}"; do + if [ "$element" == "$AIS_TRAIN_YAML" ]; then + return $ret_ok # 找到,返回真 + fi + done + return $ret_error # 没有找到,返回假 +} + +function train_yaml_select(){ # from registed_tasks.sh + if contains "${SINGLE_NODE_LAUNCH[@]}"; then + RUN_MODE="single_node" + echo "launch with ${AIS_TRAIN_YAML}.yaml using single node mode." + elif contains "${MULTI_NODES_LAUNCH[@]}"; then + RUN_MODE="multi_nodes" + echo "launch with ${AIS_TRAIN_YAML}.yaml using multi nodes mode." + else + echo "invalid yaml name ${AIS_TRAIN_YAML}" + return $ret_error + fi + return $ret_ok +} + +function prepare() { + export PYTHONPATH=$PYTHONPATH:${CUR_DIR} + rm -f ${RESULT_DIR}/* + if [ -d ${WORK_DIR} ];then + rm -rf ${WORK_DIR} + fi + mkdir -p ${WORK_DIR} + cp -r ${CUR_DIR}/* ${WORK_DIR} +} + +function collect_result() { + python3 -c "from ais_bench.logging import collect_report; collect_report('training', ['$CUR_DIR/../result'])" +} + +function main() { + source "${CUR_DIR}/launch_config.sh" + train_yaml_select || { echo "train task select failed!";return $ret_error; } + prepare + if [ "${RUN_MODE}" == "single_node" ]; then + bash "${WORK_DIR}/single_node_run.sh" + else + bash "${WORK_DIR}/multi_nodes_run.sh" + fi + bash "${WORK_DIR}/evaluate_run.sh" || { echo "evaluate run failed!";return $ret_error; } + collect_result || { echo "collect train result failed!";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? 
\ No newline at end of file diff --git a/huawei/mindspore/mindformers/code/evaluate_run.sh b/huawei/mindspore/mindformers/code/evaluate_run.sh new file mode 100644 index 0000000..a3b7ec6 --- /dev/null +++ b/huawei/mindspore/mindformers/code/evaluate_run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(cd "$(dirname "$0")";pwd) +LAUNCH_SCRIPT_PATH=${CUR_DIR}/mindformers/scripts/ +TOOLS_SCRIPT_PATH=${CUR_DIR}/mindformers/mindformers/tools/ +RESULT_PATH=${CUR_DIR}/result/ +OUTPUT_PATH=${CUR_DIR}/mindformers/output/ + +function merge_ckpt() { + cmd="${AIS_PYTHON} ${TOOLS_SCRIPT_PATH}/transform_ckpt.py \ + --src_ckpt_strategy ${OUTPUT_PATH}/strategy/ \ + --src_ckpt_dir ${OUTPUT_PATH}/checkpoint/ \ + --dst_ckpt_dir ${OUTPUT_PATH}/target_checkpoint/ \ + --prefix ${AIS_MODEL_NAME}" + eval $cmd || { echo "exec merge ckpt script ${cmd} failed!";return $ret_error; } + return $ret_ok +} + +function launch_evaluate() { + source ${CUR_DIR}/evaluate_scripts.sh + export AIS_WORK_RESULT_PATH=${RESULT_PATH} + cmd=${mindformers_workload_eval_cmd} # from evaluate_scripts.sh + if [ "${cmd}" == "" ]; then + echo "evaluate cmd not given, skip" + return $ret_ok + fi + eval $cmd || { echo "launch eval cmd: ${cmd} failed!";return $ret_error; } + return $ret_ok +} + +function main() { + merge_ckpt || { echo "merge ckpt failed!";return $ret_error; } + launch_evaluate || { echo "launch evaluate failed!";return $ret_error; } + cp -r ${RESULT_PATH} ${CUR_DIR}/../ || { echo "cp work result to base result failed!";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? diff --git a/huawei/mindspore/mindformers/code/multi_nodes_run.sh b/huawei/mindspore/mindformers/code/multi_nodes_run.sh new file mode 100644 index 0000000..68b794e --- /dev/null +++ b/huawei/mindspore/mindformers/code/multi_nodes_run.sh @@ -0,0 +1,44 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(cd "$(dirname "$0")";pwd) +WORK_DIR="${CUR_DIR}/../work" + +function cluster_exist_check() { + $AIS_PYTHON -c "import ais_bench.cluster" || { echo "can't find cluster_tools be installed";return $ret_error; } +} + +function cluster_deploy_code() { + cmd="rm -rf ./work" + $AIS_PYTHON -m ais_bench.cluster multi_exec -n "${AIS_NODE_FILE_PATH}" -c "${cmd}" || { echo "clear work dir failed";return $ret_error; } + $AIS_PYTHON -m ais_bench.cluster multi_put -n "${AIS_NODE_FILE_PATH}" -s "${WORK_DIR}" -d "./" || { echo "cluster deploy code failed";return $ret_error; } +} + +function cluster_parallel_run() { + cmd="source /etc/profile; \ + source ./work/launch_config.sh; \ + export AIS_WORK_PATH=\$PWD/work/; \ + bash ./work/single_node_run.sh" + $AIS_PYTHON -m ais_bench.cluster multi_exec -n "${AIS_NODE_FILE_PATH}" -c "${cmd}" || { echo "cluster parallel run failed";return $ret_error; } +} + +function cluster_collect_result() { + # collect outputs of nodes + $AIS_PYTHON -m ais_bench.cluster multi_get -n "${AIS_NODE_FILE_PATH}" -s "./work/mindformers/output" \ + -d "${CUR_DIR}/mindformers/" || { echo "cluster collect output failed";return $ret_error; } + # collect logging dump file from nodes + $AIS_PYTHON -m ais_bench.cluster multi_get -n "${AIS_NODE_FILE_PATH}" -s "./work/result/" \ + -d "${CUR_DIR}/" || { echo "cluster collect result failed";return $ret_error; } +} + +function main() { + source "${CUR_DIR}/launch_config.sh" + cluster_exist_check || { echo "cluster_exist_check failed";return $ret_error; } + cluster_deploy_code || { echo "cluster_deploy_code failed";return $ret_error; } + 
cluster_parallel_run || { echo "cluster_parallel_run failed";return $ret_error; } + cluster_collect_result || { echo "cluster_collect_result failed";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? \ No newline at end of file diff --git a/huawei/mindspore/mindformers/code/single_node_run.sh b/huawei/mindspore/mindformers/code/single_node_run.sh new file mode 100644 index 0000000..bff542c --- /dev/null +++ b/huawei/mindspore/mindformers/code/single_node_run.sh @@ -0,0 +1,57 @@ +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(cd "$(dirname "$0")";pwd) +LAUNCH_SCRIPT_PATH=${CUR_DIR}/mindformers/scripts/ +RESULT_PATH=${CUR_DIR}/result/ +NODE_DEVICE_INFO="[0,8]" + +function get_node_info() { + if [ "${NODE_ID}" == "" ]; then + NODE_DEVICE_INFO="[0,${AIS_DEVICE_NUM}]" + else + RANK_START=`expr ${NODE_ID} \* $AIS_DEVICE_NUM` # NODE_ID get from cluster_tools export + RANK_ID_MAX=$[AIS_DEVICE_NUM+RANK_START] + NODE_DEVICE_INFO="[$RANK_START,$RANK_ID_MAX]" + fi +} + +function prepare_and_clear() { + export AIS_WORK_PATH=${CUR_DIR} + source "${CUR_DIR}/launch_config.sh" + export MS_ASCEND_CHECK_OVERFLOW_MODE="INFNAN_MODE" # 推荐开启INFNAN模式,llama2_7b和70b 不用设置该项 + echo "single node run..." + mkdir -p ${RESULT_PATH} || { echo "mkdir work result dir:${RESULT_PATH} failed!";return $ret_error; } # logging的落盘文件在里面 + return $ret_ok +} + +function install_mindformers() { + pip3 install ${CUR_DIR}/mindformers/ --force-reinstall || { return $ret_error; } +} + +function launch_train() { + get_node_info + cd ${LAUNCH_SCRIPT_PATH} + export AIS_WORK_RESULT_PATH=${RESULT_PATH} + cmd="bash run_distribute.sh \ + ${CUR_DIR}/${AIS_RANK_TABLE_FILE} \ + ${CUR_DIR}/launch_yamls/${AIS_TRAIN_YAML}.yaml \ + ${NODE_DEVICE_INFO} \ + ${AIS_TRAIN_TASK_TYPE} \ + ${AIS_RANK_NUM}" + eval $cmd || { echo "exec launch train scripts ${cmd} failed!";return $ret_error; } + cd ${CUR_DIR} + return $ret_ok +} + + +function main() { + prepare_and_clear || { echo "prepare_and_clear failed!";return $ret_error; } + install_mindformers || { echo "install mindformers failed";return $ret_error; } + launch_train || { echo "launch_train failed!";return $ret_error; } + return $ret_ok +} + +main "$@" +exit $? + diff --git a/huawei/mindspore/mindformers/models/glm2_6b/README.md b/huawei/mindspore/mindformers/models/glm2_6b/README.md new file mode 100644 index 0000000..b2b214e --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/README.md @@ -0,0 +1,187 @@ +# 基于Mindspore/mindformers框架的glm2大模型训练负载使用指南 +本文主要介绍使用基于mindformers LLaMA2 7b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-glm2_6b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机的GLM2 6b模型的微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +## 查看GLM2 6b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-glm2_6b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-glm2_6b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_glm2_6b_finetune_800T_A2_64G.yaml +| │   ├──...... 
+│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
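+开始资源准备前,可以先确认 logging 打点工具已经安装成功(仅为示意:导入路径取自本负载包`code/benchmark.sh`中的用法,`python3`请替换为实际使用的解释器):
+```bash
+# 能正常导入 collect_report 即说明 logging 打点工具安装成功
+python3 -c "from ais_bench.logging import collect_report; print('logging ok')"
+```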
+### rank_table_file准备 +确保`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。
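+可以先用类似下面的命令确认网络配置并生成 rank_table_file(仅为示意:`hccn_tool`的设备编号请按实际卡数调整;`hccl_tools.py`为假设的脚本路径,实际路径与参数请以下文引用的官方章节为准):
+```bash
+# 确认 /etc/hccn.conf 中已为每张卡写入 address_N 等条目
+cat /etc/hccn.conf
+# 逐卡查询已配置的 IP(假设单机 8 卡,设备编号 0~7)
+for i in {0..7}; do hccn_tool -i $i -ip -g; done
+# 生成单机 8 卡的 rank_table_file(假设使用 mindformers 自带的 hccl_tools.py,从负载包一级目录执行)
+python3 code/mindformers/mindformers/tools/hccl_tools.py --device_num "[0,8)"
+```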
+ +参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“生成RANK_TABLE_FILE”(单机多卡情况)章节。 + +### 模型权重下载与转换 +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“模型权重下载与转换”章节准备GLM2 6b的权重和词表文件; +- 资源链接: + - [glm2_6b.ckpt](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/XFormer_for_mindspore/glm2/glm2_6b.ckpt)(点击直接下载) + - [tokenizer](https://ascend-repo-modelzoo.obs.cn-east-2.myhuaweicloud.com/XFormer_for_mindspore/glm2/tokenizer.model)(点击直接下载) +### 数据集准备 +- 参考[glm2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/glm2.md)的“微调--数据集准备”章节准备微调和评测所需的数据集; +- 资源链接: + - [ADGEN数据集](https://cloud.tsinghua.edu.cn/f/b3f119a008264b1cabd1/?dl=1)(下载后需解压) +- 下载解压后目录结构为: + ``` + AdvertiseGen + ├── train.json + └── dev.json + ``` +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_glm2_6b_finetune_800_32G" \ + "run_glm2_6b_finetune_800T_A2_64G" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=() +``` + +```bash +"run_glm2_6b_finetune_800T_A2_64G" # 8卡 910B1,910B2,910B3 微调任务 +"run_glm2_6b_finetune_800_32G" # 8卡 910A,910B4 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=5 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="finetune" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="glm2_6b" +export AIS_TRAIN_YAML="run_glm2_6b_finetune_800_32G" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 + +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +```yaml +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/glm2_6b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +``` +`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" # 需要填入train.json数据集的路径 + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" # 需要填入词表文件tokenizer.model的实际路径 +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`
+`vocab_file`支持绝对路径以及相对路径。`vocab_file`填入的相对路径`../../`实际对应负载包路径`code/mindformers/ + +### 修改评测启动脚本 +`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_glm2_6b_finetune_eval.yaml \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/glm2_6b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" +``` +按默认启动脚本执行,需要修改`code/launch_yamls/run_glm2_6b_finetune_eval.yaml`配置文件的内容: +```yaml +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" # 需要填入评测数据集dev.json的实际路径 + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" # 需要填入词表文件tokenizer.model的实际路径 +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`
+`vocab_file`支持绝对路径以及相对路径。`vocab_file`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + + +## 3 负载启动 +### 3.1 在线测试 +执行命令 +```bash +./ais-bench-stubs +``` +### 3.2 轻量化离线测试 +执行命令 +```bash +./ais-bench-stubs test +``` \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh new file mode 100644 index 0000000..af81df3 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/evaluate_scripts.sh @@ -0,0 +1,14 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_glm2_6b_finetune_eval.yaml \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/glm2_6b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh b/huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh new file mode 100644 index 0000000..b0e71f7 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=5 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="finetune" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="glm2_6b" +export AIS_TRAIN_YAML="run_glm2_6b_finetune_800_32G" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml new file mode 100644 index 0000000..d991a63 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml @@ -0,0 +1,248 @@ +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: ''path/to/glm2_6b.ckpt'' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "59GB" # 59GB for Atlas 800T A2 + save_graphs: False + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 192 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: False + use_flash_attention: True # when use FlashAttention, seq_length should be multiple of 16 + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: 1788 +eval_epoch_interval: -1 + +metric: + type: PerplexityMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + type: GLMProcessor + +# ==== dataset config ==== +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# optimizer +optimizer: + type: 
FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True # optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 100000000 # big enough + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml new file mode 100644 index 0000000..8cc03b1 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800_32G.yaml @@ -0,0 +1,249 @@ +seed: 0 +run_mode: 'train' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/glm2_6b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" # 59GB for Atlas 800T A2 + save_graphs: False + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 192 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: False + use_flash_attention: False # when use FlashAttention, seq_length should be multiple of 16 + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: 1788 +eval_epoch_interval: -1 + +metric: + type: PerplexityMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + # vocab_file: "/path/to/tokenizer.model" + type: GLMProcessor + +# ==== dataset config ==== +train_dataset: &train_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/train.json" + shuffle: True + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + input_columns: ["input_ids", "labels"] + max_source_length: 64 + max_target_length: 127 + ignore_pad_token_for_loss: True + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +train_dataset_task: + type: KeyWordGenDataset + dataset_config: *train_dataset + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "train" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + max_source_length: 256 + max_target_length: 256 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# 
optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True # optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 100000000 # big enough + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml new file mode 100644 index 0000000..1817d6b --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_eval.yaml @@ -0,0 +1,230 @@ +seed: 0 +run_mode: 'eval' +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: './output/target_checkpoint/rank_0/glm2_6b0.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False + +# ==== context config ==== +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" # 59GB for Atlas 800T A2 + save_graphs: False + save_graphs_path: "./ir-graphs" + device_id: 0 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
+ +# ==== model config ==== +model: + model_config: + type: ChatGLM2Config + # only for incremental infer, + # when parallel_mode=1 and full_batch=True, + # batch_size should be set to runner_config.batch_size * data_parallel; + # when single card, batch_size should be set to runner_config.batch_size + batch_size: 8 + num_layers: 28 + padded_vocab_size: 65024 + hidden_size: 4096 + ffn_hidden_size: 13696 + kv_channels: 128 + num_attention_heads: 32 + seq_length: 256 + hidden_dropout: 0.0 + attention_dropout: 0.0 + layernorm_epsilon: 1e-5 + rmsnorm: True + apply_residual_connection_post_layernorm: False + post_layer_norm: True + add_bias_linear: False + add_qkv_bias: True + bias_dropout_fusion: True + multi_query_attention: True + multi_query_group_num: 2 + apply_query_key_layer_scaling: True + attention_softmax_in_fp32: True + fp32_residual_connection: False + quantization_bit: 0 + pre_seq_len: None + prefix_projection: False + param_init_type: "float16" + compute_dtype: "float16" + layernorm_compute_type: "float32" + use_past: True + use_prompt_flash_attention: False + use_incre_flash_attention: False + eos_token_id: 2 + pad_token_id: 0 + repetition_penalty: 1.0 + max_decode_length: 256 + checkpoint_name_or_path: "glm2_6b" + top_k: 1 + top_p: 1 + do_sample: True + arch: + type: ChatGLM2ForConditionalGeneration + +trainer: + type: CausalLanguageModelingTrainer + model_name: 'glm2_6b' +# if True do, evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: 500 +eval_epoch_interval: -1 + +metric: + type: ADGENMetric + +processor: + return_tensors: ms + tokenizer: + type: ChatGLM2Tokenizer + bos_token: '' + eos_token: '' + end_token: '' + mask_token: '[MASK]' + gmask_token: '[gMASK]' + pad_token: '' + unk_token: '' + # vocab_file: "/path/to/tokenizer.model" + type: GLMProcessor + +# ==== dataset config ==== + +eval_dataset: &eval_dataset + data_loader: + type: ADGenDataLoader + dataset_dir: "/path/to/AdvertiseGen/dev.json" + shuffle: False + phase: "eval" + version: 2 + origin_columns: ["content", "summary"] + tokenizer: + type: ChatGLM2Tokenizer + vocab_file: "/path/to/tokenizer.model" + max_source_length: 256 + max_target_length: 256 + ignore_pad_token_for_loss: True + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 8 + repeat: 1 + numa_enable: False + prefetch_size: 1 + seed: 0 + +eval_dataset_task: + type: KeyWordGenDataset + dataset_config: *eval_dataset + +# ==== runner config ==== +runner_config: + epochs: 1 + batch_size: 8 + sink_mode: True + sink_size: 4 + +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# lr sechdule +lr_schedule: + type: polynomial + learning_rate: 5.e-5 + lr_end: 1.e-6 + warmup_steps: 0 + total_steps: -1 # -1 means it will load the total steps of the dataset +layer_scale: False +layer_decay: 0.65 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + weight_decay: 0.1 +lr_scale: False +lr_scale_factor: 256 + +# parallel config +use_parallel: True +parallel: + parallel_mode: 1 # 0-dataset, 1-semi, 2-auto, 3-hybrid + gradients_mean: False + loss_repeated_mean: True + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: False # 
optimizer shard + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" + only_trainable_params: False # 设置成 False,才能在策略文件中保存所有参数 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + expert_parallel: 1 + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +micro_batch_interleave_num: 1 + +# moe +moe_config: + expert_num: 1 + capacity_factor: 1.05 + aux_loss_factor: 0.05 + num_experts_chosen: 1 + +# recompute +recompute_config: + recompute: True + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# autotune +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +# profile +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: True +profile_communication: True +profile_memory: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "glm2-6b" + save_checkpoint_steps: 1000 + keep_checkpoint_max: 1 + integrated_save: False + async_save: False + - type: ObsMonitor + keep_last: False +eval_callbacks: + - type: ObsMonitor + keep_last: False diff --git a/huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh b/huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh new file mode 100644 index 0000000..a28fb04 --- /dev/null +++ b/huawei/mindspore/mindformers/models/glm2_6b/registed_tasks.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_glm2_6b_finetune_800_32G" \ + "run_glm2_6b_finetune_800T_A2_64G" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=() \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_13b/README.md b/huawei/mindspore/mindformers/models/llama2_13b/README.md new file mode 100644 index 0000000..0c019ab --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/README.md @@ -0,0 +1,204 @@ +# AISBench 基于Mindspore/mindformers框架的LLaMA2 13b 训练负载包使用指南 +本文主要介绍使用基于mindformers LLaMA2 13b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_13b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机或多机的LLaMA2 13b模型的预训练或微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +|管理节点|运行ais-bench-stubs的环境,只有一个| +|计算节点|执行训练任务的环境,可以有多个;计算节点中有一个作为管理节点| + +## 查看llama2 13b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_13b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_13b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_llama2_13b_910b.yaml +| │   ├──...... 
+│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** + +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +**所有计算节点需要准备**+ +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +**所有计算节点需要安装** +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 +### cluster_tools准备(多机运行需要) +**仅管理节点需要安装** +从[cluster_tools分布式运行工具发行版](https://gitee.com/aisbench/cluster_tools/releases)获取最新的发行版。 +参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“安装与卸载/安装cluster_tools”章节安装cluster_tools分布式运行工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
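+开始资源准备前,可以先在相应节点上自检依赖是否就绪(仅为示意:导入路径取自本负载包`code/`下脚本中的调用,`python3`请替换为实际使用的解释器):
+```bash
+# 所有计算节点:验证 MindSpore 能在 NPU 上运行
+python3 -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()"
+# 所有计算节点:验证 logging 打点工具已安装
+python3 -c "from ais_bench.logging import collect_report; print('logging ok')"
+# 仅管理节点(多机运行时需要):验证 cluster_tools 已安装
+python3 -c "import ais_bench.cluster; print('cluster_tools ok')"
+```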
+### rank_table_file准备 +**部署在管理节点上** +rank_table_file是一个包含集群节点和加速卡ip信息的json文件。 +准备rank_table_file前确保计算节点的`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”章节。 + +### node_file准备(多机运行需要) +**部署在管理节点上** +node_file是需要给cluster_tools传入的文件,它包含了计算节点的具体信息。 +node_file需要自建,格式参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“集群节点信息文件内容格式”章节自行创建。**注意,node_file中计算节点的顺序需要与rank_table_file中的计算节点顺序相同。** + +### 模型权重下载与转换 +微调任务需要,预训练任务不涉及,**部署在所有计算节点上**。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节准备LLaMA2 13b的ckpt文件。 + +### 数据集准备 +#### 预训练数据集准备 +**部署在所有计算节点上**,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节准备转换好的预训练数据集。 +#### 微调数据集准备 +部署在所有计算节点上,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节准备转换好的微调数据集。 +#### 评测数据集准备 +依据实际的评测需求准备数据集,**部署在管理节点上**。 +**wikitext** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节准备评测数据集。 +**SQuAD** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测阅读理解/获取数据集”章节准备评测数据集。 + + +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_13b_910b" \ + "run_llama2_13b_910b_finetune" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_13b" \ + "run_llama2_13b_finetune" +) +``` + +```bash +"run_llama2_13b_910b_finetune" # 单机 8卡 910B1,910B2,910B3 微调任务 +"run_llama2_13b_910b" # 单机 8卡 910B1,910B2,910B3 预训练任务 +"run_llama2_13b_finetune" # 多机 每机8卡 910A,910B4 微调任务 +"run_llama2_13b" # 多机 每机8卡 910A,910B4 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_13b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_13b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=16 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 + +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +#### 预训练任务 +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" # 需要填入.mindrecord后缀的wikitext-2数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`。 + +#### 微调任务 +```yaml +seed: 0 +output_dir: './output' # path to save checkpoint/strategy,维持默认 +load_checkpoint: '{path}/llama2_13b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +src_strategy_path_or_dir: '' +``` 
+`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" # 需要填入.mindrecord后缀的alpaca数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +### 修改评测启动脚本 +`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_13b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_13b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" +``` +其中`--eval_dataset_dir`需要用户自行传入wikitext-2的.mindrecord后缀的评测数据集的绝对路径。 + +## 启动测试 +### 在线测试 +在线测试的前置准备请参考`STUBS_PACKAGE_INTRO.md`文档。启动命令: +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +启动命令: +```bash +./ais-bench-stubs test +``` + diff --git a/huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh new file mode 100644 index 0000000..58c6d5d --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/evaluate_scripts.sh @@ -0,0 +1,15 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_13b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_13b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh b/huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh new file mode 100644 index 0000000..5107c7d --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_13b" +export AIS_TRAIN_YAML="run_llama2_13b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml new file mode 100644 index 0000000..15d63b2 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b.yaml @@ -0,0 +1,213 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy 
+load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 3.e-4 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 3.e-4 + lr_end: 3.e-5 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "${AIS_WORK_PATH}/path/to/wiki4096.mindrecord" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + use_seq_parallel: False + micro_batch_num: 16 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "31GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml new file mode 100644 index 0000000..5da1a20 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b.yaml @@ -0,0 +1,209 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False + +# runner config +runner_config: + epochs: 2 + batch_size: 2 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-4 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-4 + lr_end: 1.e-5 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml new file mode 100644 index 0000000..bd05b96 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_910b_finetune.yaml @@ -0,0 +1,209 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_13b.ckpt' # 必填 +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False + +# runner config +runner_config: + epochs: 2 + batch_size: 4 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids", "labels"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml new file mode 100644 index 0000000..f0521c0 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/launch_yamls/run_llama2_13b_finetune.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_13b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_13b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. 
+eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.999 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "../../path/to/alpaca-fastchat2048.mindrecord" # abs path or relative path(../../ mean) + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 16 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 2 + use_seq_parallel: False + micro_batch_num: 16 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_13b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "31GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 2048 + hidden_size: 5120 + num_layers: 40 + num_heads: 40 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_13b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 4294967296 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
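上面 run_llama2_13b_finetune.yaml 中的 `load_checkpoint` 与 `dataset_dir` 既支持绝对路径,也支持以 `code/mindformers/` 为基准的相对路径(`../../` 即负载包一级目录)。下面给出一个仅供参考的启动前自检脚本草稿,其中的路径均为占位假设,需替换为实际准备好的权重与 alpaca 数据集路径:

```bash
#!/bin/bash
# 示意脚本:启动微调前确认权重与数据集可读(假设从负载包一级目录执行)
MINDFORMERS_DIR=./code/mindformers                          # yaml 中相对路径的解析基准
CKPT_PATH="/{path}/llama2_13b.ckpt"                         # 占位:与 yaml 的 load_checkpoint 保持一致
DATASET_DIR="../../path/to/alpaca-fastchat2048.mindrecord"  # 占位:与 yaml 的 dataset_dir 保持一致

# 相对路径按 code/mindformers/ 解析,绝对路径原样使用
case "${DATASET_DIR}" in
    /*) RESOLVED_DATASET="${DATASET_DIR}" ;;
    *)  RESOLVED_DATASET="${MINDFORMERS_DIR}/${DATASET_DIR}" ;;
esac

for f in "${CKPT_PATH}" "${RESOLVED_DATASET}"; do
    [ -f "${f}" ] || echo "[WARN] 文件不存在: ${f},请先修改 yaml 中对应字段" >&2
done
```
若检查到缺失,请先回到“资源准备”章节补齐对应文件,再拉起训练。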
diff --git a/huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh b/huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh new file mode 100644 index 0000000..399b9fc --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_13b/registed_tasks.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_13b_910b" \ + "run_llama2_13b_910b_finetune" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_13b" \ + "run_llama2_13b_finetune" +) \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_70b/README.md b/huawei/mindspore/mindformers/models/llama2_70b/README.md new file mode 100644 index 0000000..2bcedae --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/README.md @@ -0,0 +1,197 @@ +# AISBench 基于Mindspore/mindformers框架的LLaMA2 70b 训练负载包使用指南 +本文主要介绍使用基于mindformers LLaMA2 70b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_70b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机或多机的LLaMA2 70b模型的预训练或微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +|管理节点|运行ais-bench-stubs的环境,只有一个| +|计算节点|执行训练任务的环境,可以有多个;计算节点中有一个作为管理节点| + +## 查看llama2 13b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_70b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_70b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_llama2_70b_910b.yaml +| │   ├──...... +│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** + +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +**所有计算节点需要准备**+ +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +**所有计算节点需要安装** +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 +### cluster_tools准备(多机运行需要) +**仅管理节点需要安装** +从[cluster_tools分布式运行工具发行版](https://gitee.com/aisbench/cluster_tools/releases)获取最新的发行版。 +参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“安装与卸载/安装cluster_tools”章节安装cluster_tools分布式运行工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
+### rank_table_file准备 +**部署在管理节点上** +rank_table_file是一个包含集群节点和加速卡ip信息的json文件。 +准备rank_table_file前确保计算节点的`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”章节。 + +### node_file准备(多机运行需要) +**部署在管理节点上** +node_file是需要给cluster_tools传入的文件,它包含了计算节点的具体信息。 +node_file需要自建,格式参考[cluster_tools分布式运行工具主页](https://gitee.com/aisbench/cluster_tools/)的“集群节点信息文件内容格式”章节自行创建。**注意,node_file中计算节点的顺序需要与rank_table_file中的计算节点顺序相同。** + +### 模型权重下载与转换 +微调任务需要,预训练任务不涉及,**部署在所有计算节点上**。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节准备LLaMA2 70b的ckpt文件。 + +### 数据集准备 +#### 预训练数据集准备 +**部署在所有计算节点上**,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节准备转换好的预训练数据集。 +#### 微调数据集准备 +部署在所有计算节点上,如果不想手动部署,可以放在负载包的`code/`路径中。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节准备转换好的微调数据集。 +#### 评测数据集准备 +依据实际的评测需求准备数据集,**部署在管理节点上**。 +**wikitext** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节准备评测数据集。 +**SQuAD** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测阅读理解/获取数据集”章节准备评测数据集。 + + +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=() +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_70b_910b" \ + "run_llama2_70b_910b_finetune" +) +``` + +```bash +"run_llama2_70b_910b_finetune" # 多机 每机8卡 910B1,910B2,910B3 微调任务 +"run_llama2_70b_910b" # 多机 每机8卡 910B1,910B2,910B3 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_70b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_70b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=64 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 + +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +#### 预训练任务 +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" # 需要填入.mindrecord后缀的wikitext-2数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/`。 + +#### 微调任务 +```yaml +seed: 0 +output_dir: './output' # path to save checkpoint/strategy,维持默认 +load_checkpoint: '{path}/llama2_70b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +src_strategy_path_or_dir: '' +``` +`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: 
MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" # 需要填入.mindrecord后缀的alpaca数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +### 修改评测启动脚本 +`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改配置文件 +export mindformers_workload_eval_cmd="bash ${LAUNCH_SCRIPT_PATH}/run_distribute.sh \ +--config ${CUR_DIR}/launch_yamls/predict_llama2_70b_910b.yaml \ +[0,8] \ +eval" +``` +具体运行`predict_llama2_70b_910b.yaml`评测脚本需要做的准备,请参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)中的“评测/分布式评测”章节。 + + +## 启动测试 +### 在线测试 +在线测试的前置准备请参考`STUBS_PACKAGE_INTRO.md`文档。启动命令: +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +启动命令: +```bash +./ais-bench-stubs test +``` + diff --git a/huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh new file mode 100644 index 0000000..6fa35c5 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/evaluate_scripts.sh @@ -0,0 +1,11 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改配置文件 +export mindformers_workload_eval_cmd="bash ${LAUNCH_SCRIPT_PATH}/run_distribute.sh \ +--config ${CUR_DIR}/launch_yamls/run_llama2_7b.yaml \ +[0,8] \ +eval" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh b/huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh new file mode 100644 index 0000000..6b03a19 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_70b" +export AIS_TRAIN_YAML="run_llama2_70b_910b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml new file mode 100644 index 0000000..14bb735 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/predict_llama2_70b_910b.yaml @@ -0,0 +1,154 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'predict' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama_70b' +# if True, do evaluate during the training process. 
if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + data_parallel: 1 + model_parallel: 8 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. +micro_batch_interleave_num: 1 + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_70b" + save_checkpoint_steps: 1000 + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + ascend_config: + precision_mode: "must_keep_origin_dtype" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 8192 + num_layers: 80 + num_heads: 64 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + n_kv_heads: 8 + ffn_dim_multiplier: 1.3 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: True + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: False + use_paged_attention: False # PA only supported in inference + block_size: 16 + num_blocks: 512 + is_dynamic: False + use_kvcache_op: False + is_flexible_shape: False + offset: 0 + use_rope_slice: False + checkpoint_name_or_path: "llama2_70b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + vocab_file: "" + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
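上面 predict_llama2_70b_910b.yaml 采用 model_parallel: 8 的单机 8 卡切分做推理评测,直观原因可以用一笔粗略算术说明(仅为量级估算,不是精确的显存模型):70b 参数按 float16 存放约需 130 GiB,远超单卡 max_device_memory 的 "58GB",因此权重必须切分到多卡。

```bash
# 量级估算示意(仅做一次算术,数值为近似值)
python3 -c "
params = 70e9                  # LLaMA2 70b 参数量(近似)
bytes_per_param = 2            # float16
total_gib = params * bytes_per_param / 2**30
print('仅权重约需 %.0f GiB,按 8 卡切分后每卡约 %.0f GiB' % (total_gib, total_gib / 8))
"
```
另外,分布式评测前一般还需在 processor.tokenizer 下的 `vocab_file` 填入词表文件路径(通常为下载权重时附带的 tokenizer.model),具体以“评测/分布式评测”章节的准备为准。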
diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml new file mode 100644 index 0000000..33babe9 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_70b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 3 + batch_size: 1 + sink_mode: True + sink_size: 2 +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 1 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 8 + use_seq_parallel: True + micro_batch_num: 128 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_70b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "54GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 8192 + num_layers: 80 + num_heads: 64 + vocab_size: 32000 + multiple_of: 256 + n_kv_heads: 8 + ffn_dim_multiplier: 1.3 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True + fine_grain_interleave: 2 + qkv_concat: False + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." diff --git a/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml new file mode 100644 index 0000000..d193748 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/launch_yamls/run_llama2_70b_910b_finetune.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_13b.ckpt' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_70b' +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. 
+do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.99 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 1 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 8 + use_seq_parallel: True + micro_batch_num: 128 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_70b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "54GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 8192 + num_layers: 80 + num_heads: 64 + vocab_size: 32000 + multiple_of: 256 + n_kv_heads: 8 + ffn_dim_multiplier: 1.3 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + fine_grain_interleave: 2 + qkv_concat: false + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
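上面 run_llama2_70b_910b_finetune.yaml 的 `load_checkpoint` 需要指向“模型权重下载与转换”章节准备好的 LLaMA2 70b 权重;同时 parallel_config 中 data_parallel × model_parallel × pipeline_stage 一般应等于 launch_config.sh 里的集群总卡数 AIS_RANK_NUM。下面是一个仅供参考的自检片段(假设在组好的负载包 code/ 目录结构下执行,字段名取自上文配置文件):

```bash
#!/bin/bash
# 示意脚本:检查并行切分与集群总卡数是否匹配
cd ./code && source ./launch_config.sh      # 读取 AIS_TRAIN_YAML、AIS_RANK_NUM 等导出变量
python3 - <<'EOF'
import os
import re

yaml_path = os.path.join("launch_yamls", os.environ["AIS_TRAIN_YAML"] + ".yaml")
text = open(yaml_path, encoding="utf-8").read()

def field(key):
    # 从 yaml 文本中取出 parallel_config 的整数字段
    return int(re.search(rf"^\s*{key}:\s*(\d+)", text, re.M).group(1))

product = field("data_parallel") * field("model_parallel") * field("pipeline_stage")
rank_num = int(os.environ["AIS_RANK_NUM"])
print(f"dp*mp*pp = {product}, AIS_RANK_NUM = {rank_num}")
if product != rank_num:
    print("[WARN] 并行切分与总卡数不一致,请检查 parallel_config 或 AIS_RANK_NUM")
EOF
```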
diff --git a/huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh b/huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh new file mode 100644 index 0000000..c2153a9 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_70b/registed_tasks.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=() +# 多机运行的任务 +MULTI_NODES_LAUNCH=( \ + "run_llama2_70b_910b" \ + "run_llama2_70b_910b_finetune" +) \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_7b/README.md b/huawei/mindspore/mindformers/models/llama2_7b/README.md new file mode 100644 index 0000000..7915f21 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/README.md @@ -0,0 +1,185 @@ +# AISBench 基于Mindspore/mindformers框架的LLaMA2 7b 训练负载包使用指南 +本文主要介绍使用基于mindformers LLaMA2 7b大模型训练业务代码构建的AISBench的负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_7b-{mindformers version}.tar.gz",进行服务器性能测试的流程。 +本负载包支持一键拉起单机的LLaMA2 7b模型的预训练或微调任务,自动完成评测,并汇总训练相关的性能数据。 +## 名词定义 +|名词| 定义| +| --- | ----------------------------------- | +|ais-bench-stubs|启动性能测试任务的二进制程序| +## 查看llama2 7b 训练负载包目录结构,简单确认完整性 +解压负载包"Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_7b-{mindformers version}.tar.gz"(如果在包中看到本文档忽略此步) +```bash +tar xzf Ais-Benchmark-Stubs-{arch}-{Stubs version}-training-mindformers-llama2_7b-{mindformers version}.tar.gz +``` +负载包名中"{mindformers version}"表示[mindformer仓库](https://gitee.com/mindspore/mindformers)的分支名 +查看目录结构 +```bash +├── ais-bench-stubs # 启动测试的二进制文件 +├── code/ +│   ├── benchmark.sh +│   ├── evaluate_run.sh +│   ├── evaluate_scripts.sh # 评测的运行脚本,需要依据实际运行的评测命令 +│   ├── launch_config.sh +│   ├── launch_yamls/ # 启动训练任务的配置文件 +| │   ├──run_llama2_7b_910b.yaml +| │   ├──...... +│   ├── mindformers/ # 嵌入了logging打点接口的mindformers代码 +│   ├── multi_nodes_run.sh +│   ├── registed_tasks.sh # 注册了可用的mindformers脚本 +│   └── single_node_run.sh +├── config/ +│   ├── config.json +│   └── system.json +├── log/ +├── result/ +├── README.md # 本文档 +└── STUBS_PACKAGE_INTRO.md +``` +**后续对于相对路径的描述都是相对于负载包中的一级目录,例如 ./** + +## 负载包运行环境准备 +### 基本环境准备 +``` +python >=3.7 +``` +### mindspore准备 +请依据负载包名中的“{mindformers version}”对应的mindformers分支版本,参考[mindformers训练负载主页](https://gitee.com/aisbench/training/tree/master/huawei/mindspore/mindformers),安装指定版本的mindspore(python版本不限)。 +MindSpore安装参考[MindSpore官网](https://www.mindspore.cn/)MindSpore需要能成功在npu上运行,验证命令: +```bash +python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()" + +### logging准备 +从[logging打点工具发行版](https://gitee.com/aisbench/logging/releases)获取最新的发行版。 +参考[logging打点工具主页](https://gitee.com/aisbench/logging)的“安装与卸载/安装logging”章节安装logging打点工具。 + +## 资源准备 +### 前置声明 +1. 以下涉及到mindformers代码仓库的链接没有指定代码分支,需要依据负载包名"{mindformers version}",自行切换到对应的分支。 +2. 运行LLaMA2训练的MindSpore/mindformers的代码全部在`./code/mindformers`文件夹中,资源准备总体参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md),具体资源的参考详见本章其他小节。
+### rank_table_file准备 +rank_table_file是一个包含集群节点和加速卡ip信息的json文件。 +准备rank_table_file前确保计算节点的`/etc/hccn.conf`文件已经配好(如果没配好,参考[数据中心解决方案/配置训练节点](https://www.hiascend.com/document/detail/zh/Ascend%20Data%20Center%20Solution/22.0.0/install/800_9000/install_800_9000_0029.html)配置)。 + +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“生成RANK_TABLE_FILE(多卡运行必须环节)”章节。 + +### 模型权重下载与转换 +微调任务需要,预训练任务不涉及。 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)的“模型权重下载与转换”章节准备LLaMA2 7b的ckpt文件。 + +### 数据集准备 +#### 预训练数据集准备 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“预训练/数据集准备”章节准备转换好的预训练数据集。 +#### 微调数据集准备 +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“微调/数据集准备”章节准备转换好的微调数据集。 +#### 评测数据集准备 +依据实际的评测需求准备数据集 +**wikitext** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测/文本生成/获取数据集”章节准备评测数据集。 +**SQuAD** +参考[LLaMA2资源准备](https://gitee.com/mindspore/mindformers/blob/ac5bb9ec8d1ea85fd2021ca5c6f13b6ae821c270/docs/model_cards/llama2.md)“评测阅读理解/获取数据集”章节准备评测数据集。 + + +## 查看本负载包支持的mindformers启动配置文件 +查看`code/register_task.sh`文件: +```bash +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_7b_910b_finetune" \ + "run_llama2_7b_910b" \ + "run_llama2_7b_finetune" \ + "run_llama2_7b" +) +# 多机运行的任务,LLaMA2 7b不涉及 +MULTI_NODES_LAUNCH=() +``` + +```bash +"run_llama2_7b_910b_finetune" # 8卡 910B1,910B2,910B3 微调任务 +"run_llama2_7b_910b" # 8卡 910B1,910B2,910B3 预训练任务 +"run_llama2_7b_finetune" # 8卡 910A,910B4 微调任务 +"run_llama2_7b" # 8卡 910A,910B4 预训练任务 +``` +需要确认`code/launch_yamls/`路径下包含了上述`code/register_task.sh`中注册的yaml文件。 + +## 启动前配置 +### 负载通用配置文件launch_config.sh配置 +编辑`code/launch_config.sh`启动文件: +```bash +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_7b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_7b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 +``` +### 修改yaml配置文件 +修改`code/launch_config.sh`中设置的`AIS_TRAIN_YAML`对应的`code/launch_yaml/`中的yaml配置文件。 +#### 预训练任务 +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" # 需要填入.mindrecord后缀的wikitext-2数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +#### 微调任务 +```yaml +seed: 0 +output_dir: './output' # path to save checkpoint/strategy,维持默认 +load_checkpoint: '{path}/llama2_7b.ckpt' # 需要填入准备的权重文件.ckpt的路径 +src_strategy_path_or_dir: '' +``` +`load_checkpoint`支持绝对路径以及相对路径。`load_checkpoint`填入的相对路径`./`实际对应负载包路径`code/mindformers/` +```yaml +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" # 需要填入.mindrecord后缀的alpaca数据集的路径。 + shuffle: True +``` +`dataset_dir`支持绝对路径以及相对路径。`dataset_dir`填入的相对路径`../../`实际对应负载包路径`code/mindformers/` + +### 修改评测启动脚本 
+`code/evaluate_scripts.sh`评测脚本的启动较为多样,为了保证可拓展性,支持用户自行修改。默认提供的是基于wikitext-2数据集的文本生成任务的评测启动脚本。`code/evaluate_scripts.sh`默认内容如下: +```bash +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_7b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_7b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" +``` +其中`--eval_dataset_dir`需要用户自行传入wikitext-2的.mindrecord后缀的评测数据集的绝对路径。 + +## 启动测试 +### 在线测试 +在线测试的前置准备请参考`STUBS_PACKAGE_INTRO.md`文档。启动命令: +```bash +./ais-bench-stubs +``` +### 轻量化离线测试 +启动命令: +```bash +./ais-bench-stubs test +``` + diff --git a/huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh b/huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh new file mode 100644 index 0000000..6ea1691 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/evaluate_scripts.sh @@ -0,0 +1,15 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +MINDFORMERS_CODE_PATH=${CUR_DIR}/mindformers/ +LAUNCH_SCRIPT_PATH=${MINDFORMERS_CODE_PATH}/scripts/ +OUTPUT_PATH=${MINDFORMERS_CODE_PATH}/output/ # 训练完后output路径 + +# eval_script 评测启动命令请自行根据实际情况修改 +export mindformers_workload_eval_cmd="${AIS_PYTHON} ${MINDFORMERS_CODE_PATH}/run_mindformer.py \ +--config ${CUR_DIR}/launch_yamls/run_llama2_7b.yaml \ +--eval_dataset_dir /{path}/wiki4096valid.mindrecord \ +--run_mode eval \ +--load_checkpoint ${OUTPUT_PATH}/target_checkpoint/rank_0/llama2_7b0.ckpt \ +--epochs 1 \ +--use_parallel False \ +--device_id 0" \ No newline at end of file diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh b/huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh new file mode 100644 index 0000000..1b6e5d8 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_config.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +export AIS_PYTHON=python3 # 使用的python解释器 +export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3 # 从第几个steps之后开始统计step的性能数据 + +export AIS_NODE_FILE_PATH=/home/xx/xx/xx/node_file.json # 分布式运行使用cluster_tools所需包含节点信息和ssh key路径的文件,单机训练不用填 +export AIS_TRAIN_TASK_TYPE="train" # 预训练:"train",微调:"finetune" +export AIS_MODEL_NAME="llama2_7b" # 维持默认 +export AIS_TRAIN_YAML="run_llama2_7b" # 请从code/registed_task.sh中注册的yaml文件中选择一个填入 + +export AIS_RANK_NUM=8 # 集群总加速卡数 +export AIS_DEVICE_NUM=8 # 单台服务器的加速卡数量 +export AIS_RANK_TABLE_FILE="./xx.json" # rank_table_file 的路径, 相对于当前脚本 diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml new file mode 100644 index 0000000..7bf7e66 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b.yaml @@ -0,0 +1,213 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + 
sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 # 1e-8 + learning_rate: 3.e-4 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 3.e-4 + lr_end: 3.e-5 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 4 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 8 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 2 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_7b" + save_checkpoint_steps: 10000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
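The evaluation section of the README above only *exports* the launch command as `mindformers_workload_eval_cmd`; nothing is executed at that point. A wrapper inside the package (such as `code/evaluate_run.sh`) presumably consumes this variable later. The sketch below is only an illustration of that consumption pattern under that assumption, not the shipped script:

```bash
#!/bin/bash
# Minimal, hypothetical consumer of the exported eval command.
# The actual logic lives in the stubs package (e.g. code/evaluate_run.sh) and may differ.
# AIS_PYTHON is expected to be exported beforehand (see launch_config.sh).
CUR_DIR=$(cd "$(dirname "$0")"; pwd)
source "${CUR_DIR}/evaluate_scripts.sh"   # exports mindformers_workload_eval_cmd

if [ -z "${mindformers_workload_eval_cmd}" ]; then
    echo "ERROR: mindformers_workload_eval_cmd is not set" >&2
    exit 1
fi

echo "Launching evaluation: ${mindformers_workload_eval_cmd}"
eval "${mindformers_workload_eval_cmd}" 2>&1 | tee "${CUR_DIR}/eval.log"
```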
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml new file mode 100644 index 0000000..01fb8a0 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b.yaml @@ -0,0 +1,210 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + gradient_accumulation_steps: 8 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.95 + eps: 1.e-8 + learning_rate: 5.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 5.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 6 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_7b" + save_checkpoint_steps: 10000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
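The `run_llama2_7b_910b.yaml` above is the Atlas 800T A2 pretrain configuration (pure data parallel, dp=8 / mp=1 / pp=1, flash attention enabled). To drive it through the benchmark, the `launch_config.sh` shown earlier would be filled in roughly as follows for a single-node, 8-card run; all values below are illustrative and the rank-table file name is only an example:

```bash
#!/bin/bash
# Example values only; adapt paths and card counts to your environment.
export AIS_PYTHON=python3
export AISBENCH_LOGGING_WARM_UP_STEP_COUNT=3   # skip the first steps when computing step performance

export AIS_TRAIN_TASK_TYPE="train"             # pretrain ("finetune" for the *_finetune yamls)
export AIS_MODEL_NAME="llama2_7b"
export AIS_TRAIN_YAML="run_llama2_7b_910b"     # must be one of the names registered in registed_tasks.sh

export AIS_RANK_NUM=8                          # total accelerator cards in the cluster
export AIS_DEVICE_NUM=8                        # cards per server
export AIS_RANK_TABLE_FILE="./hccl_8p.json"    # example rank table path, relative to this script
# AIS_NODE_FILE_PATH is only required for multi-node runs and is left unset here.
```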
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml new file mode 100644 index 0000000..8431b1d --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_910b_finetune.yaml @@ -0,0 +1,209 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_7b.ckpt' # 必填 +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'finetune' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.999 + eps: 1.e-8 + learning_rate: 1.e-6 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-6 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 2 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800T A2 +parallel_config: + data_parallel: 8 + model_parallel: 1 + pipeline_stage: 1 + use_seq_parallel: False + micro_batch_num: 1 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +# recompute config +recompute_config: + recompute: False + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama2_7b" + save_checkpoint_steps: 100000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "58GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 4096 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float32" + use_past: False + scaling_factor: 1.0 + extend_method: "None" # support "None", "PI", "NTK" + use_flash_attention: True # FA can accelerate training or finetune + offset: 0 + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
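`run_llama2_7b_910b_finetune.yaml` above leaves two placeholders that must be filled before launch: `load_checkpoint` (marked as required, 必填) and the train `dataset_dir`. They can simply be edited by hand; the GNU-sed sketch below is one hypothetical way to fill them in, with example paths. Note that the surrounding quotes must be preserved exactly — the follow-up commit at the end of this patch series fixes precisely such a quoting mistake in the glm2_6b yaml:

```bash
#!/bin/bash
# Hypothetical helper (GNU sed): fill in the checkpoint and dataset placeholders.
# CKPT and DATA are example paths; keep the surrounding quotes exactly as shown.
YAML=./launch_yamls/run_llama2_7b_910b_finetune.yaml
CKPT=/data/ckpt/llama2_7b.ckpt
DATA=/data/datasets/alpaca-fastchat4096.mindrecord

sed -i "s#load_checkpoint: '{path}/llama2_7b.ckpt'#load_checkpoint: '${CKPT}'#" "$YAML"
# Only the first (train_dataset) empty dataset_dir is replaced; eval_dataset stays empty.
sed -i "0,/dataset_dir: \"\"/{s#dataset_dir: \"\"#dataset_dir: \"${DATA}\"#}" "$YAML"
```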
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml new file mode 100644 index 0000000..c8286bd --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/launch_yamls/run_llama2_7b_finetune.yaml @@ -0,0 +1,214 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '{path}/llama2_7b.ckpt' # 必填 +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +run_mode: 'train' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'llama2_7b' + +# runner config +runner_config: + epochs: 2 + batch_size: 1 + sink_mode: True + sink_size: 2 + +# optimizer +optimizer: + type: FP32StateAdamWeightDecay + beta1: 0.9 + beta2: 0.999 + eps: 1.e-8 # 1e-8 + learning_rate: 1.e-5 + +# lr sechdule +lr_schedule: + type: CosineWithWarmUpLR + learning_rate: 1.e-5 + lr_end: 0 + warmup_ratio: 0.03 + total_steps: -1 # -1 means it will load the total steps of the dataset + +# dataset +train_dataset: &train_dataset + data_loader: + type: MindDataset + dataset_dir: "/{path}/alpaca-fastchat2048.mindrecord" + shuffle: True + input_columns: ["input_ids", "labels"] # "input_ids", "labels" , labels are used in instruction finetune. + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: True + batch_size: 4 + repeat: 1 + numa_enable: False + prefetch_size: 1 +train_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *train_dataset +# if True, do evaluate during the training process. if false, do nothing. +# note that the task trainer should support _evaluate_in_training function. +do_eval: False +eval_step_interval: -1 # num of step intervals between each eval, -1 means no step end eval. +eval_epoch_interval: 50 # num of epoch intervals between each eval, 1 means eval on every epoch end. + +# eval dataset +eval_dataset: &eval_dataset + data_loader: + type: MindDataset + dataset_dir: "" + shuffle: False + input_columns: ["input_ids"] + num_parallel_workers: 8 + python_multiprocessing: False + drop_remainder: False + repeat: 1 + numa_enable: False + prefetch_size: 1 +eval_dataset_task: + type: CausalLanguageModelDataset + dataset_config: *eval_dataset + +use_parallel: True +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: True + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 +# default parallel of device num = 8 for Atlas 800 +parallel_config: + data_parallel: 2 + model_parallel: 1 + pipeline_stage: 4 + use_seq_parallel: False + micro_batch_num: 8 + vocab_emb_dp: True + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 2 + +# recompute config +recompute_config: + recompute: True + select_recompute: False + parallel_optimizer_comm_recompute: False + mp_comm_recompute: True + recompute_slice_activation: True + +# callbacks +callbacks: + - type: MFLossMonitor + - type: CheckpointMointor + prefix: "llama_7b" + save_checkpoint_steps: 10000000 # big enough + integrated_save: False + async_save: False + - type: ObsMonitor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + enable_graph_kernel: False + graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true" + max_call_depth: 10000 + max_device_memory: "28GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + runtime_num_threads: 1 + +# model config +model: + model_config: + type: LlamaConfig + batch_size: 1 # add for increase predict + seq_length: 2048 + hidden_size: 4096 + num_layers: 32 + num_heads: 32 + max_position_embedding: 4096 + vocab_size: 32000 + multiple_of: 256 + rms_norm_eps: 1.0e-5 + bos_token_id: 1 + eos_token_id: 2 + pad_token_id: 0 + ignore_token_id: -100 + compute_dtype: "float16" + layernorm_compute_type: "float32" + softmax_compute_type: "float16" + rotary_dtype: "float16" + param_init_type: "float16" + use_past: False + pretrain_seqlen: 4096 # seqlen of the pretrain checkpoint: 2048 for llama and 4096 for llama2 + extend_method: "None" # support "None", "PI", "NTK" + compute_in_2d: False + use_flash_attention: False # FA can accelerate training or finetune + offset: 0 + use_past_shard: False + checkpoint_name_or_path: "llama2_7b" + repetition_penalty: 1 + max_decode_length: 512 + top_k: 3 + top_p: 1 + do_sample: False + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '' + eos_token: '' + pad_token: '' + type: LlamaTokenizer + type: LlamaProcessor + +# metric +metric: + type: PerplexityMetric + +# wrapper cell config +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +eval_callbacks: + - type: ObsMonitor + +auto_tune: False +filepath_prefix: './autotune' +autotune_per_step: 10 + +profile: False +profile_start_step: 1 +profile_stop_step: 10 +init_start_profile: False +profile_communication: False +profile_memory: True +layer_scale: False +layer_decay: 0.65 +lr_scale_factor: 256 + +# aicc +remote_save_url: "Please input obs url on AICC platform." 
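Unlike the 910B variants, `run_llama2_7b_finetune.yaml` above targets the 32 GB Atlas 800 (dp=2 / pp=4, recompute enabled, 28 GB device memory) and pairs `seq_length: 2048` with an `alpaca-fastchat2048.mindrecord` dataset. The sequence length the mindrecord was preprocessed with generally has to match `model_config.seq_length`, otherwise training tends to fail when the dataset is built. A rough sanity check, assuming the conventional file naming used here, could be:

```bash
#!/bin/bash
# Rough, hypothetical check: infer the preprocessing length from the dataset
# file name and compare it with the seq_length configured in the yaml.
YAML=./launch_yamls/run_llama2_7b_finetune.yaml
DATA=$(grep -m1 'dataset_dir:' "$YAML" | awk -F'"' '{print $2}')
SEQ=$(grep -m1 'seq_length:' "$YAML" | awk '{print $2}')

case "$DATA" in
    *"$SEQ"*) echo "OK: ${DATA} looks consistent with seq_length=${SEQ}" ;;
    *)        echo "WARNING: verify that ${DATA} was preprocessed with seq_length=${SEQ}" ;;
esac
```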
diff --git a/huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh b/huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh new file mode 100644 index 0000000..9f1cf51 --- /dev/null +++ b/huawei/mindspore/mindformers/models/llama2_7b/registed_tasks.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# 单机运行的任务 +SINGLE_NODE_LAUNCH=( \ + "run_llama2_7b_910b_finetune" \ + "run_llama2_7b_910b" \ + "run_llama2_7b_finetune" \ + "run_llama2_7b" +) +# 多机运行的任务 +MULTI_NODES_LAUNCH=() \ No newline at end of file diff --git a/huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh new file mode 100644 index 0000000..f54d518 --- /dev/null +++ b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/patch_config.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# 代码的git远程仓库信息 +export git_url="https://gitee.com/mindspore/mindformers.git" +export branch="r1.1.rc1" +export commit_id="9504c4663e2d842e46bdc6f8b1bf773b0a99acc8" +export base_code_subdir="mindformers" # git远程仓库中的代码路径,如果要用仓库的全部代码,就直接填repo名 + +# 生成 .patch文件所需信息(makepatch.sh) +export changed_code_dir="" # 基于git远程仓库原始代码做过进一步修改(嵌入AISBench的打点接口)的代码, update_patch.sh脚本中填入 +export dir_to_save_patch_file="" # 保存生成的.patch文件的文件夹路径, update_patch.sh脚本中填入 +export patch_file_name="r1.1.rc1" # 生成的patch文件名(不带文件后缀) + +# 由 .patch文件修改git远程仓库拉取的原始代码所需信息(applypatch) +export result_code_dir="" # 基于.patch文件将git远程仓库原始代码修改后保存的文件夹路径,在build.sh脚本中填入 +export patch_file_path="" # 传入的.patch文件的路径,在build.sh脚本中填入 \ No newline at end of file diff --git a/huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch new file mode 100644 index 0000000..209d7b9 --- /dev/null +++ b/huawei/mindspore/mindformers/patch_files/r1.1.rc1/r1.1.rc1.patch @@ -0,0 +1,209 @@ +diff -Nur '--exclude=*.git*' origin/mindformers/core/callback/callback.py code/mindformers/core/callback/callback.py +--- origin/mindformers/core/callback/callback.py 2024-04-18 11:04:36.836000000 +0800 ++++ code/mindformers/core/callback/callback.py 2024-04-18 11:04:36.960000000 +0800 +@@ -39,6 +39,8 @@ + from mindformers.tools.utils import get_output_root_path, get_output_subpath, get_remote_save_url, check_in_modelarts,\ + get_real_rank, get_real_group_size + ++import ais_bench.logging as aislog ++ + __all__ = ['ObsMonitor', 'MFLossMonitor', 'CheckpointMointor', 'SummaryMonitor', 'ProfileMonitor', 'EvalCallBack'] + + _cur_dir = os.getcwd() +@@ -182,6 +184,13 @@ + """ + self.step_time = time.time() + self.run_context = run_context ++ cb_params = run_context.original_args() ++ if cb_params.dataset_sink_mode: ++ sample_per_step = int(self.global_batch_size * cb_params.batch_num / self.device_num) ++ else: ++ sample_per_step = int(self.global_batch_size / self.device_num) ++ aislog.start("train_per_step", sample_per_step) ++ + + def step_end(self, run_context): + """ +@@ -215,12 +224,15 @@ + steps_per_epoch = self.steps_per_epoch + cur_epoch_num = (cb_params.cur_step_num + self.initial_step - 1) // steps_per_epoch + 1 + cur_step_num = (cb_params.cur_step_num + self.initial_step - 1) % steps_per_epoch + 1 ++ sample_per_step = int(self.global_batch_size * cb_params.batch_num / self.device_num) + else: + origin_epochs = self.origin_epochs + per_step_seconds = step_seconds + steps_per_epoch = cb_params.batch_num + cur_epoch_num = cb_params.cur_epoch_num + cur_step_num = (cb_params.cur_step_num + self.initial_step - 1) % cb_params.batch_num + 1 ++ sample_per_step = int(self.global_batch_size / 
self.device_num) ++ aislog.end("train_per_step", sample_per_step) + + # compute time remaining + step_remain = (origin_epochs - cur_epoch_num + 1) * steps_per_epoch - cur_step_num +diff -Nur '--exclude=*.git*' origin/mindformers/core/metric/metric.py code/mindformers/core/metric/metric.py +--- origin/mindformers/core/metric/metric.py 2024-04-18 11:04:36.836000000 +0800 ++++ code/mindformers/core/metric/metric.py 2024-04-18 11:04:36.960000000 +0800 +@@ -39,6 +39,7 @@ + + from .utils import PerplexityCell + from ...dataset.labels import cluener_labels ++import ais_bench.logging as aislog + + __all__ = ['EntityScore', 'SQuADMetric', 'PerplexityMetric', 'ADGENMetric', 'PromptAccMetric', 'EmF1Metric'] + +@@ -541,6 +542,11 @@ + return None + avg_loss = float(self.total_loss / self.num_data) + result = {"loss": avg_loss, "PPL": math.exp(avg_loss)} ++ result_log="loss: {}, Perplexity: {}".format(avg_loss, math.exp(avg_loss)) ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + if self.pipeline_parallel: + print("Average Loss and PPL Metric:", result) + return result +@@ -602,6 +608,15 @@ + f'rouge-2: {self.score_dict["rouge-2"]:.4f}\n' + + f'rouge-l: {self.score_dict["rouge-l"]:.4f}\n' + + f'bleu-4: {self.score_dict["bleu-4"]:.4f}') ++ result_log = 'metric: ADGENMetric\n' + \ ++ f'rouge-1: {self.score_dict["rouge-1"]:.4f}\n' + \ ++ f'rouge-2: {self.score_dict["rouge-2"]:.4f}\n' + \ ++ f'rouge-l: {self.score_dict["rouge-l"]:.4f}\n' + \ ++ f'bleu-4: {self.score_dict["bleu-4"]:.4f}' ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + return self.score_dict + + +@@ -715,6 +730,12 @@ + result = {"Acc": acc_rate} + print(f"Acc: {('%.3f' % result.get('Acc', 0))}, total_acc_num: {self.total_acc_num}, " + f"total_num: {self.num_data}") ++ result_log = f"Acc: {('%.3f' % result.get('Acc', 0))}, total_acc_num: {self.total_acc_num}, " + \ ++ f"total_num: {self.num_data}" ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + return result + + +@@ -776,6 +797,11 @@ + """Compute final result""" + result, total_count = self.evaluate_pairs(self.gens, self.labels) + print(f"F1 score: {result.get('F1', 0)}, Em score: {result.get('Em', 0)}, total_count: {total_count}") ++ result_log = f"F1 score: {result.get('F1', 0)}, Em score: {result.get('Em', 0)}, total_count: {total_count}" ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) ++ aislog.event("accuracy", result_log) ++ aislog.event("result", "OK") ++ aislog.finish() + return result + + def mixed_segmentation(self, in_str, rm_punc=False): +diff -Nur '--exclude=*.git*' origin/mindformers/trainer/base_trainer.py code/mindformers/trainer/base_trainer.py +--- origin/mindformers/trainer/base_trainer.py 2024-04-18 11:04:36.848000000 +0800 ++++ code/mindformers/trainer/base_trainer.py 2024-04-18 11:04:36.972000000 +0800 +@@ -60,6 +60,7 @@ + from .optimizer_grouped_parameters import get_optimizer_grouped_parameters + from .utils import set_seed, check_train_data_loader_type, \ + check_eval_data_loader_type, check_optimizer_and_lr_type, check_wrapper_config ++import ais_bench.logging as aislog + + SUPPORT_TASKS = MindFormerBook().get_trainer_support_task_list() + SUPPORT_MODEL_NAMES = MindFormerBook().get_model_name_support_list() +@@ -622,6 +623,7 
@@ + compute_metrics: Optional[Union[dict, set]] = None, + **kwargs): + """Train or Fine-tune for BaseTrainer in MindFormers.""" ++ aislog.init("training", os.getenv("AIS_WORK_RESULT_PATH", "")) + self.kwargs = kwargs + self.train_dataset = dataset if dataset else self.train_dataset + self.eval_dataset = kwargs.get('eval_dataset', None) +@@ -632,6 +634,7 @@ + + # build dataset + logger.info(".........Build Dataset For Train..........") ++ aislog.start("dataload") # logging api + dataset = self.create_train_dataset() + logger.info("Create train dataset finish, dataset size:%d", dataset.get_dataset_size()) + +@@ -661,9 +664,11 @@ + + # check rules + check_rules(config, mode='train', network=network, dataset=dataset) ++ aislog.end("dataload") # logging api + + # build network + logger.info(".........Build Net For Train..........") ++ aislog.start("train_launch") # logging api + if network is None and self.network is None: + network = self.create_network( + default_args={"parallel_config": config.parallel_config, +@@ -776,17 +781,23 @@ + save_checkpoint_steps=save_checkpoint_steps) + # ColdHotExpertMointor needs to be placed before CheckpointMointor + callbacks.insert(1, cold_hot_mointor) +- ++ aislog.end("train_launch") + logger.info(".........Starting Training Model..........") + if get_real_rank() % 8 == 0: + pprint(config) + logger.info(".........Model Compiling, Please Wait a Moment...........") ++ all_data_sum = int(dataset.get_dataset_size() * config.train_dataset.batch_size / int(os.getenv("RANK_SIZE", '8'))) * \ +++ config.runner_config.origin_epochs * config.model.model_config.seq_length ++ aislog.start("train", all_data_sum) + model.train(config.runner_config.epochs, dataset, + callbacks=callbacks, + dataset_sink_mode=config.runner_config.sink_mode, + sink_size=config.runner_config.sink_size, + initial_epoch=config.runner_config.initial_epoch) + logger.info(".........Training Over!.............") ++ aislog.end("train", all_data_sum) ++ aislog.event("result", "OK") ++ aislog.finish() + + def evaluate_process( + self, +diff -Nur '--exclude=*.git*' origin/scripts/run_distribute.sh code/scripts/run_distribute.sh +--- origin/scripts/run_distribute.sh 2024-04-18 11:04:36.888000000 +0800 ++++ code/scripts/run_distribute.sh 2024-04-18 11:04:37.012000000 +0800 +@@ -154,7 +154,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. +- done ++ done;wait + else + for((i=${START_DEVICE}; i<${END_DEVICE}; i++)) + do +@@ -174,7 +174,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. +- done ++ done;wait + fi + else + if [ $# == 5 ] +@@ -202,7 +202,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. +- done ++ done;wait + else + for((i=${START_DEVICE}; i<${END_DEVICE}; i++)) + do +@@ -227,7 +227,7 @@ + &> $LOG_MF_PATH/rank_$RANK_ID/mindformer.log & + echo "log saved in $(realpath $LOG_MF_PATH)/rank_$RANK_ID" + cd .. 
+- done ++ done;wait + fi + fi + shopt -u extglob diff --git a/huawei/mindspore/mindformers/update_patch.sh b/huawei/mindspore/mindformers/update_patch.sh new file mode 100644 index 0000000..30b46fb --- /dev/null +++ b/huawei/mindspore/mindformers/update_patch.sh @@ -0,0 +1,29 @@ +#!/bin/bash +#!/bin/bash +declare -i ret_ok=0 +declare -i ret_error=1 +CUR_DIR=$(dirname $(readlink -f $0)) +PATCH_TOOLS_PATH="$CUR_DIR/../../../tools/patch_tool/patch_tool.sh" + +main(){ + patch_version=$1 + patch_version_path="${CUR_DIR}/patch_files/${patch_version}" + patch_config_path="${patch_version_path}/patch_config.sh" + if [ ! -f $patch_config_path ];then + echo "ERROR: can not find patch config file of ${patch_version}!" + return $ret_error + fi + . $patch_config_path # 导入patch配置文件 + changed_code_dir=$2 # 改变patch配置文件export的变量 + if [ ! -d $changed_code_dir ];then + echo "ERROR: changed_code_dir ${patch_version} not exist!" + return $ret_error + fi + dir_to_save_patch_file="${patch_version_path}" # 改变patch配置文件export的变量 + bash $PATCH_TOOLS_PATH "makepatch" || { echo "makepatch failed!";return $ret_error; } + rm -r $CUR_DIR/buildtmp + return $ret_ok +} + +main "$@" +exit $? \ No newline at end of file -- Gitee From 5f10f0451ff126590732522b6d2ee7bd992bfd8c Mon Sep 17 00:00:00 2001 From: yanhe13 Date: Tue, 7 May 2024 16:49:16 +0800 Subject: [PATCH 2/2] yaml fix --- .../glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml index d991a63..5408dfd 100644 --- a/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml +++ b/huawei/mindspore/mindformers/models/glm2_6b/launch_yamls/run_glm2_6b_finetune_800T_A2_64G.yaml @@ -1,7 +1,7 @@ seed: 0 run_mode: 'train' output_dir: './output' # path to save checkpoint/strategy -load_checkpoint: ''path/to/glm2_6b.ckpt'' +load_checkpoint: '{path}/glm2_6b.ckpt' src_strategy_path_or_dir: '' auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model only_save_strategy: False -- Gitee