# lengyanju8/Megatron-LM: .gitlab-ci.yml (24.44 KB)
# Maanu Grover, committed 2024-02-01 12:14: JET Migration Updates
image: nvcr.io/nvidia/pytorch:23.04-py3
stages:
- test
- jet
- cleanup
variables: &VARS
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
PYTORCH_IMAGE: /lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh # Image run by all nodes on Selene for the tests
PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: "MR_TESTS JET" # Can specify levels
TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels
TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
TEST_REGEX_ON_THIS_COMMIT: NONE # RE2 regex (see https://github.com/google/re2/wiki/Syntax), e.g. /.*gpt3.*/
JET_CUSTOM_FILTER: ""
DISPLAY_OUTPUT: "True" # Set to True so that new tests copy their logs for creating the golden truth file
TIME_LIMIT: "10:00" # Default time limit for all jobs
MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE
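# The &VARS block above is merged into every job below via `<<: [*VARS]`; any key a job
# re-declares (TIME_LIMIT, PYTORCH_IMAGE, TEST_LEVEL, ...) overrides the default for that job
# only. A minimal sketch of the pattern (hypothetical job name, for illustration only):
#
#   train.example.345m_tp1_pp1_1node_50steps:
#     <<: *selene-test-launcher
#     variables:
#       <<: [*VARS]
#       RUN_MODEL: gpt3
#       TIME_LIMIT: "30:00"   # overrides the "10:00" default from &VARS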
include:
- jet-tests.yml
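# Jobs for the `jet` stage are defined in the included jet-tests.yml; JET_CUSTOM_FILTER above
# is presumably consumed there to narrow which JET workloads run (it is not used in this file).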
unit_tests:
image: nvcr.io/nvidia/pytorch:23.04-py3
tags:
- docker_local_runner
stage: test
script:
- pip install pytest-cov
- pip install pytest_mock
- pip install nltk
- pip install wrapt
- pip install zarr "tensorstore==0.1.45" # for distributed checkpointing tests
- pip install git+https://github.com/fanshiqing/grouped_gemm@main # for grouped gemm tests
- torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
artifacts:
paths:
- coverage
expire_in: 30 days
rules:
- when: always
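# unit_tests above runs the megatron/core unit test suite under torchrun with
# --nproc_per_node=8 (presumably one rank per GPU on the docker_local_runner) and declares the
# `coverage` path as a 30-day artifact; the `coverage:` regex extracts the total percentage
# from pytest-cov's terminal report.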
docs_build_test:
stage: test
tags:
- docker_local_runner
script:
- cd ..
- rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
- mv megatron-lm/ documentation/
- cd documentation/
- ./repo docs
allow_failure: true
except:
- main
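# docs_build_test above clones the internal nemo-megatron-core-tme/documentation repo, moves
# this checkout inside it, and runs its `./repo docs` build; the job is allowed to fail and is
# skipped on main.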
formatting:
image: nvcr.io/nvidia/pytorch:23.04-py3
tags:
- docker_local_runner
stage: test
script:
- pip install --upgrade black==19.10b0 isort click==8.0.2
- black megatron/core --check --verbose --diff
- isort megatron/core --check
rules:
- when: always
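# formatting above only lint-checks megatron/core: black 19.10b0 and isort run in --check mode,
# so the job fails on violations without rewriting files (click is pinned to 8.0.2, presumably
# because newer click releases break this black version).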
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
tags:
- ssh_selene_runner
stage: test
script: &selene-test-resume-launcher-script
- echo "Running selene resume from checkpoint test. "
- pwd
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR TIME_LIMIT=$TIME_LIMIT"
- echo "$run_cmd"
- ${run_cmd}
- echo "Completed the job"
rules:
- if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
when: always
- if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
allow_failure: false
retry: 2
.selene_test_launcher: &selene-test-launcher
tags:
- ssh_selene_runner
stage: test
script: &selene-test-launcher-script
- echo "Running selene test"
- pwd
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE MOE_GROUPED_GEMM=$MOE_GROUPED_GEMM TIME_LIMIT=$TIME_LIMIT"
- echo "$run_cmd"
- ${run_cmd}
- echo "Completed the job"
rules:
- if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
when: always
- if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
allow_failure: false
retry: 2
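# The two hidden templates above (.selene_test_resume_checkpoint_launcher and
# .selene_test_launcher) share the same gating rules: a job runs when its name or TEST_LEVEL
# matches TESTS_TO_RUN_ON_THIS_COMMIT / TEST_REGEX_ON_THIS_COMMIT, when the pipeline runs on the
# default branch and TEST_LEVEL matches TESTS_TO_RUN_AFTER_MERGING, or when the pipeline is a
# merge-request event that is approved (or labelled "READY FOR REVIEW") and TEST_LEVEL matches
# TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED. Every concrete train.* / resume.* job below extends one
# of these templates and sets TEST_LEVEL (MR_TESTS, NIGHTLY_TESTS, MONTHLY_TESTS, L0) to choose
# its bucket.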
train.te_gpt3.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 1
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.gpt3_core.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3_core.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
train.gpt3_core.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "10:00"
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3_core.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3_core.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
train.gpt3_core.345m_tp1_pp2_1node_50steps_rope:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: rope_embeddings
ADDITIONAL_PARAMS: "--position-embedding-type rope"
train.gpt3_core.345m_tp1_pp4_1node_50steps_swiglu:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: swiglu
ADDITIONAL_PARAMS: "--swiglu"
train.gpt3_core.345m_tp1_pp4_1node_50steps_disable_bias_linear:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: disable_bias_linear
ADDITIONAL_PARAMS: "--disable-bias-linear"
train.gpt3_core.345m_tp1_pp4_1node_50steps_untie_embeddings_and_outputs:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: untie_embeddings_and_outputs
ADDITIONAL_PARAMS: "--untie-embeddings-and-output-weights"
train.gpt3_core.345m_tp1_pp4_1node_50steps_sequence_parallel:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: sequence_parallel
ADDITIONAL_PARAMS: "--sequence-parallel"
train.gpt3.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
train.gpt3.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
TIME_LIMIT: "15:00"
TEST_LEVEL: MR_TESTS
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer
ADDITIONAL_PARAMS: "--use-distributed-optimizer"
train.gpt3.345m_tp1_pp1_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: dist_optimizer_overlap_grad_reduce
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce"
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: dist_optimizer_overlap_grad_reduce_param_gather
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"
train.gpt3.345m_tp4_pp1_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp4_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce"
train.gpt3.345m_tp4_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce_param_gather
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"
train.gpt3.345m_tp1_pp4_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_dist_optimizer_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce"
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce_param_gather
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"
train.gpt3.345m_tp2_pp2_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3_core.345m_cp2_tp2_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
METADATA: "context_parallelism_cp2"
PYTORCH_IMAGE: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/pytorch_23.10_flash_attn_1.0.9_context_parallelism.sqsh"
ADDITIONAL_PARAMS: "--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"
train.gpt3_core.345m_cp2_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
METADATA: "context_parallelism_cp2"
PYTORCH_IMAGE: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/pytorch_23.10_flash_attn_1.0.9_context_parallelism.sqsh"
ADDITIONAL_PARAMS: "--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"
# Note: Core MoE models currently run with TE by default
train.te_core_moe_gpt3.345m_tp2_pp2_2experts_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
METADATA: "te_2experts"
ADDITIONAL_PARAMS: "--num-experts 2 --sequence-parallel --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp2_4experts2parallel_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
METADATA: "te_4experts2parallel"
ADDITIONAL_PARAMS: "--sequence-parallel --num-experts 4 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: "te_8experts2parallel"
ADDITIONAL_PARAMS: "--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_groupedGEMM_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
MOE_GROUPED_GEMM: 1
TEST_LEVEL: MR_TESTS
METADATA: "te_8experts2parallel_groupedGEMM"
ADDITIONAL_PARAMS: "--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_top2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
MOE_GROUPED_GEMM: 1
TEST_LEVEL: MR_TESTS
METADATA: "te_8experts2parallel_top2router"
ADDITIONAL_PARAMS: "--disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"
train.moe_gpt3.345m_tp2_pp2_4experts_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: "4experts"
ADDITIONAL_PARAMS: "--sequence-parallel --num-experts 4 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.bert.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "10:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: MR_TESTS
train.bert.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: NIGHTLY_TESTS
train.bert.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: NIGHTLY_TESTS
train.bert.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: MR_TESTS
train.bert_core.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert_core.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
train.bert_core.345m_tp2_pp2_1node_50steps_local_spec:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
METADATA: local_spec
ADDITIONAL_PARAMS: "--spec local"
train.bert_core.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert_core.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert_core.345m_tp1_pp2_1node_50steps_rope:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: L0
METADATA: rope_embeddings
ADDITIONAL_PARAMS: "--position-embedding-type rope"
train.bert_core.345m_tp1_pp2_1node_50steps_sequence_parallel:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: L0
METADATA: sequence_parallel
ADDITIONAL_PARAMS: "--sequence-parallel"
resume.checkpoint.bert.345m_tp1_pp2_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
TEST_LEVEL: MR_TESTS
train.retro_core.tp1_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: retro
USE_TE: 0
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MONTHLY_TESTS
train.t5_core.220m_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_tp2_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_CORE: 1
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_te_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MR_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_te_tp2_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_te_tp2_pp1_sp_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
ADDITIONAL_PARAMS: "--sequence-parallel"
resume.checkpoint.t5_core.220m_tp1_pp1_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
resume.checkpoint.t5_core.220m_te_tp1_pp1_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
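# cleanup.selene below prunes CI state on Selene: working directories older than 20 days
# (anything matching "data" is excluded) and "checkpoints" directories older than 2 days are
# deleted; the job is allowed to fail.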
cleanup.selene:
tags:
- ssh_selene_runner
stage: cleanup
variables:
<<: [*VARS]
script:
- set +e
- NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l`
- find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | xargs rm -rf
- find ${SELENE_ADLR_CI_PATH}/* -type d -name "checkpoints" -ctime +2 | grep -v data | xargs rm -rf
- echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
allow_failure: true
rules:
- when: always