代码拉取完成,页面将自动刷新
image: nvcr.io/nvidia/pytorch:23.04-py3
stages:
- test
- jet
- cleanup
variables: &VARS
SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
PYTORCH_IMAGE: /lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh # This is the image that is run by all nodes on selene for tests
PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: "MR_TESTS JET" # Can specify levels
TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels
TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
TEST_REGEX_ON_THIS_COMMIT: NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/
JET_CUSTOM_FILTER: ""
DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file
TIME_LIMIT: "10:00" # Default time limit for all jobs
MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE
include:
- jet-tests.yml
unit_tests:
image: nvcr.io/nvidia/pytorch:23.04-py3
tags:
- docker_local_runner
stage: test
script:
- pip install pytest-cov
- pip install pytest_mock
- pip install nltk
- pip install wrapt
- pip install zarr "tensorstore==0.1.45" # for distributed checkpointing tests
- pip install git+https://github.com/fanshiqing/grouped_gemm@main # for grouped gemm tests
- torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
artifacts:
paths:
- coverage
expire_in: 30 days
rules:
- when: always
docs_build_test:
stage: test
tags:
- docker_local_runner
script:
- cd ..
- rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
- mv megatron-lm/ documentation/
- cd documentation/
- ./repo docs
allow_failure: true
except:
- main
formatting:
image: nvcr.io/nvidia/pytorch:23.04-py3
tags:
- docker_local_runner
stage: test
script:
- pip install --upgrade black==19.10b0 isort click==8.0.2
- black megatron/core --check --verbose --diff
- isort megatron/core --check
rules:
- when: always
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
tags:
- ssh_selene_runner
stage: test
script: &selene-test-resume-launcher-script
- echo "Running selene resume from checkpoint test. "
- pwd
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_resume_checkpoint_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR TIME_LIMIT=$TIME_LIMIT"
- echo "$run_cmd"
- ${run_cmd}
- echo "Completed the job"
rules:
- if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
when: always
- if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
allow_failure: false
retry: 2
.selene_test_launcher: &selene-test-launcher
tags:
- ssh_selene_runner
stage: test
script: &selene-test-launcher-script
- echo "Running selene test"
- pwd
- run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE MOE_GROUPED_GEMM=$MOE_GROUPED_GEMM TIME_LIMIT=$TIME_LIMIT"
- echo "$run_cmd"
- ${run_cmd}
- echo "Completed the job"
rules:
- if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
when: always
- if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
- if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
when: always
allow_failure: false
retry: 2
train.te_gpt3.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 1
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.gpt3_core.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3_core.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
train.gpt3_core.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "10:00"
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3_core.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3_core.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
train.gpt3_core.345m_tp1_pp2_1node_50steps_rope:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: rope_embeddings
ADDITIONAL_PARAMS: "--position-embedding-type rope"
train.gpt3_core.345m_tp1_pp4_1node_50steps_swiglu:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: swiglu
ADDITIONAL_PARAMS: "--swiglu"
train.gpt3_core.345m_tp1_pp4_1node_50steps_disable_bias_linear:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: disable_bias_linear
ADDITIONAL_PARAMS: "--disable-bias-linear"
train.gpt3_core.345m_tp1_pp4_1node_50steps_untie_embeddings_and_outputs:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: untie_embeddings_and_outputs
ADDITIONAL_PARAMS: "--untie-embeddings-and-output-weights"
train.gpt3_core.345m_tp1_pp4_1node_50steps_sequence_parallel:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: sequence_parallel
ADDITIONAL_PARAMS: "--sequence-parallel"
train.gpt3.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
train.gpt3.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
resume.checkpoint.gpt3.345m_tp1_pp2_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
TIME_LIMIT: "15:00"
TEST_LEVEL: MR_TESTS
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer
ADDITIONAL_PARAMS: "--use-distributed-optimizer"
train.gpt3.345m_tp1_pp1_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: dist_optimizer_overlap_grad_reduce
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce"
train.gpt3.345m_tp1_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: dist_optimizer_overlap_grad_reduce_param_gather
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"
train.gpt3.345m_tp4_pp1_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp4_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce"
train.gpt3.345m_tp4_pp1_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce_param_gather
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"
train.gpt3.345m_tp1_pp4_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_dist_optimizer_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce"
train.gpt3.345m_tp1_pp4_interleaved_1node_50steps_dist_optimizer_overlap_grad_reduce_param_gather:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: MR_TESTS
METADATA: dist_optimizer_overlap_grad_reduce_param_gather
ADDITIONAL_PARAMS: "--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"
train.gpt3.345m_tp2_pp2_1node_50steps_overlap_grad_reduce:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: overlap_grad_reduce
ADDITIONAL_PARAMS: "--overlap-grad-reduce"
train.gpt3_core.345m_cp2_tp2_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
METADATA: "context_parallelism_cp2"
PYTORCH_IMAGE: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/pytorch_23.10_flash_attn_1.0.9_context_parallelism.sqsh"
ADDITIONAL_PARAMS: "--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"
train.gpt3_core.345m_cp2_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
METADATA: "context_parallelism_cp2"
PYTORCH_IMAGE: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/pytorch_23.10_flash_attn_1.0.9_context_parallelism.sqsh"
ADDITIONAL_PARAMS: "--context-parallel-size 2 --sequence-parallel --hidden-dropout 0.0 --attention-dropout 0.0"
# Note: Core MoE models currently will run TE by default
train.te_core_moe_gpt3.345m_tp2_pp2_2experts_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
METADATA: "te_2experts"
ADDITIONAL_PARAMS: "--num-experts 2 --sequence-parallel --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp2_4experts2parallel_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: NIGHTLY_TESTS
METADATA: "te_4experts2parallel"
ADDITIONAL_PARAMS: "--sequence-parallel --num-experts 4 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
TEST_LEVEL: MR_TESTS
METADATA: "te_8experts2parallel"
ADDITIONAL_PARAMS: "--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_groupedGEMM_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
MOE_GROUPED_GEMM: 1
TEST_LEVEL: MR_TESTS
METADATA: "te_8experts2parallel_groupedGEMM"
ADDITIONAL_PARAMS: "--moe-grouped-gemm --disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.te_core_moe_gpt3.345m_tp2_pp1_8experts2parallel_top2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 1
MOE_GROUPED_GEMM: 1
TEST_LEVEL: MR_TESTS
METADATA: "te_8experts2parallel_top2router"
ADDITIONAL_PARAMS: "--disable-bias-linear --sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"
train.moe_gpt3.345m_tp2_pp2_4experts_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: gpt3
USE_TE: 0
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
USE_CORE: 0
TEST_LEVEL: NIGHTLY_TESTS
METADATA: "4experts"
ADDITIONAL_PARAMS: "--sequence-parallel --num-experts 4 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"
train.bert.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "10:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: MR_TESTS
train.bert.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: NIGHTLY_TESTS
train.bert.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: NIGHTLY_TESTS
train.bert.345m_tp1_pp4_interleaved_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 2
NUM_NODES: 1
MAX_STEPS: 50
TEST_LEVEL: MR_TESTS
train.bert_core.345m_tp4_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 4
PP_SIZE: 1
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert_core.345m_tp2_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
train.bert_core.345m_tp2_pp2_1node_50steps_local_spec:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 2
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MR_TESTS
METADATA: local_spec
ADDITIONAL_PARAMS: "--spec local"
train.bert_core.345m_tp1_pp2_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert_core.345m_tp1_pp4_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 4
VP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: NIGHTLY_TESTS
train.bert_core.345m_tp1_pp2_1node_50steps_rope:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: L0
METADATA: rope_embeddings
ADDITIONAL_PARAMS: "--position-embedding-type rope"
train.bert_core.345m_tp1_pp2_1node_50steps_sequence_parallel:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
USE_CORE: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: L0
METADATA: sequence_parallel
ADDITIONAL_PARAMS: "--sequence-parallel"
resume.checkpoint.bert.345m_tp1_pp2_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: bert
TP_SIZE: 1
PP_SIZE: 2
NUM_NODES: 1
TEST_LEVEL: MR_TESTS
train.retro_core.tp1_pp1_1node_50steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: retro
USE_TE: 0
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 50
TIME_LIMIT: "20:00"
TEST_LEVEL: MONTHLY_TESTS
train.t5_core.220m_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_tp2_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_CORE: 1
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_te_tp1_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MR_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_te_tp2_pp1_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
train.t5_core.220m_te_tp2_pp1_sp_1node_100steps:
<<: *selene-test-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 2
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
MAX_STEPS: 100
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
ADDITIONAL_PARAMS: "--sequence-parallel"
resume.checkpoint.t5_core.220m_tp1_pp1_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 0
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
resume.checkpoint.t5_core.220m_te_tp1_pp1_1node:
<<: *selene-test-resume-checkpoint-launcher
variables:
<<: [*VARS]
RUN_MODEL: t5
USE_TE: 1
USE_CORE: 1
TP_SIZE: 1
PP_SIZE: 1
VP_SIZE: 1
NUM_NODES: 1
TIME_LIMIT: "30:00"
TEST_LEVEL: MONTHLY_TESTS
PYTORCH_IMAGE: nvcr.io/nvidia/pytorch:23.07-py3
cleanup.selene:
tags:
- ssh_selene_runner
stage: cleanup
variables:
<<: [*VARS]
script:
- set +e
- NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | wc -l`
- find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | grep -v data | xargs rm -rf
- find ${SELENE_ADLR_CI_PATH}/* -type d -name "checkpoints" -ctime +2 | grep -v data | xargs rm -rf
- echo "Finished cleaning $NUM_CLEANUP directories older than 20 days everything in Selene"
allow_failure: true
rules:
- when: always
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。