diff --git a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py index 88fb22b112..1ab9af77a5 100644 --- a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py +++ b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py @@ -187,6 +187,8 @@ def qwen3_235b_a22b_pretrain_config_b200( set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if precision == "fp8_mx" and is_full_iteration_cuda_graph(cfg.model): + set_full_iter_cg_configs(cfg) return cfg @@ -349,6 +351,8 @@ def qwen3_30b_a3b_pretrain_config_b200( set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if precision == "fp8_mx" and is_full_iteration_cuda_graph(cfg.model): + set_full_iter_cg_configs(cfg) return cfg diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py index 461de741b3..efa97431df 100644 --- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py +++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py @@ -262,7 +262,15 @@ ) -QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2 +QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = replace( + QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2, + virtual_pipeline_model_parallel_size=3, + moe_a2a_overlap=True, + cuda_graph_impl="full_iteration", + cuda_graph_scope=[], + cutedsl_fused_grouped_mlp=True, + fp8_dot_product_attention=True, +) QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2 @@ -280,7 +288,16 @@ ) -QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2 +QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = replace( + QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2, + virtual_pipeline_model_parallel_size=3, + moe_flex_dispatcher_backend="hybridep", + moe_a2a_overlap=True, + cuda_graph_impl="full_iteration", + cuda_graph_scope=[], + cutedsl_fused_grouped_mlp=True, + fp8_dot_product_attention=True, +) QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2 @@ -452,7 +469,14 @@ QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 -QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 +QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = replace( + QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1, + moe_a2a_overlap=True, + cuda_graph_impl="full_iteration", + cuda_graph_scope=[], + cutedsl_fused_grouped_mlp=True, + fp8_dot_product_attention=True, +) QWEN3_30B_A3B_PRETRAIN_CONFIG_H100_BF16_V1 = replace(