diff --git a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py
index 88fb22b112..1ab9af77a5 100644
--- a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py
+++ b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py
@@ -187,6 +187,8 @@ def qwen3_235b_a22b_pretrain_config_b200(
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
+    if precision == "fp8_mx" and is_full_iteration_cuda_graph(cfg.model):
+        set_full_iter_cg_configs(cfg)
 
     return cfg
 
@@ -349,6 +351,8 @@ def qwen3_30b_a3b_pretrain_config_b200(
 
     set_qwen3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
+    if precision == "fp8_mx" and is_full_iteration_cuda_graph(cfg.model):
+        set_full_iter_cg_configs(cfg)
 
     return cfg
 
diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py
index 461de741b3..efa97431df 100644
--- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py
+++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py
@@ -262,7 +262,15 @@
 )
 
 
-QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2
+QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = replace(
+    QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2,
+    virtual_pipeline_model_parallel_size=3,
+    moe_a2a_overlap=True,
+    cuda_graph_impl="full_iteration",
+    cuda_graph_scope=[],
+    cutedsl_fused_grouped_mlp=True,
+    fp8_dot_product_attention=True,
+)
 QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2
 
 
@@ -280,7 +288,16 @@
 )
 
 
-QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2
+QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = replace(
+    QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2,
+    virtual_pipeline_model_parallel_size=3,
+    moe_flex_dispatcher_backend="hybridep",
+    moe_a2a_overlap=True,
+    cuda_graph_impl="full_iteration",
+    cuda_graph_scope=[],
+    cutedsl_fused_grouped_mlp=True,
+    fp8_dot_product_attention=True,
+)
 QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2
 
 
@@ -452,7 +469,14 @@
 QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1
 
 
-QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1
+QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = replace(
+    QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1,
+    moe_a2a_overlap=True,
+    cuda_graph_impl="full_iteration",
+    cuda_graph_scope=[],
+    cutedsl_fused_grouped_mlp=True,
+    fp8_dot_product_attention=True,
+)
 
 
 QWEN3_30B_A3B_PRETRAIN_CONFIG_H100_BF16_V1 = replace(