From f1618df3530a83355212005485d4dc99b616f1e8 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Fri, 26 Jun 2026 03:13:31 -0700 Subject: [PATCH 1/5] update nccl Signed-off-by: Phuong Nguyen --- 3rdparty/nccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/nccl b/3rdparty/nccl index 808d2433dd..696e971fb0 160000 --- a/3rdparty/nccl +++ b/3rdparty/nccl @@ -1 +1 @@ -Subproject commit 808d2433dda3cccc80f8172a94a6b117359e7102 +Subproject commit 696e971fb092a26f81a8c24b432beecdbbe3064e From a810ad65571b695ef171c27f2d2d258d10ce5a74 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Sun, 28 Jun 2026 23:14:50 -0700 Subject: [PATCH 2/5] nccl with relax num_dispatch_tokens%64!=0 Signed-off-by: Phuong Nguyen --- 3rdparty/nccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/nccl b/3rdparty/nccl index 696e971fb0..a6b5de08b6 160000 --- a/3rdparty/nccl +++ b/3rdparty/nccl @@ -1 +1 @@ -Subproject commit 696e971fb092a26f81a8c24b432beecdbbe3064e +Subproject commit a6b5de08b6af4f938cef541ae6e4d405632f89a4 From 57a601325bb2deb9f97d41343d90f1c8db3e738a Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 29 Jun 2026 01:30:22 -0700 Subject: [PATCH 3/5] Skip EP tests/examples on nodes without NVLink Signed-off-by: Phuong Nguyen --- examples/jax/ep/bench/run_ep_bench.sh | 9 +++++++++ tests/cpp_distributed/run_test_ep.sh | 8 ++++++++ tests/jax/multi_process_launch_ep.sh | 9 +++++++++ 3 files changed, 26 insertions(+) diff --git a/examples/jax/ep/bench/run_ep_bench.sh b/examples/jax/ep/bench/run_ep_bench.sh index 1531dfd5cf..63ada44101 100755 --- a/examples/jax/ep/bench/run_ep_bench.sh +++ b/examples/jax/ep/bench/run_ep_bench.sh @@ -47,6 +47,15 @@ NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l) if [ "${NUM_GPUS}" -lt 4 ]; then echo "EP bench requires >=4 GPUs (found ${NUM_GPUS}); SKIPPING."; exit 0 fi + +# NCCL EP requires NVLink P2P among ranks on the node. +NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) +if [ $? -ne 0 ] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ + || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [ -z "$NVLINK_OUTPUT" ]; then + echo "NVLink not detected on this platform — EP bench requires NVLink; SKIPPING." + exit 0 +fi + NUM=4 COORD="${COORD:-127.0.0.1:23457}" TIMEOUT_S="${TIMEOUT_S:-1800}" diff --git a/tests/cpp_distributed/run_test_ep.sh b/tests/cpp_distributed/run_test_ep.sh index d486d45f8a..514bd10c93 100755 --- a/tests/cpp_distributed/run_test_ep.sh +++ b/tests/cpp_distributed/run_test_ep.sh @@ -35,6 +35,14 @@ if (( MIN_SM > 0 && MIN_SM < 90 )); then exit 0 fi +# NCCL EP requires NVLink P2P among ranks on the node. +NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) +if [[ $? -ne 0 ]] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ + || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [[ -z "$NVLINK_OUTPUT" ]]; then + echo "NVLink not detected on this platform; SKIPPING." + exit 0 +fi + TEST_BIN="${BUILD_DIR}/test_ep" if [[ ! -x "${TEST_BIN}" ]]; then echo "ERROR: binary not found: ${TEST_BIN}" diff --git a/tests/jax/multi_process_launch_ep.sh b/tests/jax/multi_process_launch_ep.sh index d32ce5f5d3..44c6bd6bc0 100755 --- a/tests/jax/multi_process_launch_ep.sh +++ b/tests/jax/multi_process_launch_ep.sh @@ -32,6 +32,15 @@ if [ "${NUM_RUNS}" -lt 4 ]; then echo "NCCL EP requires at least 4 GPUs (found ${NUM_RUNS}); SKIPPING." exit 0 fi + +# NCCL EP requires NVLink P2P among ranks on the node. +NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) +if [ $? -ne 0 ] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ + || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [ -z "$NVLINK_OUTPUT" ]; then + echo "NVLink not detected on this platform — EP test requires NVLink; SKIPPING." + exit 0 +fi + # Default test mesh is (2, 2); use exactly 4 ranks even on larger boxes. NUM_RUNS="${NVTE_TEST_EP_NUM_RANKS:-4}" From 8c803f6d16a1fa4ef60c5885cc97aef6e0aba6d2 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 29 Jun 2026 06:39:14 -0700 Subject: [PATCH 4/5] cleanup Signed-off-by: Phuong Nguyen --- examples/jax/ep/bench/run_ep_bench.sh | 2 +- tests/jax/multi_process_launch_ep.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/jax/ep/bench/run_ep_bench.sh b/examples/jax/ep/bench/run_ep_bench.sh index 63ada44101..eb3e980876 100755 --- a/examples/jax/ep/bench/run_ep_bench.sh +++ b/examples/jax/ep/bench/run_ep_bench.sh @@ -52,7 +52,7 @@ fi NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) if [ $? -ne 0 ] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [ -z "$NVLINK_OUTPUT" ]; then - echo "NVLink not detected on this platform — EP bench requires NVLink; SKIPPING." + echo "NVLink not detected on this platform; SKIPPING." exit 0 fi diff --git a/tests/jax/multi_process_launch_ep.sh b/tests/jax/multi_process_launch_ep.sh index 44c6bd6bc0..4f53613d0e 100755 --- a/tests/jax/multi_process_launch_ep.sh +++ b/tests/jax/multi_process_launch_ep.sh @@ -37,7 +37,7 @@ fi NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) if [ $? -ne 0 ] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [ -z "$NVLINK_OUTPUT" ]; then - echo "NVLink not detected on this platform — EP test requires NVLink; SKIPPING." + echo "NVLink not detected on this platform; SKIPPING." exit 0 fi From d4055aa683db23891fdbbfcc73851f52f5b59e8e Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Mon, 29 Jun 2026 11:47:27 -0700 Subject: [PATCH 5/5] Detect active NVLink via nvlink --status link bandwidth in EP scripts Signed-off-by: Phuong Nguyen --- examples/jax/ep/bench/run_ep_bench.sh | 8 +++----- tests/cpp_distributed/run_test_ep.sh | 6 ++---- tests/jax/multi_process_launch_ep.sh | 8 +++----- tests/pytorch/distributed/run_test_ep.sh | 4 ++-- 4 files changed, 10 insertions(+), 16 deletions(-) diff --git a/examples/jax/ep/bench/run_ep_bench.sh b/examples/jax/ep/bench/run_ep_bench.sh index eb3e980876..63133156eb 100755 --- a/examples/jax/ep/bench/run_ep_bench.sh +++ b/examples/jax/ep/bench/run_ep_bench.sh @@ -48,11 +48,9 @@ if [ "${NUM_GPUS}" -lt 4 ]; then echo "EP bench requires >=4 GPUs (found ${NUM_GPUS}); SKIPPING."; exit 0 fi -# NCCL EP requires NVLink P2P among ranks on the node. -NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) -if [ $? -ne 0 ] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ - || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [ -z "$NVLINK_OUTPUT" ]; then - echo "NVLink not detected on this platform; SKIPPING." +# NCCL EP requires active NVLink P2P among ranks on the node. +if ! nvidia-smi nvlink --status 2>/dev/null | grep -qE 'Link [0-9]+:.*GB/s'; then + echo "NVLink not detected on this platform — EP bench requires NVLink; SKIPPING." exit 0 fi diff --git a/tests/cpp_distributed/run_test_ep.sh b/tests/cpp_distributed/run_test_ep.sh index 514bd10c93..da293dadfd 100755 --- a/tests/cpp_distributed/run_test_ep.sh +++ b/tests/cpp_distributed/run_test_ep.sh @@ -35,10 +35,8 @@ if (( MIN_SM > 0 && MIN_SM < 90 )); then exit 0 fi -# NCCL EP requires NVLink P2P among ranks on the node. -NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) -if [[ $? -ne 0 ]] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ - || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [[ -z "$NVLINK_OUTPUT" ]]; then +# NCCL EP requires active NVLink P2P among ranks on the node. +if ! nvidia-smi nvlink --status 2>/dev/null | grep -qE 'Link [0-9]+:.*GB/s'; then echo "NVLink not detected on this platform; SKIPPING." exit 0 fi diff --git a/tests/jax/multi_process_launch_ep.sh b/tests/jax/multi_process_launch_ep.sh index 4f53613d0e..ff89f712eb 100755 --- a/tests/jax/multi_process_launch_ep.sh +++ b/tests/jax/multi_process_launch_ep.sh @@ -33,11 +33,9 @@ if [ "${NUM_RUNS}" -lt 4 ]; then exit 0 fi -# NCCL EP requires NVLink P2P among ranks on the node. -NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) -if [ $? -ne 0 ] || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \ - || [[ "$NVLINK_OUTPUT" == *"No devices"* ]] || [ -z "$NVLINK_OUTPUT" ]; then - echo "NVLink not detected on this platform; SKIPPING." +# NCCL EP requires active NVLink P2P among ranks on the node. +if ! nvidia-smi nvlink --status 2>/dev/null | grep -qE 'Link [0-9]+:.*GB/s'; then + echo "NVLink not detected on this platform — EP test requires NVLink; SKIPPING." exit 0 fi diff --git a/tests/pytorch/distributed/run_test_ep.sh b/tests/pytorch/distributed/run_test_ep.sh index ae40c8ba4b..68b691f787 100755 --- a/tests/pytorch/distributed/run_test_ep.sh +++ b/tests/pytorch/distributed/run_test_ep.sh @@ -18,10 +18,10 @@ if [ "${DETECTED_GPUS}" -lt 4 ]; then exit 0 fi -# NCCL EP requires NVLink/NVSwitch between GPUs. +# NCCL EP requires active NVLink P2P among ranks on the node. # On PCIe-only nodes (no NVLink) it falls back to the network # transport and deadlocks, so skip cleanly there. -if ! nvidia-smi topo -m 2>/dev/null | grep -qE "\bNV[0-9]+\b"; then +if ! nvidia-smi nvlink --status 2>/dev/null | grep -qE 'Link [0-9]+:.*GB/s'; then echo "No NVLink between GPUs (PCIe-only fabric); NCCL EP is unsupported here. SKIPPING." exit 0 fi