Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/nccl
Submodule nccl updated 477 files
7 changes: 7 additions & 0 deletions examples/jax/ep/bench/run_ep_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
# The EP bench needs at least 4 GPUs; on smaller nodes, report the count
# found and exit successfully so the run is treated as a skip, not a failure.
if test "${NUM_GPUS}" -lt 4
then
  echo "EP bench requires >=4 GPUs (found ${NUM_GPUS}); SKIPPING."
  exit 0
fi

# NCCL EP requires active NVLink P2P among ranks on the node.
#
# Count the "Link <n>: ... GB/s" lines printed by `nvidia-smi nvlink --status`
# (presumably one per active link — confirm against the nvidia-smi version in
# use) and skip when there are none.
#
# `grep -c` consumes the whole stream, so nvidia-smi is never killed by
# SIGPIPE the way it can be with an early-exiting `grep -q`; under
# `set -o pipefail` that would make the pipeline fail and wrongly skip
# NVLink-capable nodes. `|| true` keeps `set -e` from aborting when nothing
# matches or nvidia-smi is unavailable (grep still prints 0 in that case).
NVLINK_ACTIVE_LINKS="$(nvidia-smi nvlink --status 2>/dev/null \
  | grep -cE 'Link [0-9]+:.*GB/s' || true)"
if [ "${NVLINK_ACTIVE_LINKS:-0}" -eq 0 ]; then
  echo "NVLink not detected on this platform — EP bench requires NVLink; SKIPPING."
  exit 0
fi

# Fixed value used by the rest of the script (presumably the number of ranks
# to launch — confirm against the code that consumes NUM).
NUM=4
# Caller-overridable settings: keep a non-empty value from the environment,
# otherwise fall back to the default shown.
: "${COORD:=127.0.0.1:23457}"
: "${TIMEOUT_S:=1800}"
Expand Down
6 changes: 6 additions & 0 deletions tests/cpp_distributed/run_test_ep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ if (( MIN_SM > 0 && MIN_SM < 90 )); then
exit 0
fi

# NCCL EP requires active NVLink P2P among ranks on the node.
#
# Count the "Link <n>: ... GB/s" lines printed by `nvidia-smi nvlink --status`
# (presumably one per active link — confirm against the nvidia-smi version in
# use) and skip when there are none.
#
# `grep -c` consumes the whole stream, so nvidia-smi is never killed by
# SIGPIPE the way it can be with an early-exiting `grep -q`; under
# `set -o pipefail` that would make the pipeline fail and wrongly skip
# NVLink-capable nodes. `|| true` keeps `set -e` from aborting when nothing
# matches or nvidia-smi is unavailable (grep still prints 0 in that case).
NVLINK_ACTIVE_LINKS="$(nvidia-smi nvlink --status 2>/dev/null \
  | grep -cE 'Link [0-9]+:.*GB/s' || true)"
if [ "${NVLINK_ACTIVE_LINKS:-0}" -eq 0 ]; then
  echo "NVLink not detected on this platform; SKIPPING."
  exit 0
fi

TEST_BIN="${BUILD_DIR}/test_ep"
if [[ ! -x "${TEST_BIN}" ]]; then
echo "ERROR: binary not found: ${TEST_BIN}"
Expand Down
7 changes: 7 additions & 0 deletions tests/jax/multi_process_launch_ep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ if [ "${NUM_RUNS}" -lt 4 ]; then
echo "NCCL EP requires at least 4 GPUs (found ${NUM_RUNS}); SKIPPING."
exit 0
fi

# NCCL EP requires active NVLink P2P among ranks on the node.
#
# Count the "Link <n>: ... GB/s" lines printed by `nvidia-smi nvlink --status`
# (presumably one per active link — confirm against the nvidia-smi version in
# use) and skip when there are none.
#
# `grep -c` consumes the whole stream, so nvidia-smi is never killed by
# SIGPIPE the way it can be with an early-exiting `grep -q`; under
# `set -o pipefail` that would make the pipeline fail and wrongly skip
# NVLink-capable nodes. `|| true` keeps `set -e` from aborting when nothing
# matches or nvidia-smi is unavailable (grep still prints 0 in that case).
NVLINK_ACTIVE_LINKS="$(nvidia-smi nvlink --status 2>/dev/null \
  | grep -cE 'Link [0-9]+:.*GB/s' || true)"
if [ "${NVLINK_ACTIVE_LINKS:-0}" -eq 0 ]; then
  echo "NVLink not detected on this platform — EP test requires NVLink; SKIPPING."
  exit 0
fi

# The default test mesh is (2, 2), so launch 4 ranks even on nodes with more
# GPUs; a non-empty NVTE_TEST_EP_NUM_RANKS overrides that count.
NUM_RUNS=${NVTE_TEST_EP_NUM_RANKS:-4}

Expand Down
4 changes: 2 additions & 2 deletions tests/pytorch/distributed/run_test_ep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ if [ "${DETECTED_GPUS}" -lt 4 ]; then
exit 0
fi

# NCCL EP requires NVLink/NVSwitch between GPUs.
# NCCL EP requires active NVLink P2P among ranks on the node.
# On PCIe-only nodes (no NVLink) it falls back to the network
# transport and deadlocks, so skip cleanly there.
if ! nvidia-smi topo -m 2>/dev/null | grep -qE "\bNV[0-9]+\b"; then
if ! nvidia-smi nvlink --status 2>/dev/null | grep -qE 'Link [0-9]+:.*GB/s'; then
echo "No NVLink between GPUs (PCIe-only fabric); NCCL EP is unsupported here. SKIPPING."
exit 0
fi
Expand Down
Loading