Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/nccl
Submodule nccl updated 477 files
9 changes: 9 additions & 0 deletions examples/jax/ep/bench/run_ep_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ NUM_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
if [ "${NUM_GPUS}" -lt 4 ]; then
echo "EP bench requires >=4 GPUs (found ${NUM_GPUS}); SKIPPING."; exit 0
fi

# NCCL EP requires NVLink P2P among ranks on the node.
# The probe runs inside the `if` condition on purpose: a failing command in a
# condition never triggers errexit, so the skip path below stays reachable even
# if this script is run with `set -e`. As a standalone assignment, a non-zero
# exit from nvidia-smi would abort such a script before the status was checked.
# Skip when the probe fails, prints nothing, or reports that NVLink is
# unsupported / that there are no devices.
if ! NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) \
    || [[ -z "$NVLINK_OUTPUT" ]] \
    || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \
    || [[ "$NVLINK_OUTPUT" == *"No devices"* ]]; then
    echo "NVLink not detected on this platform; SKIPPING."
    exit 0
fi

NUM=4
COORD="${COORD:-127.0.0.1:23457}"
TIMEOUT_S="${TIMEOUT_S:-1800}"
Expand Down
56 changes: 33 additions & 23 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,12 @@ def setup_requirements() -> Tuple[List[str], List[str]]:


def _discover_nccl_home() -> str:
"""Resolve NCCL_HOME: honor env var, else probe well-known prefixes, else ldconfig."""
"""Resolve NCCL_HOME, preferring the NCCL the dynamic loader resolves at runtime.

Probes in order: NCCL_HOME env var, ldconfig cache, well-known prefixes, then a
pip-installed nvidia-nccl-cu* wheel. To test a non-default NCCL (e.g. a wheel), set
NCCL_HOME and ensure the runtime loader resolves the same lib (e.g. LD_LIBRARY_PATH).
"""
env_home = os.environ.get("NCCL_HOME")
if env_home:
if (Path(env_home) / "include" / "nccl.h").exists():
Expand All @@ -152,28 +157,11 @@ def _discover_nccl_home() -> str:
# Include Debian/Ubuntu multiarch subdirs (e.g. lib/aarch64-linux-gnu).
lib_subdirs = ("lib", "lib64", "lib/aarch64-linux-gnu", "lib/x86_64-linux-gnu")

# pip-installed NCCL (nvidia-nccl-cu* wheel) lives under nvidia/nccl in
# site-packages and has no top-level include/lib layout.
try:
import importlib.util

spec = importlib.util.find_spec("nvidia.nccl")
if spec is not None and spec.submodule_search_locations:
pip_root = Path(next(iter(spec.submodule_search_locations)))
if (pip_root / "include" / "nccl.h").exists() and any(
(pip_root / sub / name).exists() for sub in lib_subdirs for name in lib_names
):
return str(pip_root)
except (ImportError, ValueError):
pass

for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"):
p = Path(cand)
if (p / "include" / "nccl.h").exists() and any(
(p / sub / name).exists() for sub in lib_subdirs for name in lib_names
):
return str(p)

# Prefer the NCCL the dynamic loader will actually resolve at runtime so the
# EP build links against the same libnccl that gets loaded. libtransformer_engine
# carries no NCCL RUNPATH, so the loader uses ldconfig/system paths; building
# against a different NCCL (e.g. a pip wheel) causes ABI mismatches. ldconfig is
# the ground truth for runtime resolution, so consult it before well-known prefixes.
try:
out = subprocess.check_output(["ldconfig", "-p"], stderr=subprocess.DEVNULL).decode()
for line in out.splitlines():
Expand All @@ -187,6 +175,28 @@ def _discover_nccl_home() -> str:
except (subprocess.CalledProcessError, FileNotFoundError):
pass

for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"):
p = Path(cand)
if (p / "include" / "nccl.h").exists() and any(
(p / sub / name).exists() for sub in lib_subdirs for name in lib_names
):
return str(p)

# Fall back to a pip-installed NCCL (nvidia-nccl-cu* wheel) under nvidia/nccl
# in site-packages, used only when no system NCCL is present.
try:
import importlib.util

spec = importlib.util.find_spec("nvidia.nccl")
if spec is not None and spec.submodule_search_locations:
pip_root = Path(next(iter(spec.submodule_search_locations)))
if (pip_root / "include" / "nccl.h").exists() and any(
(pip_root / sub / name).exists() for sub in lib_subdirs for name in lib_names
):
return str(pip_root)
except (ImportError, ValueError):
pass

raise RuntimeError(
"Could not locate NCCL core (nccl.h + libnccl.so). Set NCCL_HOME to the install prefix."
)
Expand Down
8 changes: 8 additions & 0 deletions tests/cpp_distributed/run_test_ep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ if (( MIN_SM > 0 && MIN_SM < 90 )); then
exit 0
fi

# NCCL EP requires NVLink P2P among ranks on the node.
# The probe runs inside the `if` condition on purpose: a failing command in a
# condition never triggers errexit, so the skip path below stays reachable even
# if this script is run with `set -e`. As a standalone assignment, a non-zero
# exit from nvidia-smi would abort such a script before the status was checked.
# Skip when the probe fails, prints nothing, or reports that NVLink is
# unsupported / that there are no devices.
if ! NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) \
    || [[ -z "$NVLINK_OUTPUT" ]] \
    || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \
    || [[ "$NVLINK_OUTPUT" == *"No devices"* ]]; then
    echo "NVLink not detected on this platform; SKIPPING."
    exit 0
fi

TEST_BIN="${BUILD_DIR}/test_ep"
if [[ ! -x "${TEST_BIN}" ]]; then
echo "ERROR: binary not found: ${TEST_BIN}"
Expand Down
9 changes: 9 additions & 0 deletions tests/jax/multi_process_launch_ep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ if [ "${NUM_RUNS}" -lt 4 ]; then
echo "NCCL EP requires at least 4 GPUs (found ${NUM_RUNS}); SKIPPING."
exit 0
fi

# NCCL EP requires NVLink P2P among ranks on the node.
# The probe runs inside the `if` condition on purpose: a failing command in a
# condition never triggers errexit, so the skip path below stays reachable even
# if this script is run with `set -e`. As a standalone assignment, a non-zero
# exit from nvidia-smi would abort such a script before the status was checked.
# Skip when the probe fails, prints nothing, or reports that NVLink is
# unsupported / that there are no devices.
if ! NVLINK_OUTPUT=$(nvidia-smi nvlink --status 2>&1) \
    || [[ -z "$NVLINK_OUTPUT" ]] \
    || [[ "$NVLINK_OUTPUT" == *"not supported"* ]] \
    || [[ "$NVLINK_OUTPUT" == *"No devices"* ]]; then
    echo "NVLink not detected on this platform; SKIPPING."
    exit 0
fi

# Default test mesh is (2, 2); use exactly 4 ranks even on larger boxes.
NUM_RUNS="${NVTE_TEST_EP_NUM_RANKS:-4}"

Expand Down
Loading