From 00fde6eda573682489022887e9e4100b822dcf14 Mon Sep 17 00:00:00 2001 From: Jose Luis Rivero Date: Tue, 2 Jun 2026 20:14:02 +0200 Subject: [PATCH] ci: add headless rendering smoke test (Lyrical + Gazebo Jetty) Bootstrap GitHub Actions CI for vrx4. A single job builds the workspace in the ros:lyrical-ros-base container with Gazebo Jetty (ros-gz from the ros2-testing repo) and runs a headless smoke test under Xvfb + Mesa software GL: it launches the real production launch, confirms /clock is advancing, and measures the frame rate of an observation camera. The observation camera is test-only and is not part of any shipped world. smoke_test.sh waits for the sim to come up (rclpy /clock detection), spawns the camera into the running world via ros_gz_sim create, and bridges its image just for the duration of the test, so the production world, launch, and bridge config carry no CI scaffolding. Also caches the Gazebo Fuel assets for faster, less network-dependent reruns, adds the checker's test deps to vrx_bringup, and updates the README requirement from ROS 2 Rolling to ROS 2 Lyrical. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Jose Luis Rivero --- .github/workflows/ci.yml | 68 +++++++++++++++++ README.md | 4 +- tools/check_render.py | 144 +++++++++++++++++++++++++++++++++++ tools/observation_camera.sdf | 29 +++++++ tools/smoke_test.sh | 121 +++++++++++++++++++++++++++++ vrx_bringup/package.xml | 5 ++ 6 files changed, 369 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100755 tools/check_render.py create mode 100644 tools/observation_camera.sdf create mode 100755 tools/smoke_test.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..b7bd95b4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,68 @@ +name: vrx4 CI + +on: + push: + branches: [jrivero/vrx4_ci_clean] + pull_request: + branches: [vrx4] + workflow_dispatch: + +jobs: + smoke: + name: Smoke xvfb (Lyrical + Jetty) + runs-on: ubuntu-latest + timeout-minutes: 45 + container: + image: ros:lyrical-ros-base + env: + DEBIAN_FRONTEND: noninteractive + steps: + - name: Headless GL + ros2-testing repo + checkout deps + run: | + apt-get update + apt-get install -y --no-install-recommends \ + ca-certificates git \ + xvfb libgl1-mesa-dri libglx-mesa0 \ + mesa-utils \ + ros2-testing-apt-source + + - uses: actions/checkout@v4 + with: + path: src/vrx + + # open_water.sdf pulls the Portuguese Ledge tile from Gazebo Fuel on first + # run; cache it so reruns are fast and less network-dependent. + - name: Cache Gazebo Fuel assets + uses: actions/cache@v4 + with: + # In a job container HOME is /github/home, so Gazebo writes its Fuel + # cache to /github/home/.gz (not /root/.gz). + path: /github/home/.gz/fuel + key: gz-fuel-${{ hashFiles('src/vrx/vrx_gazebo/worlds/*.sdf') }} + restore-keys: gz-fuel- + + - name: Install ros-gz (Gazebo Jetty) + workspace deps + run: | + apt-get update + # ros-gz-sim / ros-gz-bridge pull the gz-*-vendor packages, i.e. + # Gazebo Jetty built for Lyrical, from ros2-testing. + apt-get install -y --no-install-recommends \ + ros-lyrical-ros-gz-sim ros-lyrical-ros-gz-bridge + rosdep update + rosdep install --from-paths src --ignore-src -y --rosdistro lyrical \ + --skip-keys "ros_gz_sim ros_gz_bridge" + + - name: Build + run: | + . /opt/ros/lyrical/setup.sh + colcon build --merge-install + # colcon exits 0 (and still writes install/setup.sh) even if it built + # nothing, so assert the package we actually launch was installed. + test -d install/share/vrx_bringup \ + || { echo "colcon did not build vrx_bringup" >&2; exit 1; } + + - name: Headless smoke test + camera FPS (xvfb) + run: | + . /opt/ros/lyrical/setup.sh + . install/setup.sh + bash src/vrx/tools/smoke_test.sh diff --git a/README.md b/README.md index 5ebd6cba..7ea4df01 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ included yet — this is the starting point for a fresh rebuild. ## Requirements -- ROS 2 Rolling +- ROS 2 Lyrical - Gazebo Jetty -- `ros_gz` (matching the Rolling / Jetty pairing) +- `ros_gz` (matching the Lyrical / Jetty pairing) ## Packages diff --git a/tools/check_render.py b/tools/check_render.py new file mode 100755 index 00000000..c9839956 --- /dev/null +++ b/tools/check_render.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +"""Smoke check for the VRX simulation: sim is stepping AND rendering works. + +Confirms two things over a measurement window: + 1. /clock advances (the physics/simulation is actually stepping). + 2. the observation camera publishes images, and reports the achieved frame + rate (FPS). + +Subscribes with BEST_EFFORT QoS so it receives from either reliable or +best-effort publishers. Exit 0 if the clock advanced and at least +--min-frames camera frames arrived; 1 on timeout or insufficient frames. +""" + +import sys +import time +import argparse +from collections.abc import Callable + +import rclpy +from rclpy.node import Node +from rclpy.qos import QoSProfile, ReliabilityPolicy, HistoryPolicy +from rosgraph_msgs.msg import Clock +from sensor_msgs.msg import Image + + +class RenderWatcher(Node): + def __init__(self, image_topic: str) -> None: + super().__init__("vrx_smoke_render_watcher") + qos = QoSProfile( + reliability=ReliabilityPolicy.BEST_EFFORT, + history=HistoryPolicy.KEEP_LAST, + depth=10, + ) + self.latest_clock_ns: int | None = None + self.image_count: int = 0 + self.create_subscription(Clock, "/clock", self._on_clock, qos) + self.create_subscription(Image, image_topic, self._on_image, qos) + + def _on_clock(self, msg: Clock) -> None: + self.latest_clock_ns = msg.clock.sec * 1_000_000_000 + msg.clock.nanosec + + def _on_image(self, msg: Image) -> None: + self.image_count += 1 + + +def _spin_until(node: Node, predicate: Callable[[], bool], timeout: float) -> bool: + end = time.monotonic() + timeout + while time.monotonic() < end: + rclpy.spin_once(node, timeout_sec=0.1) + if predicate(): + return True + return predicate() + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--image-topic", default="/observation_camera/image") + parser.add_argument( + "--first-timeout", + type=float, + default=90.0, + help="seconds to wait for the first /clock message (covers Fuel download " + "and render-engine init)", + ) + parser.add_argument( + "--measure-secs", + type=float, + default=10.0, + help="wall-clock window over which to count camera frames", + ) + parser.add_argument("--min-frames", type=int, default=1) + parser.add_argument( + "--wait-only", + action="store_true", + help="exit 0 as soon as /clock is seen and skip the frame measurement; " + "used to gate the runtime camera spawn in CI", + ) + args = parser.parse_args() + + rclpy.init() + node = None + try: + node = RenderWatcher(args.image_topic) + + if not _spin_until( + node, + lambda: node.latest_clock_ns is not None, + args.first_timeout, + ): + node.get_logger().error( + f"No /clock within {args.first_timeout:.0f}s — " + "sim/bridge did not come up." + ) + return 1 + + if args.wait_only: + node.get_logger().info("/clock detected; simulation is up.") + return 0 + + # Measure over a fixed wall-clock window: spin (to drain image callbacks) + # for the whole window, counting frames. + clock_at_start = node.latest_clock_ns + node.image_count = 0 + t0 = time.monotonic() + _spin_until(node, lambda: False, args.measure_secs) + elapsed = time.monotonic() - t0 + + clock_advanced = ( + node.latest_clock_ns is not None + and clock_at_start is not None + and node.latest_clock_ns > clock_at_start + ) + frames = node.image_count + fps = frames / elapsed if elapsed > 0 else 0.0 + + node.get_logger().info( + f"clock advancing: {clock_advanced} " + f"({clock_at_start} -> {node.latest_clock_ns} ns)" + ) + node.get_logger().info( + f"CAMERA FPS: {fps:.2f} ({frames} frames on " + f"{args.image_topic} over {elapsed:.1f}s)" + ) + + if not clock_advanced: + node.get_logger().error("FAIL: /clock did not advance (paused?).") + return 1 + if frames < args.min_frames: + node.get_logger().error( + f"FAIL: only {frames} camera frame(s) " + f"(< {args.min_frames}) — rendering not producing images." + ) + return 1 + + node.get_logger().info("PASS: sim stepping and camera rendering.") + return 0 + finally: + if node is not None: + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/observation_camera.sdf b/tools/observation_camera.sdf new file mode 100644 index 00000000..d4328a37 --- /dev/null +++ b/tools/observation_camera.sdf @@ -0,0 +1,29 @@ + + + + + true + + + observation_camera/image + 30 + 1 + + 1.047 + + 640 + 480 + + + 0.1 + 2000 + + + + + + diff --git a/tools/smoke_test.sh b/tools/smoke_test.sh new file mode 100755 index 00000000..cfdfe3ef --- /dev/null +++ b/tools/smoke_test.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash +# Headless smoke test for vrx4: GLX rendering under a virtual X server (Xvfb) +# with Mesa software GL (llvmpipe). It launches the real production launch +# headless, then spawns a test-only observation camera into the running world +# (the camera is deliberately NOT part of the production world), bridges its +# image to ROS, and confirms /clock is advancing AND the camera is producing +# frames, reporting the achieved FPS. Run inside a built workspace with ROS 2 +# Lyrical + vendored Gazebo Jetty already sourced. +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORLD="${VRX_SMOKE_WORLD:-open_water.sdf}" +# World *name* (the attribute), needed to address the +# entity-spawn service /world//create. open_water.sdf is "default". +WORLD_NAME="${VRX_SMOKE_WORLD_NAME:-default}" +CAMERA_SDF="${VRX_SMOKE_CAMERA_SDF:-${SCRIPT_DIR}/observation_camera.sdf}" +IMAGE_TOPIC="${VRX_SMOKE_IMAGE_TOPIC:-/observation_camera/image}" +MIN_FRAMES="${VRX_SMOKE_MIN_FRAMES:-5}" +LOG="$(mktemp -t vrx_smoke_launch.XXXXXX.log)" || { echo "mktemp failed" >&2; exit 1; } + +# Isolate this run's ROS graph so a stray /clock from another process on the +# runner (a leftover sim, a rosbag) can't satisfy the check. Launch and checker +# both inherit this, so they still talk to each other. +export ROS_DOMAIN_ID="${ROS_DOMAIN_ID:-$(( $$ % 100 + 1 ))}" + +# Fake X display + Mesa software GL (llvmpipe), GLX rendering path. Exercises the +# project's real composable launch (gazebo_gui:=false for headless). +export LIBGL_ALWAYS_SOFTWARE=1 +export GALLIUM_DRIVER="${GALLIUM_DRIVER:-llvmpipe}" +LAUNCH=(ros2 launch vrx_bringup simulation.launch.xml + gazebo_gui:=false world:="${WORLD}") + +LAUNCH_PID="" +BRIDGE_PID="" +cleanup() { + [[ -n "${BRIDGE_PID}" ]] && kill "${BRIDGE_PID}" 2>/dev/null || true + if [[ -n "${LAUNCH_PID}" ]]; then + kill -- "-${LAUNCH_PID}" 2>/dev/null || true + wait "${LAUNCH_PID}" 2>/dev/null || true + fi + rm -f "${LOG}" +} +trap cleanup EXIT + +echo "[smoke] launching simulation headless (world=${WORLD})..." +# setsid -> own session/process group so cleanup can kill the whole tree. +setsid xvfb-run -a "${LAUNCH[@]}" >"${LOG}" 2>&1 & +LAUNCH_PID=$! + +# Fail fast if the launcher could not even fork/exec. +if ! kill -0 "${LAUNCH_PID}" 2>/dev/null; then + echo "[smoke] FAILED: launch process did not start." >&2 + cat "${LOG}" >&2 + exit 1 +fi + +# Wait for the sim to be up before injecting the camera: a /clock message means +# the world finished loading (Fuel download on first run) and is stepping, so +# the /world//create service is registered. Reuse the checker's proven +# rclpy /clock detection (--wait-only) instead of `ros2 topic echo`. +echo "[smoke] waiting for the simulation to come up (/clock)..." +if ! python3 "${SCRIPT_DIR}/check_render.py" --wait-only --first-timeout 180; then + echo "[smoke] FAILED: sim/bridge did not come up (no /clock)." >&2 + cat "${LOG}" >&2 + exit 1 +fi + +# Spawn the test-only observation camera into the now-loaded world. Pose is +# given here (not in the model SDF) so placement — a test concern — lives with +# the test. The Sensors system already loaded by the world renders it. A short +# retry covers the race between /clock and the create service registering. +echo "[smoke] spawning observation camera into world '${WORLD_NAME}'..." +spawned=0 +for _ in 1 2 3 4 5; do + if ros2 run ros_gz_sim create \ + -world "${WORLD_NAME}" \ + -file "${CAMERA_SDF}" \ + -name observation_camera \ + -x 8 -y 8 -z 10 -R 0 -P 0.5 -Y -2.36 >>"${LOG}" 2>&1; then + spawned=1 + break + fi + sleep 2 +done +if [[ ${spawned} -ne 1 ]]; then + echo "[smoke] FAILED: could not spawn the observation camera." >&2 + cat "${LOG}" >&2 + exit 1 +fi + +# Bridge the camera image to ROS just for this test (the production bridge config +# stays free of CI-only topics). +echo "[smoke] bridging ${IMAGE_TOPIC} (test-only)..." +ros2 run ros_gz_bridge parameter_bridge \ + "${IMAGE_TOPIC}@sensor_msgs/msg/Image[gz.msgs.Image" >>"${LOG}" 2>&1 & +BRIDGE_PID=$! + +echo "[smoke] measuring /clock advance + camera FPS..." +# Capture rc inline (set -e is off) so no later edit can clobber $? before it. +rc=0 +python3 "${SCRIPT_DIR}/check_render.py" \ + --image-topic "${IMAGE_TOPIC}" \ + --min-frames "${MIN_FRAMES}" || rc=$? + +# Best-effort crash signal: ros2 launch keeps running if a child dies, so this +# only fires when the whole launch tree is already gone — but when it does, the +# failure was a crashed/failed launch, not a slow or paused sim. +if [[ ${rc} -ne 0 ]] && ! kill -0 "${LAUNCH_PID}" 2>/dev/null; then + echo "[smoke] launch process exited before the check completed (crash)." >&2 +fi + +if [[ ${rc} -ne 0 ]]; then + echo "[smoke] FAILED. Launch output follows:" >&2 + echo "----------------------------------------" >&2 + cat "${LOG}" >&2 + echo "----------------------------------------" >&2 + exit "${rc}" +fi + +echo "[smoke] PASSED." +exit 0 diff --git a/vrx_bringup/package.xml b/vrx_bringup/package.xml index 2791cd93..9ff7ae50 100644 --- a/vrx_bringup/package.xml +++ b/vrx_bringup/package.xml @@ -15,6 +15,11 @@ ros_gz_bridge vrx_gazebo + + rclpy + rosgraph_msgs + sensor_msgs + ament_lint_auto ament_lint_common