diff --git a/.bumpversion.toml b/.bumpversion.toml
index 91f59c6..2905593 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.3"
+current_version = "1.2.6-dev8"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 24d8fff..ad103ff 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
         args: ["--maxkb=10000"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.11.0
+    rev: v0.15.2
     hooks:
       # Run the linter.
       - id: ruff
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 918a25f..de11a99 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -10,5 +10,6 @@
     }
   },
   "python.analysis.typeCheckingMode": "standard",
-  "python.testing.pytestEnabled": true
+  "python.testing.pytestEnabled": true,
+  "ruff.format.preview": false
 }
diff --git a/examples/signal_filter/__init__.py b/examples/signal_filter/__init__.py
new file mode 100644
index 0000000..8fa8327
--- /dev/null
+++ b/examples/signal_filter/__init__.py
@@ -0,0 +1 @@
+"""Signal filter examples for fullwave25."""
diff --git a/examples/signal_filter/plane_wave_with_filter.py b/examples/signal_filter/plane_wave_with_filter.py
new file mode 100644
index 0000000..fd6c6c4
--- /dev/null
+++ b/examples/signal_filter/plane_wave_with_filter.py
@@ -0,0 +1,280 @@
+"""Demonstrate the built-in high-pass filter inside solver.run().
+
+Based on examples/linear_transducer/plane_wave_compounding.py.
+
+This script runs a single 0-degree plane wave transmission through a medium
+with echoic targets, then shows the effect of passing
+``highpass_cutoff_mhz=0.5`` to ``solver.run()``:
+
+  * Left panel   - raw sensor traces (PML low-frequency drift visible)
+  * Right panel  - high-pass filtered traces (drift removed)
+  * Bottom panel - amplitude spectra of a single element before/after
+
+Run with:
+    uv run python examples/signal_filter/plane_wave_with_filter.py
+"""
+
+import logging
+import shutil
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+import fullwave
+from fullwave.utils.signal_filter import apply_filter
+
+logging.getLogger("__main__").setLevel(logging.INFO)
+
+
+# ---------------------------------------------------------------------------
+# Medium helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_echoic_targets(
+    scatterer: np.ndarray,
+    grid: fullwave.Grid,
+    target_radius_m: float,
+    target_spacing_m: float,
+    n_targets_axial: int,
+    n_targets_lateral: int,
+) -> np.ndarray:
+    """Place a grid of hypo/hyper/anechoic circles in the scatterer map."""
+    total_axial = (n_targets_axial - 1) * target_spacing_m
+    total_lateral = (n_targets_lateral - 1) * target_spacing_m
+    x0 = (grid.shape[0] * grid.dx - total_axial) / 2
+    y0 = (grid.shape[1] * grid.dx - total_lateral) / 2
+
+    axial_pos = np.linspace(x0, x0 + total_axial, n_targets_axial)
+    lateral_pos = np.linspace(y0, y0 + total_lateral, n_targets_lateral)
+
+    # columns: anechoic, hypo-echoic, hyper-echoic
+    ratio = np.array(
+        [
+            [0.0, 0.5, 3.0],
+            [0.0, 0.5, 3.0],
+            [0.0, 0.5, 3.0],
+        ],
+    )
+
+    for i_ax, xp in enumerate(axial_pos):
+        for i_lat, yp in enumerate(lateral_pos):
+            xi = int(xp / grid.dx)
+            yi = int(yp / grid.dx)
+            rr, cc = np.ogrid[
+                -xi : scatterer.shape[0] - xi,
+                -yi : scatterer.shape[1] - yi,
+            ]
+            mask = rr**2 + cc**2 <= (target_radius_m / grid.dx) ** 2
+            scatterer -= 1.0
+            scatterer[mask] *= ratio[i_ax, i_lat]
+            scatterer += 1.0
+
+    return scatterer
+
+
+def _make_input_signal(
+    grid: fullwave.Grid,
+    transducer: fullwave.Transducer,
+    element_layer_px: int,
+    p_max: float = 1e5,
+) -> np.ndarray:
+    """Build a 0-degree plane wave input signal (no steering delay)."""
+    input_signal = np.zeros((transducer.n_sources, grid.nt))
+    for i in range(len(input_signal)):
+        n_y = input_signal.shape[0] // element_layer_px
+        i_layer = i // n_y
+        input_signal[i] = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+            nt=grid.nt,
+            f0=grid.f0,
+            duration=grid.duration,
+            ncycles=2,
+            drop_off=2,
+            p0=p_max,
+            i_layer=i_layer,
+            dt_for_layer_delay=grid.dt,
+            cfl_for_layer_delay=grid.cfl,
+            delay_sec=0.0,
+        )
+    return input_signal
+
+
+# ---------------------------------------------------------------------------
+# Simulation sub-steps
+# ---------------------------------------------------------------------------
+
+
+def _build_medium(
+    grid: fullwave.Grid,
+    c0: float,
+) -> fullwave.Medium:
+    """Build the acoustic medium with echoic scattering targets."""
+    rng = np.random.default_rng(42)
+    scatterer, _ = fullwave.utils.generate_scatterer(
+        grid=grid,
+        ratio_scatterer_to_total_grid=0.38,
+        scatter_value_std=0.02 / 2,
+        rng=rng,
+    )
+    scatterer = _make_echoic_targets(
+        scatterer,
+        grid,
+        target_radius_m=5e-3,
+        target_spacing_m=15e-3,
+        n_targets_axial=3,
+        n_targets_lateral=3,
+    )
+    return fullwave.Medium(
+        grid,
+        sound_speed=np.ones(grid.shape) * c0,
+        density=np.ones(grid.shape) * 1000 * scatterer,
+        alpha_coeff=np.ones(grid.shape) * 0.5,
+        alpha_power=np.ones(grid.shape) * 1.1,
+        beta=np.zeros(grid.shape),
+        air_map=np.zeros(grid.shape),
+    )
+
+
+def _build_transducer(
+    grid: fullwave.Grid,
+    domain_size: tuple[float, float],
+) -> tuple[fullwave.Transducer, int]:
+    """Build the 128-element linear transducer and return it with element_layer_px."""
+    element_layer_px = 4
+    transducer_width_m = 38e-3
+    transducer_geometry = fullwave.TransducerGeometry(
+        grid,
+        number_elements=128,
+        element_width_m=0.298e-3 - 0.048e-3,
+        element_spacing_m=0.048e-3,
+        element_layer_px=element_layer_px,
+        position_m=(0, (domain_size[1] - transducer_width_m) / 2),
+        radius=float("inf"),
+    )
+    transducer = fullwave.Transducer(
+        transducer_geometry=transducer_geometry,
+        grid=grid,
+        sampling_modulus_time=7,
+    )
+    transducer.set_signal(_make_input_signal(grid, transducer, element_layer_px))
+    return transducer, element_layer_px
+
+
+def _run_simulation(
+    work_dir: Path,
+    grid: fullwave.Grid,
+    medium: fullwave.Medium,
+    transducer: fullwave.Transducer,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Run the solver and return (raw_output, hp_filtered_output)."""
+    solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        transducer=transducer,
+    )
+    raw_output = solver.run(simulation_dir_name="txrx_raw", is_static_map=True)
+    raw_output = transducer.post_process_sensor_output(raw_output, average_surface_signals=True)
+    shutil.rmtree(work_dir / "txrx_raw")
+
+    # Equivalent to passing highpass_cutoff_mhz=0.5 directly to solver.run()
+    dt_rec = grid.dt * transducer.sampling_modulus_time
+    filtered_output = apply_filter(raw_output, dt=dt_rec, f_low_hz=0.5e6, use_gpu=False)
+    return raw_output, filtered_output
+
+
+def _plot_results(
+    work_dir: Path,
+    grid: fullwave.Grid,
+    transducer: fullwave.Transducer,
+    raw_output: np.ndarray,
+    filtered_output: np.ndarray,
+    f0: float,
+) -> None:
+    """Plot time traces and amplitude spectra for three representative elements."""
+    dt_rec = grid.dt * transducer.sampling_modulus_time
+    n_t_rec = raw_output.shape[1]
+    t_us = np.arange(n_t_rec) * dt_rec * 1e6
+    freqs_mhz = np.fft.rfftfreq(n_t_rec, d=dt_rec) / 1e6
+
+    def _db(sig: np.ndarray) -> np.ndarray:
+        amp = np.abs(np.fft.rfft(sig))
+        return 20 * np.log10(np.maximum(amp / (amp.max() + 1e-12), 1e-5))
+
+    n_elem = raw_output.shape[0]
+    elem_indices = [n_elem // 4, n_elem // 2, 3 * n_elem // 4]
+
+    fig, axes = plt.subplots(3, 2, figsize=(13, 10))
+    fig.suptitle("Plane wave — sensor output before / after HP filter (0.5 MHz)", fontsize=12)
+
+    for row, idx in enumerate(elem_indices):
+        ax_t, ax_f = axes[row, 0], axes[row, 1]
+        raw_trace, filt_trace = raw_output[idx], filtered_output[idx]
+
+        ax_t.plot(t_us, raw_trace, color="tab:gray", lw=0.7, alpha=0.7, label="Raw")
+        ax_t.plot(t_us, filt_trace, color="tab:orange", lw=0.9, label="HP 0.5 MHz")
+        ax_t.set_ylabel("Pressure")
+        ax_t.set_title(f"element {idx}")
+        ax_t.legend(fontsize=8)
+        ax_t.set_xlim(t_us[0], t_us[-1])
+        if row == 2:
+            ax_t.set_xlabel("Time (µs)")
+
+        ax_f.plot(freqs_mhz, _db(raw_trace), color="tab:gray", lw=0.7, alpha=0.7, label="Raw")
+        ax_f.plot(freqs_mhz, _db(filt_trace), color="tab:orange", lw=0.9, label="HP 0.5 MHz")
+        ax_f.axvline(f0 / 1e6, color="steelblue", lw=0.8, ls="--", label=f"f0={f0 / 1e6:.0f} MHz")
+        ax_f.set_xlim(0, f0 / 1e6 * 3)
+        ax_f.set_ylim(-80, 5)
+        ax_f.set_ylabel("Amplitude (dB)")
+        ax_f.legend(fontsize=8)
+        if row == 2:
+            ax_f.set_xlabel("Frequency (MHz)")
+
+    axes[0, 0].set_title(f"Time traces — {axes[0, 0].get_title()}", fontsize=10)
+    axes[0, 1].set_title(f"Amplitude spectra — {axes[0, 1].get_title()}", fontsize=10)
+    plt.tight_layout()
+    out_fig = work_dir / "sensor_before_after_filter.png"
+    plt.savefig(out_fig, dpi=150)
+    print(f"Saved figure to {out_fig}")
+
+
+def _print_summary(raw_output: np.ndarray, filtered_output: np.ndarray) -> None:
+    """Print DC drift statistics before and after filtering."""
+    drift_raw = raw_output.mean(axis=1)
+    drift_filt = filtered_output.mean(axis=1)
+    print(
+        f"\nDC drift (mean across elements):"
+        f"\n  Raw      - mean={drift_raw.mean():.4f}, std={drift_raw.std():.4f}"
+        f"\n  Filtered - mean={drift_filt.mean():.6f}, std={drift_filt.std():.6f}",
+    )
+    print(
+        "\nTip: pass highpass_cutoff_mhz=0.5 directly to solver.run() to apply"
+        " the same filter automatically before the result is returned.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Run plane wave simulation and compare raw vs high-pass filtered sensor output."""
+    work_dir = Path("./outputs/plane_wave_with_filter")
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    domain_size = (4.5e-2, 4.5e-2)  # [axial, lateral] m
+    f0 = 2e6
+    c0 = 1540
+
+    grid = fullwave.Grid(domain_size, f0, domain_size[0] / c0 * 2.3, c0=c0, ppw=12, cfl=0.4)
+    medium = _build_medium(grid, c0)
+    transducer, _ = _build_transducer(grid, domain_size)
+    raw_output, filtered_output = _run_simulation(work_dir, grid, medium, transducer)
+    _plot_results(work_dir, grid, transducer, raw_output, filtered_output, f0)
+    _print_summary(raw_output, filtered_output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/signal_filter/signal_filter_example.py b/examples/signal_filter/signal_filter_example.py
new file mode 100644
index 0000000..33bf8a5
--- /dev/null
+++ b/examples/signal_filter/signal_filter_example.py
@@ -0,0 +1,130 @@
+"""Demonstrate the built-in signal filter (no GPU / no simulation required).
+
+This script:
+  1. Builds a synthetic sensor trace that mimics typical PML drift artefacts:
+     a slow DC ramp + a 3 MHz ultrasound pulse buried in broadband noise.
+  2. Applies a high-pass filter (0.5 MHz) to remove the drift.
+  3. Applies a band-pass filter (1-5 MHz) to isolate the ultrasound band.
+  4. Plots the time traces and their amplitude spectra side-by-side.
+
+Run with:
+    uv run python examples/signal_filter/signal_filter_example.py
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from fullwave.utils.signal_filter import apply_filter
+
+
+def main() -> None:
+    """Build a synthetic sensor trace and demonstrate high-pass and band-pass filtering."""
+    # ---------------------------------------------------------------------------
+    # Simulation-like parameters
+    # ---------------------------------------------------------------------------
+    f0 = 3e6  # center frequency, Hz
+    dt = 1 / (f0 * 20)  # ~20 samples per period
+    n_t = 2048
+    t = np.arange(n_t) * dt  # time axis, seconds
+
+    # ---------------------------------------------------------------------------
+    # Synthetic signal: DC drift + 3 MHz pulse + broadband noise
+    # ---------------------------------------------------------------------------
+    rng = np.random.default_rng(42)
+
+    # Slow PML drift: linear ramp (typical artifact)
+    drift = np.linspace(0, 2.0, n_t)
+
+    # Short Gaussian-windowed 3 MHz pulse arriving at t = 3 µs
+    t_pulse = 3e-6
+    sigma = 0.5e-6
+    envelope = np.exp(-0.5 * ((t - t_pulse) / sigma) ** 2)
+    pulse = envelope * np.sin(2 * np.pi * f0 * t)
+
+    # White noise floor at -30 dB relative to pulse
+    noise = rng.standard_normal(n_t) * 0.03
+
+    raw = drift + pulse + noise  # shape [n_t]
+
+    # Wrap as [n_sensors, n_t] (one channel)
+    data = raw[np.newaxis, :]
+
+    # ---------------------------------------------------------------------------
+    # Apply filters
+    # ---------------------------------------------------------------------------
+    hp_filtered = apply_filter(data, dt, f_low_hz=0.5e6, use_gpu=False)
+    bp_filtered = apply_filter(data, dt, f_low_hz=1e6, f_high_hz=5e6, use_gpu=False)
+
+    # ---------------------------------------------------------------------------
+    # Amplitude spectrum helper
+    # ---------------------------------------------------------------------------
+    freqs_mhz = np.fft.rfftfreq(n_t, d=dt) / 1e6
+
+    def spectrum_db(sig: np.ndarray) -> np.ndarray:
+        """Return amplitude spectrum in dB (normalised to peak, floored at -100 dB)."""
+        amp = np.abs(np.fft.rfft(sig))
+        amp_norm = amp / (amp.max() + 1e-12)
+        return 20 * np.log10(np.maximum(amp_norm, 1e-5))  # floor at -100 dB
+
+    # ---------------------------------------------------------------------------
+    # Plot — 2 rows (time / spectrum) x 3 columns (raw / high-pass / band-pass)
+    # Each filtered column overlays the raw signal so before/after is clear.
+    # ---------------------------------------------------------------------------
+    t_us = t * 1e6  # µs for display
+
+    fig, axes = plt.subplots(2, 3, figsize=(14, 7))
+    fig.suptitle("Built-in signal filter — example", fontsize=13)
+
+    filters = [
+        ("Raw (drift + pulse + noise)", data[0], None, "tab:blue"),
+        ("High-pass 0.5 MHz", hp_filtered[0], data[0], "tab:orange"),
+        ("Band-pass 1-5 MHz", bp_filtered[0], data[0], "tab:green"),
+    ]
+
+    for col, (label, sig, before, color) in enumerate(filters):
+        ax_t = axes[0, col]
+        ax_f = axes[1, col]
+
+        # --- time trace ---
+        if before is not None:
+            ax_t.plot(t_us, before, color="tab:gray", lw=0.6, alpha=0.5, label="Before")
+        ax_t.plot(t_us, sig, color=color, lw=0.9, label="After" if before is not None else label)
+        ax_t.set_title(label, fontsize=10)
+        ax_t.set_xlabel("Time (µs)")
+        ax_t.set_ylabel("Amplitude")
+        ax_t.set_xlim(t_us[0], t_us[-1])
+        if before is not None:
+            ax_t.legend(fontsize=8)
+
+        # --- amplitude spectrum ---
+        if before is not None:
+            ax_f.plot(
+                freqs_mhz,
+                spectrum_db(before),
+                color="tab:gray",
+                lw=0.6,
+                alpha=0.5,
+                label="Before",
+            )
+        ax_f.plot(
+            freqs_mhz,
+            spectrum_db(sig),
+            color=color,
+            lw=0.9,
+            label="After" if before is not None else label,
+        )
+        ax_f.set_xlabel("Frequency (MHz)")
+        ax_f.set_ylabel("Amplitude (dB)")
+        ax_f.set_xlim(0, 10)
+        ax_f.set_ylim(-80, 5)
+        ax_f.axvline(f0 / 1e6, color="gray", lw=0.8, ls="--", label=f"f₀ = {f0 / 1e6:.0f} MHz")
+        ax_f.legend(fontsize=8)
+
+    plt.tight_layout()
+    out_path = "signal_filter_example.png"
+    plt.savefig(out_path, dpi=150)
+    print(f"Saved figure to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/simple_plane_wave/simple_plane_wave_sparse_grid.py b/examples/simple_plane_wave/simple_plane_wave_sparse_grid.py
new file mode 100644
index 0000000..e4e973c
--- /dev/null
+++ b/examples/simple_plane_wave/simple_plane_wave_sparse_grid.py
@@ -0,0 +1,149 @@
+"""Simple plane wave example using sparse-grid sensor output.
+
+Instead of recording every grid point, the sensor is configured with
+mod_x=4, mod_y=4 so the binary records only every 4th point in each
+spatial dimension.  This reduces output data size by a factor of 16
+compared to a full-domain sensor.
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+
+import fullwave
+from fullwave.utils import plot_utils
+from fullwave.utils.coordinates import map_to_coords
+
+
+def main() -> None:
+    """Run simple plane wave example with sparse-grid sensor."""
+    logging.getLogger("__main__").setLevel(logging.INFO)
+
+    work_dir = Path("./outputs/") / "simple_plane_wave_sparse_grid"
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    #
+    # --- grid ---
+    #
+    domain_size = (3e-2, 2e-2)  # meters
+    f0 = 3e6
+    c0 = 1540
+    duration = domain_size[0] / c0 * 2
+    grid = fullwave.Grid(
+        domain_size=domain_size,
+        f0=f0,
+        duration=duration,
+        c0=c0,
+    )
+
+    #
+    # --- medium ---
+    #
+    sound_speed_map = 1540 * np.ones((grid.nx, grid.ny))
+    density_map = 1000 * np.ones((grid.nx, grid.ny))
+    alpha_coeff_map = 0.5 * np.ones((grid.nx, grid.ny))
+    alpha_power_map = 1.0 * np.ones((grid.nx, grid.ny))
+    beta_map = np.zeros((grid.nx, grid.ny))
+
+    obj_x = slice(grid.nx // 3, 2 * grid.nx // 3)
+    obj_y = slice(grid.ny // 3, 2 * grid.ny // 3)
+    sound_speed_map[obj_x, obj_y] = 1600
+    density_map[obj_x, obj_y] = 1100
+    alpha_coeff_map[obj_x, obj_y] = 0.75
+    alpha_power_map[obj_x, obj_y] = 1.1
+
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=sound_speed_map,
+        density=density_map,
+        alpha_coeff=alpha_coeff_map,
+        alpha_power=alpha_power_map,
+        beta=beta_map,
+    )
+
+    #
+    # --- source: plane wave from the top ---
+    #
+    ncycles = 2
+    drop_off = 2
+    element_thickness_px = 3
+
+    p_mask = np.zeros((grid.nx, grid.ny), dtype=bool)
+    p_mask[:element_thickness_px, :] = True
+    p_coordinates = map_to_coords(p_mask)
+
+    p0 = np.zeros((p_mask.sum(), grid.nt))
+    for i in range(element_thickness_px):
+        p0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+            nt=grid.nt,
+            f0=f0,
+            duration=duration,
+            ncycles=ncycles,
+            drop_off=drop_off,
+            p0=1e5,
+            i_layer=i,
+            dt_for_layer_delay=grid.dt,
+            cfl_for_layer_delay=grid.cfl,
+        )
+        n_y = p_coordinates.shape[0] // element_thickness_px
+        p0[n_y * i : n_y * (i + 1), :] = p0_vec[: grid.nt]
+
+    source = fullwave.Source(p0=p0, coords=p_coordinates, grid_shape=grid.shape)
+
+    #
+    # --- sparse-grid sensor ---
+    #
+    # Record every 4th point in x (depth) and every 4th point in y (lateral).
+    # The binary generates the sensor positions automatically; no mask or
+    # coordinate array is needed.
+    mod_x = 4
+    mod_y = 4
+    sensor = fullwave.Sensor(mod_x=mod_x, mod_y=mod_y, sampling_modulus_time=7)
+
+    #
+    # --- solver ---
+    #
+    # Sparse-grid output is supported by the exponential-attenuation binary.
+    fw_solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        use_exponential_attenuation=True,
+    )
+    fw_solver.summary()
+
+    # sensor_output shape: [n_sparse_sensors, nt_recorded]
+    # n_sparse_sensors is inferred by the solver from the binary output length.
+    sensor_output = fw_solver.run()
+
+    #
+    # --- reshape and visualize ---
+    #
+    # Compute the subsampled spatial dimensions to reconstruct the 2D wavefield.
+    nx_sparse = int(np.ceil(grid.nx / mod_x))
+    ny_sparse = int(np.ceil(grid.ny / mod_y))
+    nt_recorded = sensor_output.shape[1]
+
+    # Reshape to [nt_recorded, nx_sparse, ny_sparse]
+    propagation_map = sensor_output.T.reshape(nt_recorded, nx_sparse, ny_sparse)
+
+    p_max = np.abs(propagation_map).max() / 4
+    time_step = nt_recorded // 3
+    plot_utils.plot_array(
+        propagation_map[time_step],
+        aspect=ny_sparse / nx_sparse,
+        export_path=work_dir / "wave_propagation_sparse_snapshot.png",
+        vmax=p_max,
+        vmin=-p_max,
+    )
+
+    print(f"Full grid size:    ({grid.nx}, {grid.ny})")
+    print(f"Sparse grid size:  ({nx_sparse}, {ny_sparse})  [mod_x={mod_x}, mod_y={mod_y}]")
+    print(f"Output shape:      {sensor_output.shape}  [n_sparse_sensors, nt_recorded]")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py b/examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py
new file mode 100644
index 0000000..9bfaf29
--- /dev/null
+++ b/examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py
@@ -0,0 +1,168 @@
+"""Simple 3D plane wave example using sparse-grid sensor output.
+
+Instead of recording every grid point, the sensor is configured with
+mod_x=4, mod_y=4, mod_z=4 so the binary records only every 4th point in each
+spatial dimension.  This reduces output data size by a factor of 64
+compared to a full-domain sensor.
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+
+import fullwave
+from fullwave.utils import plot_utils
+from fullwave.utils.coordinates import map_to_coords
+
+
+def main() -> None:  # noqa: PLR0915
+    """Run 3D simple plane wave example with sparse-grid sensor."""
+    logging.getLogger("__main__").setLevel(logging.INFO)
+
+    work_dir = Path("./outputs/") / "simple_plane_wave_sparse_grid_3d"
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    #
+    # --- grid ---
+    #
+    f0 = 1e6
+    c0 = 1540
+    wavelength = c0 / f0
+    domain_size = (10 * wavelength, 10 * wavelength, 10 * wavelength)  # meters
+    duration = domain_size[0] / c0 * 2
+    grid = fullwave.Grid(domain_size, f0, duration, c0=c0)
+    grid.print_info()
+
+    #
+    # --- medium ---
+    #
+    sound_speed_map = 1540 * np.ones((grid.nx, grid.ny, grid.nz))
+    density_map = 1000 * np.ones((grid.nx, grid.ny, grid.nz))
+    alpha_coeff_map = 0.5 * np.ones((grid.nx, grid.ny, grid.nz))
+    alpha_power_map = 1.0 * np.ones((grid.nx, grid.ny, grid.nz))
+    beta_map = np.zeros((grid.nx, grid.ny, grid.nz))
+
+    obj_x = slice(grid.nx // 3, 2 * grid.nx // 3)
+    obj_y = slice(grid.ny // 4, 3 * grid.ny // 4)
+    obj_z = slice(grid.nz // 3, 9 * grid.nz // 10)
+    sound_speed_map[obj_x, obj_y, obj_z] = 1600
+    density_map[obj_x, obj_y, obj_z] = 1100
+    alpha_coeff_map[obj_x, obj_y, obj_z] = 0.75
+    alpha_power_map[obj_x, obj_y, obj_z] = 1.1
+
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=sound_speed_map,
+        density=density_map,
+        alpha_coeff=alpha_coeff_map,
+        alpha_power=alpha_power_map,
+        beta=beta_map,
+    )
+    medium.print_info()
+
+    #
+    # --- source: plane wave from the top ---
+    #
+    ncycles = 2
+    drop_off = 2
+    element_thickness_px = 3
+
+    p_mask = np.zeros((grid.nx, grid.ny, grid.nz), dtype=bool)
+    p_mask[:element_thickness_px, :] = True
+    p_coordinates = map_to_coords(p_mask)
+
+    p0 = np.zeros((p_mask.sum(), grid.nt))
+    for i in range(element_thickness_px):
+        p0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+            nt=grid.nt,
+            f0=f0,
+            duration=duration,
+            ncycles=ncycles,
+            drop_off=drop_off,
+            p0=1e5,
+            i_layer=i,
+            dt_for_layer_delay=grid.dt,
+            cfl_for_layer_delay=grid.cfl,
+        )
+        n_yz = p_coordinates.shape[0] // element_thickness_px
+        p0[n_yz * i : n_yz * (i + 1), :] = p0_vec[: grid.nt]
+
+    source = fullwave.Source(
+        p0=p0,
+        coords=p_coordinates,
+        grid_shape=(grid.nx, grid.ny, grid.nz),
+    )
+
+    #
+    # --- sparse-grid sensor ---
+    #
+    # Record every 4th point in x (depth), y (lateral), and z (elevational).
+    # The binary generates the sensor positions automatically; no mask or
+    # coordinate array is needed.  mod_z is required for 3D simulations.
+    mod_x = 2
+    mod_y = 2
+    mod_z = 2
+    sensor = fullwave.Sensor(mod_x=mod_x, mod_y=mod_y, mod_z=mod_z, sampling_modulus_time=4)
+
+    #
+    # --- solver ---
+    #
+    # Sparse-grid output is supported by the exponential-attenuation binary.
+    fw_solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        use_exponential_attenuation=True,
+    )
+    fw_solver.summary()
+
+    # sensor_output shape: [n_sparse_sensors, nt_recorded]
+    # n_sparse_sensors is inferred by the solver from the binary output length.
+    sensor_output = fw_solver.run()
+
+    #
+    # --- reshape and visualize ---
+    #
+    # Compute the subsampled spatial dimensions to reconstruct the 3D wavefield.
+    nx_sparse = int(np.ceil(grid.nx / mod_x))
+    ny_sparse = int(np.ceil(grid.ny / mod_y))
+    nz_sparse = int(np.ceil(grid.nz / mod_z))
+    nt_recorded = sensor_output.shape[1]
+
+    # Reshape to [nt_recorded, nx_sparse, ny_sparse, nz_sparse]
+    propagation_map = sensor_output.T.reshape(nt_recorded, nx_sparse, ny_sparse, nz_sparse)
+
+    time_step = nt_recorded // 3
+    p_max = np.abs(propagation_map).max() / 4
+
+    # x-y slice at the middle z index
+    plot_utils.plot_array(
+        propagation_map[time_step, :, :, nz_sparse // 2],
+        aspect=ny_sparse / nx_sparse,
+        export_path=work_dir / "wave_propagation_sparse_snapshot_x-y.png",
+        vmax=p_max,
+        vmin=-p_max,
+    )
+
+    # x-z slice at the middle y index
+    plot_utils.plot_array(
+        propagation_map[time_step, :, ny_sparse // 2, :],
+        aspect=nz_sparse / nx_sparse,
+        export_path=work_dir / "wave_propagation_sparse_snapshot_x-z.png",
+        vmax=p_max,
+        vmin=-p_max,
+    )
+
+    print(f"Full grid size:    ({grid.nx}, {grid.ny}, {grid.nz})")
+    print(
+        f"Sparse grid size:  ({nx_sparse}, {ny_sparse}, {nz_sparse})"
+        f"  [mod_x={mod_x}, mod_y={mod_y}, mod_z={mod_z}]",
+    )
+    print(f"Output shape:      {sensor_output.shape}  [n_sparse_sensors, nt_recorded]")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/simple_plane_wave/simple_plane_wave_velocity_source.py b/examples/simple_plane_wave/simple_plane_wave_velocity_source.py
new file mode 100644
index 0000000..028960a
--- /dev/null
+++ b/examples/simple_plane_wave/simple_plane_wave_velocity_source.py
@@ -0,0 +1,197 @@
+"""Simple plane wave transmit example using a velocity (u0) source.
+
+A velocity source drives the particle-velocity equation directly, as opposed to
+the hard pressure source (p0) which drives the pressure equation.  For a plane
+wave propagating in the depth (x) direction the relevant component is u0.
+
+The acoustic impedance relation  p = rho · c · u  links the two formulations, so
+the velocity amplitude is scaled as
+
+    u_amp = p_amp / (ρ₀ · c₀)
+
+Everything else (grid, medium, sensor, solver) is identical to
+simple_plane_wave.py so the two examples can be run side-by-side for comparison.
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+
+import fullwave
+from fullwave.utils import plot_utils, signal_process
+from fullwave.utils.coordinates import map_to_coords
+
+
+def main() -> None:
+    """Run simple plane wave transmit example with a velocity source."""
+    logging.getLogger("__main__").setLevel(logging.INFO)
+
+    #
+    # --- working directory ---
+    #
+    work_dir = Path("./outputs/") / "simple_plane_wave_velocity_source"
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    #
+    # --- computational grid ---
+    #
+    domain_size = (3e-2, 2e-2)  # (depth, lateral) in metres
+    f0 = 3e6  # centre frequency [Hz]
+    c0 = 1540.0  # background sound speed [m/s]
+    rho0 = 1000.0  # background density [kg/m³]
+    duration = domain_size[0] / c0 * 2  # two-way travel time across depth
+    grid = fullwave.Grid(
+        domain_size=domain_size,
+        f0=f0,
+        duration=duration,
+        c0=c0,
+    )
+
+    #
+    # --- acoustic medium ---
+    #
+    sound_speed_map = c0 * np.ones((grid.nx, grid.ny))  # m/s
+    density_map = rho0 * np.ones((grid.nx, grid.ny))  # kg/m³
+    alpha_coeff_map = 0.5 * np.ones((grid.nx, grid.ny))  # dB/(MHz^y cm)
+    alpha_power_map = 1.0 * np.ones((grid.nx, grid.ny))  # power-law exponent
+    beta_map = 0.0 * np.ones((grid.nx, grid.ny))  # nonlinearity coefficient
+
+    # embed a scatterer with different acoustic properties
+    obj_x_start = grid.nx // 3
+    obj_x_end = 2 * grid.nx // 3
+    obj_y_start = grid.ny // 3
+    obj_y_end = 2 * grid.ny // 3
+
+    sound_speed_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 1600
+    density_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 1100
+    alpha_coeff_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 0.75
+    alpha_power_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 1.1
+
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=sound_speed_map,
+        density=density_map,
+        alpha_coeff=alpha_coeff_map,
+        alpha_power=alpha_power_map,
+        beta=beta_map,
+    )
+    medium.plot(export_path=work_dir / "medium.png")
+
+    #
+    # --- velocity source ---
+    #
+    # Use the u-component (depth / x direction) to drive a downward-travelling
+    # plane wave.  The source occupies the top `element_thickness_px` rows of the
+    # grid, matching the pressure-source layout in simple_plane_wave.py.
+    #
+    # Velocity amplitude is derived from the target pressure amplitude via the
+    # plane-wave impedance relation:  u_amp = p_amp / (ρ₀ · c₀)
+    #
+    p_amp = 1e5  # target pressure amplitude [Pa]
+    u_amp = p_amp / (rho0 * c0)  # corresponding velocity amplitude [m/s]
+
+    ncycles = 2
+    drop_off = 2
+    # element_thickness_px = 3
+
+    # Build the coordinate array for the velocity source layer
+    # small velocity source at the center of the domain
+
+    source_width_px_x = 2
+    source_width_px_y = 2
+    u_mask = np.zeros((grid.nx, grid.ny), dtype=bool)
+    u_mask[
+        grid.nx // 2 - source_width_px_x // 2 : grid.nx // 2 + source_width_px_x // 2,
+        grid.ny // 2 - source_width_px_y // 2 : grid.ny // 2 + source_width_px_y // 2,
+    ] = True
+
+    coords_u = map_to_coords(u_mask)  # shape [n_sources_u, 2]
+
+    # Build the u0 signal matrix [n_sources_u, nt]
+    u0 = np.zeros((coords_u.shape[0], grid.nt))
+
+    u0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+        nt=grid.nt,
+        f0=f0,
+        duration=duration,
+        ncycles=ncycles,
+        drop_off=drop_off,
+        p0=u_amp,  # amplitude in m/s
+    )
+    u0[:, :] = u0_vec
+
+    # for i_layer in range(element_thickness_px):
+    #     u0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+    #         nt=grid.nt,
+    #         f0=f0,
+    #         duration=duration,
+    #         ncycles=ncycles,
+    #         drop_off=drop_off,
+    #         p0=u_amp,  # amplitude in m/s
+    #         i_layer=i_layer,
+    #         dt_for_layer_delay=grid.dt,
+    #         cfl_for_layer_delay=grid.cfl,
+    #     )
+    #     n_y = coords_u.shape[0] // element_thickness_px
+    #     u0[n_y * i_layer : n_y * (i_layer + 1), :] = u0_vec
+
+    source = fullwave.Source(
+        grid_shape=grid.shape,
+        u0=u0,
+        coords_u=coords_u,
+    )
+
+    #
+    # --- sensor ---
+    #
+    sensor_mask = np.ones((grid.nx, grid.ny), dtype=bool)
+    sensor = fullwave.Sensor(mask=sensor_mask, sampling_modulus_time=7)
+
+    #
+    # --- solver ---
+    #
+    fw_solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        run_on_memory=False,
+        use_exponential_attenuation=True,
+        save_gpu_memory=True,
+        path_fullwave_simulation_bin=Path(
+            "/home/msode/workspace/lab_repos/fullwave-python-public/debug_solver_bin/fullwave2_2d_exponential_attenuation_multi_gpu",
+        ),
+    )
+    sensor_output = fw_solver.run()
+
+    #
+    # --- visualisation ---
+    #
+    propagation_map = signal_process.reshape_whole_sensor_to_nt_nx_ny(
+        sensor_output,
+        grid,
+    )
+    p_max_plot = np.abs(propagation_map).max().item() / 4
+    time_step = propagation_map.shape[0] // 3
+    plot_utils.plot_array(
+        propagation_map[time_step, :, :],
+        aspect=propagation_map.shape[2] / propagation_map.shape[1],
+        export_path=work_dir / "wave_propagation_snapshot.png",
+        vmax=p_max_plot,
+        vmin=-p_max_plot,
+    )
+    plot_utils.plot_wave_propagation_with_map(
+        propagation_map=propagation_map,
+        c_map=medium.sound_speed,
+        rho_map=medium.density,
+        export_name=work_dir / "wave_propagation_animation.mp4",
+        vmax=p_max_plot,
+        vmin=-p_max_plot,
+        figsize=(4, 6),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index f90a1a1..03caafd 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.3"
+    __version__ = "1.2.6-dev8"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/fullwave/beamformer/beamformer.py b/fullwave/beamformer/beamformer.py
index f068799..4e327d1 100644
--- a/fullwave/beamformer/beamformer.py
+++ b/fullwave/beamformer/beamformer.py
@@ -14,7 +14,8 @@ class Beamformer:
     For faster implementations, consider using libraries such as
     mach beamformer: https://github.com/Forest-Neurotech/mach
 
-    References:
+    References
+    ----------
     - Fullwave2 BMME890 implementation
     - https://github.com/gfpinton/fullwave_bmme890/blob/master/fullwave2_launcher_imaging_planewave.m
 
diff --git a/fullwave/medium.py b/fullwave/medium.py
index 7d37948..d1a0b43 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -1,15 +1,17 @@
 """Medium class for Fullwave."""
 
+from __future__ import annotations
+
 import logging
 from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 import matplotlib.pyplot as plt
 import numexpr as ne
 import numpy as np
-from numpy.typing import NDArray
 
 from fullwave import Grid
 from fullwave.solver.utils import initialize_relaxation_param_dict
@@ -17,8 +19,142 @@
 from fullwave.utils.coordinates import coords_to_map, map_to_coords
 from fullwave.utils.relaxation_parameters import generate_relaxation_params
 
+if TYPE_CHECKING:
+    from types import ModuleType
+
+    from numpy.typing import NDArray
+
 logger = logging.getLogger("__main__." + __name__)
 
+_CUPY_AVAILABLE: bool | None = None
+
+
+def _check_cupy() -> bool:
+    """Return True if CuPy is importable; result is cached after the first call."""
+    global _CUPY_AVAILABLE  # noqa: PLW0603
+    if _CUPY_AVAILABLE is None:
+        try:
+            import cupy  # noqa: F401, PLC0415
+
+            _CUPY_AVAILABLE = True
+        except ImportError:
+            _CUPY_AVAILABLE = False
+    return _CUPY_AVAILABLE
+
+
+def _get_array_module(*, use_gpu: bool) -> ModuleType:
+    """Return ``cupy`` when *use_gpu* is True and CuPy is available, else ``numpy``."""
+    if use_gpu and _check_cupy():
+        import cupy  # noqa: PLC0415
+
+        return cupy
+    return np
+
+
+def _cleanup_gpu_arrays(obj: object, attr_names: list[str]) -> None:
+    """Delete partially allocated GPU arrays and free CuPy memory pool."""
+    for attr in attr_names:
+        if hasattr(obj, attr):
+            delattr(obj, attr)
+    try:
+        import cupy as cp  # noqa: PLC0415
+
+        cp.get_default_memory_pool().free_all_blocks()
+    except ImportError:
+        pass
+
+
+def _upload_arrays_multi_gpu(
+    named_arrays: list[tuple[str, np.ndarray]],
+    dtype: np.dtype,
+    target_device: int = 0,
+) -> dict[str, object]:
+    """Upload numpy arrays to *target_device* using parallel PCIe via multiple GPUs.
+
+    Each array is first transferred to a different GPU (one per PCIe link),
+    then copied to *target_device* via peer-to-peer (NVLink when available).
+
+    Parameters
+    ----------
+    named_arrays
+        List of (name, numpy_array) pairs to upload.
+    dtype
+        Target CuPy dtype for the arrays.
+    target_device
+        CUDA device ID where all arrays should end up (default 0).
+
+    Returns
+    -------
+    dict mapping name -> CuPy array on *target_device*.
+
+    """
+    from concurrent.futures import as_completed  # noqa: PLC0415
+
+    import cupy as cp  # noqa: PLC0415
+
+    n_gpus = cp.cuda.runtime.getDeviceCount()
+    n_arrays = len(named_arrays)
+    n_workers = min(n_gpus, n_arrays)
+    results: dict[str, object] = {}
+
+    logger.info(
+        "Uploading %d arrays to GPU using %d devices (of %d available).",
+        n_arrays,
+        n_workers,
+        n_gpus,
+    )
+
+    def _upload_one(name: str, arr_np: np.ndarray, device_id: int) -> tuple[str, object]:
+        with cp.cuda.Device(device_id):
+            gpu_arr = cp.asarray(arr_np, dtype=dtype)
+            if device_id != target_device:
+                with cp.cuda.Device(target_device):
+                    result = cp.array(gpu_arr)
+                del gpu_arr
+                cp.get_default_memory_pool().free_all_blocks()
+                return name, result
+            return name, gpu_arr
+
+    with ThreadPoolExecutor(max_workers=n_workers) as executor:
+        futures = [
+            executor.submit(_upload_one, name, arr, i % n_gpus)
+            for i, (name, arr) in enumerate(named_arrays)
+        ]
+        for future in as_completed(futures):
+            name, result = future.result()
+            results[name] = result
+
+    return results
+
+
+def _upload_or_convert_arrays(
+    xp: object,
+    dtype: np.dtype,
+    named_arrays: list[tuple[str, np.ndarray]],
+) -> dict[str, object]:
+    """Upload arrays to GPU (multi-GPU if available) or convert with numpy/cupy.
+
+    Returns dict mapping name -> array on the target backend.
+    """
+    if xp is np:
+        return {
+            name: np.atleast_2d(np.asarray(arr)).astype(dtype, copy=False)
+            for name, arr in named_arrays
+        }
+
+    sample_arr = np.asarray(named_arrays[0][1])
+    if sample_arr.ndim >= 3 and _check_cupy():
+        import cupy as cp  # noqa: PLC0415
+
+        n_gpus = cp.cuda.runtime.getDeviceCount()
+        if n_gpus > 1:
+            np_arrays = [(name, np.atleast_2d(np.asarray(arr))) for name, arr in named_arrays]
+            return _upload_arrays_multi_gpu(np_arrays, dtype)
+
+    return {
+        name: xp.atleast_2d(xp.asarray(arr)).astype(dtype, copy=False) for name, arr in named_arrays
+    }
+
 
 @dataclass
 class MediumRelaxationMaps:
@@ -47,6 +183,7 @@ def __init__(
         use_isotropic_relaxation: bool = True,
         n_jobs: int = -1,
         dtype: type = np.float64,
+        use_gpu: bool = False,
     ) -> None:
         """Medium class for Fullwave.
 
@@ -91,6 +228,9 @@ def __init__(
         dtype : type, optional
             Data type for medium arrays. Default is np.float64.
             Use np.float32 to reduce Python-side memory usage by ~50%.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_compatible_value(
@@ -98,18 +238,50 @@ def __init__(
             [2],
             "Only n_relaxation_mechanisms=2 are supported currently.",
         )
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("MediumRelaxationMaps: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "MediumRelaxationMaps: use_gpu=True but CuPy is not available. "
+                "Falling back to CPU (numpy)."
+            )
         self.n_relaxation_mechanisms = n_relaxation_mechanisms
         self.dtype = np.dtype(dtype)
-        self.relaxation_param_dict = initialize_relaxation_param_dict(
-            n_relaxation_mechanisms=n_relaxation_mechanisms,
-            value=np.zeros_like(sound_speed, dtype=self.dtype),
-        )
-        self.grid = grid
-        self.is_3d = grid.is_3d
-
-        self.sound_speed = sound_speed
-        self.density = density
-        self.beta = beta
+        xp = self.xp
+        try:
+            self.relaxation_param_dict = initialize_relaxation_param_dict(
+                n_relaxation_mechanisms=n_relaxation_mechanisms,
+                value=xp.zeros_like(xp.asarray(sound_speed), dtype=self.dtype),
+            )
+            self.grid = grid
+            self.is_3d = grid.is_3d
+
+            self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+            self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        except Exception:
+            if xp is np:
+                raise
+            logger.warning("GPU OOM in MediumRelaxationMaps.__init__. Falling back to CPU (numpy).")
+            _cleanup_gpu_arrays(self, ["sound_speed", "density", "beta"])
+            if hasattr(self, "relaxation_param_dict"):
+                del self.relaxation_param_dict
+            import cupy as cp  # noqa: PLC0415
+
+            cp.get_default_memory_pool().free_all_blocks()
+            self.xp = np
+            xp = np
+            self.relaxation_param_dict = initialize_relaxation_param_dict(
+                n_relaxation_mechanisms=n_relaxation_mechanisms,
+                value=np.zeros_like(np.asarray(sound_speed), dtype=self.dtype),
+            )
+            self.grid = grid
+            self.is_3d = grid.is_3d
+            self.sound_speed = np.atleast_2d(np.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = np.atleast_2d(np.asarray(density)).astype(self.dtype, copy=False)
+            self.beta = np.atleast_2d(np.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
@@ -123,7 +295,6 @@ def __init__(
             self.air_coords = np.empty((0, ndim), dtype=np.int64)
 
         self.n_jobs = n_jobs
-        self.__post_init__()
 
         self._update_relaxation_param_dict(
             relaxation_param_updates=relaxation_param_dict,
@@ -135,12 +306,6 @@ def __init__(
         self.check_fields()
         logger.debug("MediumRelaxationMaps instance created.")
 
-    def __post_init__(self) -> None:
-        """Post-initialization processing for Medium."""
-        self.sound_speed = np.atleast_2d(self.sound_speed).astype(self.dtype, copy=False)
-        self.density = np.atleast_2d(self.density).astype(self.dtype, copy=False)
-        self.beta = np.atleast_2d(self.beta).astype(self.dtype, copy=False)
-
     def _update_relaxation_param_dict(
         self,
         relaxation_param_updates: dict[str, NDArray[np.float64]],
@@ -169,7 +334,10 @@ def _update_relaxation_param_dict(
         # SIMD/cache-friendly pass over contiguous arrays.
         # x1 and x2 directions are independent, so run in parallel threads
         # (numpy/numexpr release the GIL during computation).
-        def _sort_by_time_const(
+        xp = self.xp
+        use_gpu = xp is not np
+
+        def _sort_by_time_const_cpu(
             d_arrays: list[NDArray[np.float64]],
             a_arrays: list[NDArray[np.float64]],
             kappa: NDArray[np.float64],  # noqa: ARG001
@@ -184,23 +352,50 @@ def _sort_by_time_const(
                     a_arrays[i] = np.where(swap, aj, ai)
                     a_arrays[j] = np.where(swap, ai, aj)
 
-        with ThreadPoolExecutor(max_workers=2) as pool:
-            fut_x1 = pool.submit(_sort_by_time_const, d_x1, a_x1, kappa_x1)
-            fut_x2 = pool.submit(_sort_by_time_const, d_x2, a_x2, kappa_x2)
-            fut_x1.result()
-            fut_x2.result()
+        def _sort_by_time_const_gpu(
+            d_arrays: list,
+            a_arrays: list,
+            kappa: NDArray[np.float64],
+        ) -> None:
+            kappa_gpu = xp.asarray(kappa)
+            d_gpu = [xp.asarray(d) for d in d_arrays]
+            a_gpu = [xp.asarray(a) for a in a_arrays]
+            for i in range(n_nu):
+                for j in range(i + 1, n_nu):
+                    swap = d_gpu[i] / kappa_gpu + a_gpu[i] > d_gpu[j] / kappa_gpu + a_gpu[j]
+                    d_gpu[i], d_gpu[j] = (
+                        xp.where(swap, d_gpu[j], d_gpu[i]),
+                        xp.where(swap, d_gpu[i], d_gpu[j]),
+                    )
+                    a_gpu[i], a_gpu[j] = (
+                        xp.where(swap, a_gpu[j], a_gpu[i]),
+                        xp.where(swap, a_gpu[i], a_gpu[j]),
+                    )
+            for i in range(n_nu):
+                d_arrays[i] = d_gpu[i]
+                a_arrays[i] = a_gpu[i]
+
+        if use_gpu:
+            _sort_by_time_const_gpu(d_x1, a_x1, kappa_x1)
+            _sort_by_time_const_gpu(d_x2, a_x2, kappa_x2)
+        else:
+            with ThreadPoolExecutor(max_workers=2) as pool:
+                fut_x1 = pool.submit(_sort_by_time_const_cpu, d_x1, a_x1, kappa_x1)
+                fut_x2 = pool.submit(_sort_by_time_const_cpu, d_x2, a_x2, kappa_x2)
+                fut_x1.result()
+                fut_x2.result()
 
         # Write results into relaxation_param_dict
         param_dict = self.relaxation_param_dict
-        param_dict["kappa_x1"] = np.atleast_2d(kappa_x1)
-        param_dict["kappa_x2"] = np.atleast_2d(kappa_x2)
+        param_dict["kappa_x1"] = xp.atleast_2d(xp.asarray(kappa_x1))
+        param_dict["kappa_x2"] = xp.atleast_2d(xp.asarray(kappa_x2))
 
         for i in range(n_nu):
             nu = i + 1
-            param_dict[f"d_x1_nu{nu}"] = np.atleast_2d(d_x1[i])
-            param_dict[f"alpha_x1_nu{nu}"] = np.atleast_2d(a_x1[i])
-            param_dict[f"d_x2_nu{nu}"] = np.atleast_2d(d_x2[i])
-            param_dict[f"alpha_x2_nu{nu}"] = np.atleast_2d(a_x2[i])
+            param_dict[f"d_x1_nu{nu}"] = xp.atleast_2d(xp.asarray(d_x1[i]))
+            param_dict[f"alpha_x1_nu{nu}"] = xp.atleast_2d(xp.asarray(a_x1[i]))
+            param_dict[f"d_x2_nu{nu}"] = xp.atleast_2d(xp.asarray(d_x2[i]))
+            param_dict[f"alpha_x2_nu{nu}"] = xp.atleast_2d(xp.asarray(a_x2[i]))
 
         # Cache and check keys
         desired_key_set = getattr(
@@ -231,7 +426,8 @@ def check_relaxation_param_dict(
     ) -> None:
         """Check if the relaxation parameter updates have valid keys and matching shapes.
 
-        Raises:
+        Raises
+        ------
             ValueError: If the keys do not match the desired keys or
             if the shapes of the values do not match the domain shape.
 
@@ -254,10 +450,17 @@ def check_relaxation_param_dict(
                 )
                 raise ValueError(error_msg)
 
+    def _to_numpy(self, arr: NDArray) -> NDArray:
+        """Transfer array to CPU numpy. No-op if already numpy."""
+        if self.xp is not np:
+            return self.xp.asnumpy(arr)
+        return arr
+
     @property
-    def bulk_modulus(self) -> NDArray[np.float64]:
-        """Return the bulk_modulus."""
-        return np.multiply(self.sound_speed**2, self.density)
+    def bulk_modulus(self) -> NDArray:
+        """Return the bulk_modulus (stays on same device as source arrays)."""
+        xp = self.xp
+        return xp.multiply(self.sound_speed**2, self.density)
 
     @property
     def air_map(self) -> NDArray[np.int64]:
@@ -287,31 +490,38 @@ def n_air(self) -> int:
         """Return the number of air coordinates."""
         return self.air_coords.shape[0]
 
-    @staticmethod
     def _calc_a_and_b(
+        self,
         dx: float | NDArray[np.float64],
         kappa_x: float | NDArray[np.float64],
         alpha_x: float | NDArray[np.float64],
         dt: float | NDArray[np.float64],
         output_dtype: np.dtype | None = None,
     ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-        dx = np.asarray(dx, dtype=np.float64)
-        kappa_x = np.asarray(kappa_x, dtype=np.float64)
-        alpha_x = np.asarray(alpha_x, dtype=np.float64)
-        dt = np.asarray(dt, dtype=np.float64)
-
-        eps = np.finfo(np.float64).eps  # noqa: F841
-
-        # b = exp(-(dx/kappa_x + alpha_x) * dt)
-        b = ne.evaluate("exp(-(dx/kappa_x + alpha_x) * dt)")
+        xp = self.xp
+        use_gpu = xp is not np
 
-        # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
-        denom = ne.evaluate("kappa_x*(dx + kappa_x*alpha_x) + eps")  # noqa: F841
+        dx = xp.asarray(dx, dtype=xp.float64)
+        kappa_x = xp.asarray(kappa_x, dtype=xp.float64)
+        alpha_x = xp.asarray(alpha_x, dtype=xp.float64)
+        dt = xp.asarray(dt, dtype=xp.float64)
 
-        # a = dx/denom*(b - 1)
-        a = ne.evaluate("dx/denom*(b - 1)")
+        eps = xp.finfo(xp.float64).eps
 
-        if output_dtype is not None and output_dtype != np.float64:
+        if use_gpu:
+            b = xp.exp(-(dx / kappa_x + alpha_x) * dt)
+            denom = kappa_x * (dx + kappa_x * alpha_x) + eps
+            a = dx / denom * (b - 1)
+        else:
+            eps_local = eps  # noqa: F841
+            # b = exp(-(dx/kappa_x + alpha_x) * dt)
+            b = ne.evaluate("exp(-(dx/kappa_x + alpha_x) * dt)")
+            # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
+            denom = ne.evaluate("kappa_x*(dx + kappa_x*alpha_x) + eps_local")
+            # a = dx/denom*(b - 1)
+            a = ne.evaluate("dx/denom*(b - 1)")
+
+        if output_dtype is not None and output_dtype != xp.float64:
             a = a.astype(output_dtype, copy=False)
             b = b.astype(output_dtype, copy=False)
 
@@ -481,17 +691,18 @@ def plot(
             error_msg = "3D plotting is not implemented yet."
             raise NotImplementedError(error_msg)
 
+        _np = self._to_numpy
         if plot_fw2_params:
             target_map_dict: OrderedDict = OrderedDict(
                 [
-                    ("Sound speed", self.sound_speed),
-                    ("Density", self.density),
-                    ("Beta", self.beta),
+                    ("Sound speed", _np(self.sound_speed)),
+                    ("Density", _np(self.density)),
+                    ("Beta", _np(self.beta)),
                     ("Air map", self.air_map),
                 ],
             )
             for key in self.relaxation_param_dict_for_fw2:
-                target_map_dict[key] = self.relaxation_param_dict_for_fw2[key]
+                target_map_dict[key] = _np(self.relaxation_param_dict_for_fw2[key])
         else:
             relaxation_param_dict_keys = initialize_relaxation_param_dict(
                 n_relaxation_mechanisms=self.n_relaxation_mechanisms,
@@ -499,14 +710,14 @@ def plot(
 
             target_map_dict: OrderedDict = OrderedDict(
                 [
-                    ("Sound speed", self.sound_speed),
-                    ("Density", self.density),
-                    ("Beta", self.beta),
+                    ("Sound speed", _np(self.sound_speed)),
+                    ("Density", _np(self.density)),
+                    ("Beta", _np(self.beta)),
                     ("Air map", self.air_map),
                 ],
             )
             for key in relaxation_param_dict_keys:
-                target_map_dict[key] = self.relaxation_param_dict[key]
+                target_map_dict[key] = _np(self.relaxation_param_dict[key])
 
         num_plots = len(target_map_dict)
         # calculate subplot shape to make a square
@@ -551,21 +762,22 @@ def __str__(self) -> str:
             A string summarizing the Medium properties.
 
         """
+        xp = self.xp
         return (
             f"Relaxation Medium:\n"
             f"  Grid: {self.grid}\n"
             "\n"
-            f"  Sound speed: min {np.min(self.sound_speed):.2f} m/s, "
-            f"max {np.max(self.sound_speed):.2f} m/s\n"
-            f"  Density: min {np.min(self.density):.2f} kg/m^3, "
-            f"max {np.max(self.density):.2f} kg/m^3\n"
-            f"  Beta: min {np.min(self.beta):.2f}, max {np.max(self.beta):.2f}\n"
+            f"  Sound speed: min {float(xp.min(self.sound_speed)):.2f} m/s, "
+            f"max {float(xp.max(self.sound_speed)):.2f} m/s\n"
+            f"  Density: min {float(xp.min(self.density)):.2f} kg/m^3, "
+            f"max {float(xp.max(self.density)):.2f} kg/m^3\n"
+            f"  Beta: min {float(xp.min(self.beta)):.2f}, max {float(xp.max(self.beta)):.2f}\n"
             f"  Number of air coordinates: {self.n_air}\n"
             f"  Number of relaxation mechanisms: {self.n_relaxation_mechanisms}\n"
             f"  Relaxation parameters:\n"
         ) + "".join(
             [
-                f"    {key}: min {np.min(value):.4e}, max {np.max(value):.4e}\n"
+                f"    {key}: min {float(xp.min(value)):.4e}, max {float(xp.max(value)):.4e}\n"
                 for key, value in self.relaxation_param_dict.items()
             ],
         )
@@ -581,7 +793,7 @@ def __repr__(self) -> str:
         """
         return str(self)
 
-    def build(self) -> "MediumRelaxationMaps":
+    def build(self) -> MediumRelaxationMaps:
         """Build the MediumRelaxationMaps instance.
 
         It returns self for compatibility with Solver class.
@@ -622,6 +834,7 @@ def __init__(
         air_map: NDArray[np.int64] | None = None,
         air_coords: NDArray[np.int64] | None = None,
         dtype: type = np.float64,
+        use_gpu: bool = False,
     ) -> None:
         """Medium class for Fullwave.
 
@@ -652,17 +865,48 @@ def __init__(
         dtype : type, optional
             Data type for medium arrays. Default is np.float64.
             Use np.float32 to reduce Python-side memory usage by ~50%.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_instance(grid, Grid)
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("MediumExponentialAttenuation: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "MediumExponentialAttenuation: use_gpu=True but CuPy is not available. "
+                "Falling back to CPU (numpy)."
+            )
         self.grid = grid
         self.is_3d = grid.is_3d
         self.dtype = np.dtype(dtype)
 
-        self.sound_speed = sound_speed
-        self.density = density
-        self.alpha_exp = alpha_exp
-        self.beta = beta
+        xp = self.xp
+        attr_names = ["sound_speed", "density", "alpha_exp", "beta"]
+        named_inputs = [
+            ("sound_speed", sound_speed),
+            ("density", density),
+            ("alpha_exp", alpha_exp),
+            ("beta", beta),
+        ]
+        try:
+            gpu_arrays = _upload_or_convert_arrays(xp, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
+        except Exception:
+            if xp is np:
+                raise
+            logger.warning(
+                "GPU OOM in MediumExponentialAttenuation.__init__. Falling back to CPU (numpy)."
+            )
+            _cleanup_gpu_arrays(self, attr_names)
+            self.xp = np
+            gpu_arrays = _upload_or_convert_arrays(np, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
 
         if air_coords is not None:
             if air_map is not None:
@@ -675,16 +919,8 @@ def __init__(
             ndim = 3 if self.is_3d else 2
             self.air_coords = np.empty((0, ndim), dtype=np.int64)
 
-        self.__post_init__()
         self.check_fields()
 
-    def __post_init__(self) -> None:
-        """Post-initialization processing for Medium."""
-        self.sound_speed = np.atleast_2d(self.sound_speed).astype(self.dtype, copy=False)
-        self.density = np.atleast_2d(self.density).astype(self.dtype, copy=False)
-        self.alpha_exp = np.atleast_2d(self.alpha_exp).astype(self.dtype, copy=False)
-        self.beta = np.atleast_2d(self.beta).astype(self.dtype, copy=False)
-
     def check_fields(self) -> None:
         """Check if the fields have the correct shape."""
         grid_shape: tuple[int, ...]
@@ -704,6 +940,12 @@ def _error_msg(
         assert self.alpha_exp.shape == grid_shape, _error_msg(self.alpha_exp, grid_shape)
         assert self.beta.shape == grid_shape, _error_msg(self.beta, grid_shape)
 
+    def _to_numpy(self, arr: NDArray) -> NDArray:
+        """Transfer array to CPU numpy. No-op if already numpy."""
+        if self.xp is not np:
+            return self.xp.asnumpy(arr)
+        return arr
+
     @property
     def air_map(self) -> NDArray[np.int64]:
         """Returns the air map.
@@ -720,9 +962,10 @@ def air_map(self) -> NDArray[np.int64]:
         return coords_to_map(self.air_coords, grid_shape=grid_shape, is_3d=self.is_3d)
 
     @property
-    def bulk_modulus(self) -> NDArray[np.float64]:
-        """Return the bulk_modulus."""
-        return np.multiply(self.sound_speed**2, self.density)
+    def bulk_modulus(self) -> NDArray:
+        """Return the bulk_modulus (stays on same device as source arrays)."""
+        xp = self.xp
+        return xp.multiply(self.sound_speed**2, self.density)
 
     @property
     def n_coords_zero(self) -> int:
@@ -858,6 +1101,7 @@ def __init__(
         use_isotropic_relaxation: bool = True,
         n_jobs: int = -1,
         dtype: type = np.float64,
+        use_gpu: bool = False,
     ) -> None:
         """Medium class for Fullwave.
 
@@ -912,6 +1156,9 @@ def __init__(
             Use np.float32 to reduce Python-side memory usage by ~50%.
             The CUDA solver reads all data as float32, so float32 storage
             avoids redundant conversion copies.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_compatible_value(
@@ -921,15 +1168,40 @@ def __init__(
         )
         check_functions.check_instance(grid, Grid)
         check_functions.check_path_exists(path_relaxation_parameters_database)
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("Medium: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "Medium: use_gpu=True but CuPy is not available. Falling back to CPU (numpy)."
+            )
         self.grid = grid
         self.is_3d = grid.is_3d
         self.dtype = np.dtype(dtype)
 
-        self.sound_speed = sound_speed
-        self.density = density
-        self.alpha_coeff = alpha_coeff
-        self.alpha_power = alpha_power
-        self.beta = beta
+        xp = self.xp
+        attr_names = ["sound_speed", "density", "alpha_coeff", "alpha_power", "beta"]
+        named_inputs = [
+            ("sound_speed", sound_speed),
+            ("density", density),
+            ("alpha_coeff", alpha_coeff),
+            ("alpha_power", alpha_power),
+            ("beta", beta),
+        ]
+        try:
+            gpu_arrays = _upload_or_convert_arrays(xp, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
+        except Exception:
+            if xp is np:
+                raise
+            logger.warning("GPU OOM in Medium.__init__. Falling back to CPU (numpy).")
+            _cleanup_gpu_arrays(self, attr_names)
+            self.xp = np
+            gpu_arrays = _upload_or_convert_arrays(np, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
 
         if air_coords is not None:
             if air_map is not None:
@@ -957,18 +1229,9 @@ def __init__(
 
         self.attenuation_builder = attenuation_builder
         self.n_jobs = n_jobs
-        self.__post_init__()
         self.check_fields()
         logger.debug("Medium instance created.")
 
-    def __post_init__(self) -> None:
-        """Post-initialization processing for Medium."""
-        self.sound_speed = np.atleast_2d(self.sound_speed).astype(self.dtype, copy=False)
-        self.density = np.atleast_2d(self.density).astype(self.dtype, copy=False)
-        self.alpha_coeff = np.atleast_2d(self.alpha_coeff).astype(self.dtype, copy=False)
-        self.alpha_power = np.atleast_2d(self.alpha_power).astype(self.dtype, copy=False)
-        self.beta = np.atleast_2d(self.beta).astype(self.dtype, copy=False)
-
     def check_fields(self) -> None:
         """Check if the fields have the correct shape."""
         grid_shape: tuple[int, ...]
@@ -1005,10 +1268,17 @@ def air_map(self) -> NDArray[np.int64]:
             return np.zeros(grid_shape, dtype=int)
         return coords_to_map(self.air_coords, grid_shape=grid_shape, is_3d=self.is_3d)
 
+    def _to_numpy(self, arr: NDArray) -> NDArray:
+        """Transfer array to CPU numpy. No-op if already numpy."""
+        if self.xp is not np:
+            return self.xp.asnumpy(arr)
+        return arr
+
     @property
-    def bulk_modulus(self) -> NDArray[np.float64]:
-        """Return the bulk_modulus."""
-        return np.multiply(self.sound_speed**2, self.density)
+    def bulk_modulus(self) -> NDArray:
+        """Return the bulk_modulus (stays on same device as source arrays)."""
+        xp = self.xp
+        return xp.multiply(self.sound_speed**2, self.density)
 
     @property
     def n_coords_zero(self) -> int:
@@ -1034,6 +1304,7 @@ def plot(
         dpi: int = 300,
     ) -> None:
         """Plot the medium fields using matplotlib."""
+        _np = self._to_numpy
         if self.is_3d:
             plt.close("all")
             _, axes = plt.subplots(2, 6, figsize=figsize)
@@ -1041,16 +1312,16 @@ def plot(
             for ax, map_data, title in zip(
                 axes.flatten(),
                 [
-                    self.sound_speed[:, :, self.grid.nz // 2],
-                    self.sound_speed[:, self.grid.ny // 2, :],
-                    self.density[:, :, self.grid.nz // 2],
-                    self.density[:, self.grid.ny // 2, :],
-                    self.alpha_coeff[:, :, self.grid.nz // 2],
-                    self.alpha_coeff[:, self.grid.ny // 2, :],
-                    self.alpha_power[:, :, self.grid.nz // 2],
-                    self.alpha_power[:, self.grid.ny // 2, :],
-                    self.beta[:, :, self.grid.nz // 2],
-                    self.beta[:, self.grid.ny // 2, :],
+                    _np(self.sound_speed[:, :, self.grid.nz // 2]),
+                    _np(self.sound_speed[:, self.grid.ny // 2, :]),
+                    _np(self.density[:, :, self.grid.nz // 2]),
+                    _np(self.density[:, self.grid.ny // 2, :]),
+                    _np(self.alpha_coeff[:, :, self.grid.nz // 2]),
+                    _np(self.alpha_coeff[:, self.grid.ny // 2, :]),
+                    _np(self.alpha_power[:, :, self.grid.nz // 2]),
+                    _np(self.alpha_power[:, self.grid.ny // 2, :]),
+                    _np(self.beta[:, :, self.grid.nz // 2]),
+                    _np(self.beta[:, self.grid.ny // 2, :]),
                     self.air_map[:, :, self.grid.nz // 2],
                     self.air_map[:, self.grid.ny // 2, :],
                 ],
@@ -1089,11 +1360,11 @@ def plot(
             for ax, map_data, title in zip(
                 axes.flatten(),
                 [
-                    self.sound_speed,
-                    self.density,
-                    self.alpha_coeff,
-                    self.alpha_power,
-                    self.beta,
+                    _np(self.sound_speed),
+                    _np(self.density),
+                    _np(self.alpha_coeff),
+                    _np(self.alpha_power),
+                    _np(self.beta),
                     self.air_map,
                 ],
                 [
@@ -1147,11 +1418,13 @@ def build(self) -> MediumRelaxationMaps:
         it uses the relaxation parameters look up table
         to generate the relaxation parameters.
 
-        Returns:
+        Returns
+        -------
             MediumRelaxationMaps: An instance of MediumRelaxationMaps
             built from the retrieved relaxation parameters.
 
-        Raises:
+        Raises
+        ------
             ValueError: If an unknown attenuation_builder is specified.
 
         """
@@ -1173,6 +1446,8 @@ def build(self) -> MediumRelaxationMaps:
             relaxation_param_dict = {
                 k: v.astype(self.dtype, copy=False) for k, v in relaxation_param_dict.items()
             }
+        # Convert relaxation params to GPU if needed (MediumRelaxationMaps will
+        # call xp.asarray on them in __init__)
         return MediumRelaxationMaps(
             grid=self.grid,
             sound_speed=self.sound_speed,
@@ -1184,6 +1459,7 @@ def build(self) -> MediumRelaxationMaps:
             use_isotropic_relaxation=self.use_isotropic_relaxation,
             n_jobs=self.n_jobs,
             dtype=self.dtype,
+            use_gpu=self.use_gpu,
         )
 
     def _db_mhz_cm_to_a_exp(
@@ -1210,13 +1486,18 @@ def _db_mhz_cm_to_a_exp(
         f0 = self.grid.omega / (2.0 * np.pi * 1e6)  # scalar
         att_factor_dt = -self.grid.dt * 0.5 * f0 * self.grid.c0 / (1e-2 * np_factor)
 
-        # numexpr: exp(att_factor_dt * alpha_coeff)
+        xp = self.xp
+        if xp is not np:
+            # alpha_coeff may already be a CuPy array; asarray is a no-op in that case
+            alpha_gpu = xp.asarray(alpha_coeff)
+            return xp.exp(att_factor_dt * alpha_gpu)
         return ne.evaluate("exp(att * a)", local_dict={"a": alpha_coeff, "att": att_factor_dt})
 
     def build_exponential(self) -> MediumExponentialAttenuation:
         """Build MediumExponentialAttenuation from alpha and power maps.
 
-        Returns:
+        Returns
+        -------
             MediumExponentialAttenuation: An instance of MediumExponentialAttenuation
             built from the alpha and power maps.
 
@@ -1233,6 +1514,7 @@ def build_exponential(self) -> MediumExponentialAttenuation:
             beta=self.beta,
             air_coords=self.air_coords,
             dtype=self.dtype,
+            use_gpu=self.use_gpu,
         )
 
     def print_info(self) -> None:
@@ -1252,19 +1534,20 @@ def __str__(self) -> str:
             A string summarizing the Medium properties.
 
         """
+        xp = self.xp
         return (
             f"Medium: \n"
             f"  Grid: {self.grid}\n"
             "\n"
-            f"  Sound speed: min={np.min(self.sound_speed):.2f}, "
-            f"max={np.max(self.sound_speed):.2f}\n"
-            f"  Density: min={np.min(self.density):.2f}, "
-            f"max={np.max(self.density):.2f}\n"
-            f"  Alpha coeff: min={np.min(self.alpha_coeff):.2f}, "
-            f"max={np.max(self.alpha_coeff):.2f}\n"
-            f"  Alpha power: min={np.min(self.alpha_power):.2f}, "
-            f"max={np.max(self.alpha_power):.2f}\n"
-            f"  Beta: min={np.min(self.beta):.2f}, max={np.max(self.beta):.2f}\n"
+            f"  Sound speed: min={float(xp.min(self.sound_speed)):.2f}, "
+            f"max={float(xp.max(self.sound_speed)):.2f}\n"
+            f"  Density: min={float(xp.min(self.density)):.2f}, "
+            f"max={float(xp.max(self.density)):.2f}\n"
+            f"  Alpha coeff: min={float(xp.min(self.alpha_coeff)):.2f}, "
+            f"max={float(xp.max(self.alpha_coeff)):.2f}\n"
+            f"  Alpha power: min={float(xp.min(self.alpha_power)):.2f}, "
+            f"max={float(xp.max(self.alpha_power)):.2f}\n"
+            f"  Beta: min={float(xp.min(self.beta)):.2f}, max={float(xp.max(self.beta)):.2f}\n"
             f"  Number of air coords: {self.n_air}\n"
             f"  Attenuation builder: {self.attenuation_builder}\n"
         )
diff --git a/fullwave/sensor.py b/fullwave/sensor.py
index 0a9c1ad..8de59ee 100644
--- a/fullwave/sensor.py
+++ b/fullwave/sensor.py
@@ -28,6 +28,9 @@ def __init__(
         *,
         coords: NDArray[np.int64] | None = None,
         grid_shape: tuple[int, ...] | None = None,
+        mod_x: int | None = None,
+        mod_y: int | None = None,
+        mod_z: int = 0,
     ) -> None:
         """Sensor class for Fullwave.
 
@@ -36,7 +39,7 @@ def __init__(
         mask : NDArray[np.bool] | None
             Binary matrix where the pressure is recorded at each time-step
             shape: [nx, ny] for 2D, [nx, ny, nz] for 3D.
-            Mutually exclusive with coords/grid_shape.
+            Mutually exclusive with coords/grid_shape and mod_x/mod_y.
         sampling_modulus_time: int
             Sampling modulus in time. Default is 1 (record at every time step).
             Changing this value to n will record the pressure every n time steps.
@@ -46,39 +49,95 @@ def __init__(
             Must be provided together with grid_shape.
         grid_shape : tuple[int, ...] | None
             Shape of the computational grid. Required when using coords input.
+        mod_x : int | None
+            Spatial decimation stride in x (depth) for sparse-grid sensor output.
+            When provided together with mod_y, activates sparse-grid mode.
+            In this mode the solver binary generates sensor positions automatically
+            as [::mod_x, ::mod_y] (and [::mod_z] for 3D) across the domain,
+            ignoring any explicit coords/mask.  Mutually exclusive with mask/coords.
+        mod_y : int | None
+            Spatial decimation stride in y (lateral).  Must be provided with mod_x.
+        mod_z : int
+            Spatial decimation stride in z (elevational).  Only relevant in 3D simulations.
+            Providing mod_z > 0 together with mod_x and mod_y creates a 3D sparse sensor.
+            Default 0 (2D sensor, or no z-subsampling when used in a 3D run).
 
         Raises
         ------
         ValueError
             If grid_shape is not provided when using coords input.
             If both mask and coords are provided (mutually exclusive).
-            If neither mask nor coords (with grid_shape) is provided.
+            If only one of mod_x / mod_y is provided.
+            If mod_x/mod_y are mixed with mask or coords.
+            If none of the three input modes is supplied.
 
         """
-        if coords is not None:
+        if mod_x is not None or mod_y is not None:
+            # --- sparse-grid mode ---
+            if mod_x is None or mod_y is None:
+                msg = "Both mod_x and mod_y must be provided for sparse-grid sensor mode"
+                raise ValueError(msg)
+            if mask is not None or coords is not None:
+                msg = "mod_x/mod_y are mutually exclusive with mask and coords"
+                raise ValueError(msg)
+            if mod_x <= 0 or mod_y <= 0:
+                msg = "mod_x and mod_y must be positive integers"
+                raise ValueError(msg)
+            if mod_z < 0:
+                msg = "mod_z must be a non-negative integer"
+                raise ValueError(msg)
+            self.is_sparse_grid = True
+            self.mod_x = mod_x
+            self.mod_y = mod_y
+            self.mod_z = mod_z
+            self.is_3d = mod_z > 0
+            ndim = 3 if self.is_3d else 2
+            # Empty placeholder — the binary computes positions from mod values.
+            self.outcoords = np.empty((0, ndim), dtype=np.int64)
+            self.grid_shape = None
+        elif coords is not None:
             if grid_shape is None:
                 msg = "grid_shape is required when using coords input"
                 raise ValueError(msg)
             if mask is not None:
                 msg = "mask and coords are mutually exclusive"
                 raise ValueError(msg)
+            self.is_sparse_grid = False
+            self.mod_x = 0
+            self.mod_y = 0
+            self.mod_z = 0
             self.outcoords = np.atleast_2d(coords).astype(np.int64, copy=False)
             self.grid_shape = tuple(grid_shape)
+            self.is_3d = len(self.grid_shape) == 3
         elif mask is not None:
+            self.is_sparse_grid = False
+            self.mod_x = 0
+            self.mod_y = 0
+            self.mod_z = 0
             mask = np.atleast_2d(mask)
             self.grid_shape = mask.shape
             self.outcoords = map_to_coords(mask)
+            self.is_3d = len(self.grid_shape) == 3
         else:
-            msg = "Either mask or coords (with grid_shape) must be provided"
+            msg = "Either mask, coords (with grid_shape), or mod_x with mod_y must be provided"
             raise ValueError(msg)
 
         self.sampling_modulus_time = sampling_modulus_time
-        self.is_3d = len(self.grid_shape) == 3
         super().__init__()
         logger.debug("Sensor instance created.")
 
     def validate(self, grid_shape: NDArray[np.int64] | tuple) -> None:
         """Check if the sensor coordinates are consistent with the grid shape."""
+        if self.is_sparse_grid:
+            grid_shape = tuple(grid_shape) if isinstance(grid_shape, np.ndarray) else grid_shape
+            if len(grid_shape) == 3 and self.mod_z == 0:
+                msg = (
+                    "Sparse-grid sensor used in a 3D simulation but mod_z was not provided. "
+                    "Pass mod_z > 0 to Sensor (e.g. Sensor(mod_x=4, mod_y=4, mod_z=4))."
+                )
+                raise ValueError(msg)
+            logger.debug("Sparse-grid sensor validated.")
+            return
         grid_shape = tuple(grid_shape) if isinstance(grid_shape, np.ndarray) else grid_shape
         assert self.grid_shape == grid_shape, f"{self.grid_shape} != {grid_shape}"
         assert self.n_sensors > 0, "No active sensor found."
@@ -130,7 +189,8 @@ def plot(
     ) -> None:
         """Plot the transducer mask, optionally exporting and displaying the figure.
 
-        Raises:
+        Raises
+        ------
             ValueError: If the sensor is 3D because plotting is not supported.
 
         """
@@ -173,6 +233,11 @@ def __str__(self) -> str:
             Formatted string containing source information.
 
         """
+        if self.is_sparse_grid:
+            mod_str = f"mod_x={self.mod_x}, mod_y={self.mod_y}"
+            if self.is_3d:
+                mod_str += f", mod_z={self.mod_z}"
+            return f"Sensor (sparse-grid): \n  Strides: {mod_str}\n  Is 3D: {self.is_3d}\n"
         return (
             f"Sensor: \n"
             f"  Number of sensors: {self.n_sensors}\n"
diff --git a/fullwave/solver/binary_manager.py b/fullwave/solver/binary_manager.py
index 3c8b755..f3c433b 100644
--- a/fullwave/solver/binary_manager.py
+++ b/fullwave/solver/binary_manager.py
@@ -28,7 +28,7 @@
 
 # Pinned release tag for the solver binaries.
 # Update this only when new binaries are uploaded to a GitHub release.
-BINARY_RELEASE_TAG = "fullwave_bin_v1.1"
+BINARY_RELEASE_TAG = "fullwave_bin_v1.3"
 
 
 def _download_url(filename: str, tag: str) -> str:
diff --git a/fullwave/solver/cuda_utils.py b/fullwave/solver/cuda_utils.py
index e06209f..497b8ac 100644
--- a/fullwave/solver/cuda_utils.py
+++ b/fullwave/solver/cuda_utils.py
@@ -113,7 +113,8 @@ def cuda_api_call(func: Callable) -> Callable:
     Decorator for CUDA API calls
     Raises RuntimeError if the CUDA call does not return CUDA_SUCCESS.
 
-    Returns:
+    Returns
+    -------
         Callable: The wrapped function that checks CUDA API call results.
 
     """
@@ -138,7 +139,8 @@ def cuda_api_call_warn(func: Callable) -> Callable:
 
     Prints a warning message if the CUDA call does not return CUDA_SUCCESS.
 
-    Returns:
+    Returns
+    -------
         Callable: The wrapped function that checks CUDA API call results.
 
     """
@@ -230,7 +232,8 @@ def get_cuda_device_specs() -> list[dict[str, Any]]:
         'cuda_cores': int
     }
 
-    Returns:
+    Returns
+    -------
         A list of dictionaries containing specifications for each CUDA device.
 
     """
@@ -325,7 +328,8 @@ def get_cuda_architecture() -> list[dict[str, Any]]:
         'architecture': str,
     }
 
-    Returns:
+    Returns
+    -------
         A list of dictionaries containing architecture information for each CUDA device.
 
     """
@@ -363,7 +367,8 @@ def get_cuda_architecture() -> list[dict[str, Any]]:
 def retrieve_cuda_version() -> float:
     """Retrieve the CUDA driver version.
 
-    Returns:
+    Returns
+    -------
         str: CUDA version in the format "major.minor" or "unknown" if retrieval fails.
 
     """
diff --git a/fullwave/solver/input_file_writer.py b/fullwave/solver/input_file_writer.py
index a67cba3..67a4034 100644
--- a/fullwave/solver/input_file_writer.py
+++ b/fullwave/solver/input_file_writer.py
@@ -36,6 +36,8 @@ def __init__(
         use_exponential_attenuation: bool = False,
         use_isotropic_relaxation: bool = False,
         release_after_write: bool = False,
+        pml_thickness: int = 0,
+        use_gpu: bool = False,
     ) -> None:
         """Initialize the InputGeneratorBase instance.
 
@@ -74,6 +76,10 @@ def __init__(
             Whether to release the variable from memory after writing to file.
             This can help reduce memory usage when generating input files for large simulations.
             default is False.
+        pml_thickness : int, optional
+            PML boundary thickness in grid points (n_pml_layer + n_transition_layer).
+            Required by the solver binary when using sparse grid (mod_x/mod_y != 0) to determine
+            the interior domain boundaries. Also written when mod_x == 0 for future use.
 
         """
         logger.debug("Initializing InputFileWriter instance.")
@@ -81,6 +87,7 @@ def __init__(
         self._work_dir = Path(work_dir)
         self.path_fullwave_simulation_bin = path_fullwave_simulation_bin
         self.use_isotropic_relaxation = use_isotropic_relaxation
+        self.use_gpu = use_gpu
 
         if validate_input:
             check_functions.check_path_exists(self.path_fullwave_simulation_bin)
@@ -99,14 +106,40 @@ def __init__(
         self.is_3d = self.grid.is_3d
         self.use_exponential_attenuation = use_exponential_attenuation
         self.release_after_write = release_after_write
+        self.pml_thickness = pml_thickness
 
-        self._dim = int(
-            np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
-        )
+        self._precomputed_bulk_modulus: NDArray[np.float32] | None = None
+
+        if self.use_gpu:
+            try:
+                import cupy as cp  # noqa: PLC0415
+
+                n_gpus = cp.cuda.runtime.getDeviceCount()
+                if n_gpus > 1 and self.medium.sound_speed.ndim == 3:
+                    c_min_val, c_max_val = self._compute_dc_map_and_bulk_modulus_multi_gpu(
+                        n_gpus,
+                    )
+                else:
+                    c_min_val, c_max_val = self._compute_dc_map_and_bulk_modulus_single_gpu()
+                self._dim = int(round(c_max_val) - round(c_min_val))
+                self._dc_map_ready = True
+            except ImportError:
+                self._dim = int(
+                    np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
+                )
+                c_min_val = float(self.medium.sound_speed.min())
+                self._dc_map_ready = False
+        else:
+            self._dim = int(
+                np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
+            )
+            c_min_val = float(self.medium.sound_speed.min())
+            self._dc_map_ready = False
 
         self._set_d_mat()
-        self._set_d_map(self._dim, self.medium.sound_speed)
-        self._set_dc_map(self.medium.sound_speed)
+        self._set_d_map(self._dim, self.medium.sound_speed, c_min=c_min_val)
+        if not self._dc_map_ready:
+            self._set_dc_map(self.medium.sound_speed)
         logger.debug("InputFileWriter instance created.")
 
     def run(
@@ -169,6 +202,19 @@ def run(
             )
         if incoords_add is not None:
             self._queue_coords_write(simulation_dir / "icc_add.dat", incoords_add)
+        for _vel_suffix, _vel_attr in (("u", "u0"), ("v", "v0"), ("w", "w0")):
+            _signal_vel = getattr(self.source, _vel_attr, None)
+            _incoords_vel = getattr(self.source, f"incoords_{_vel_suffix}", None)
+            if _signal_vel is not None:
+                self._queue_ic_write(
+                    simulation_dir / f"icmat_{_vel_suffix}.dat",
+                    np.transpose(_signal_vel),
+                )
+            if _incoords_vel is not None:
+                self._queue_coords_write(
+                    simulation_dir / f"icc_{_vel_suffix}.dat",
+                    _incoords_vel,
+                )
         self._queue_coords_write(simulation_dir / "icc.dat", self.source.incoords)
         self._copy_simulation_bin_file(simulation_dir)
 
@@ -449,10 +495,10 @@ def _horner7(
         # (((((((a7*x + a6)*x + a5)*x + a4)*x + a3)*x + a2)*x + a1)*x + a0)
         return ((((((a7 * x + a6) * x + a5) * x + a4) * x + a3) * x + a2) * x + a1) * x + a0
 
-    def _set_d_map(self, dim: int, c_map: np.ndarray) -> None:
+    def _set_d_map(self, dim: int, c_map: np.ndarray, *, c_min: float | None = None) -> None:
         self._d_map = np.zeros((9, 2, dim + 1), dtype=np.float64)
 
-        cmin = float(np.min(c_map))  # compute once (was inside loop)
+        cmin = c_min if c_min is not None else float(np.min(c_map))
         scale = self.grid.dt / self.grid.dx  # compute once
         i = np.arange(dim + 1, dtype=np.float64)
         r = (i + cmin) * scale
@@ -663,10 +709,27 @@ def _set_dc_map(self, c_map: np.ndarray) -> None:
 
         For large 3-D maps the naive approach allocates a full float64 copy
         and makes several sequential passes.
-        This version processes the first axis in chunks using a thread pool so
-        that (a) peak memory stays bounded and (b) multiple cores share the work.
+        GPU path: uses CuPy for the entire computation in one pass.
+        CPU path: processes the first axis in chunks using a thread pool.
         """
         logger.debug("Setting dc map for stencil coefficients.")
+
+        if self.use_gpu:
+            try:
+                import cupy as cp  # noqa: PLC0415
+
+                c_gpu = cp.asarray(c_map, dtype=cp.float64)
+                c_min_rounded = float(matlab_round(float(c_gpu.min())))
+                offset = -c_min_rounded + 1
+                c_gpu += 1e-9
+                cp.rint(c_gpu, out=c_gpu)
+                c_gpu += offset
+                self._dc_map = cp.asnumpy(c_gpu.astype(cp.int32))
+                logger.debug("dc map for stencil coefficients set (GPU).")
+                return
+            except ImportError:
+                pass
+
         c_min_rounded = matlab_round(c_map.min())
         offset = float(-c_min_rounded + 1)
 
@@ -700,6 +763,127 @@ def _process_chunk(start: int) -> None:
 
     # --- batch write utils ---
 
+    def _compute_dc_map_and_bulk_modulus_single_gpu(self) -> tuple[float, float]:
+        """Compute dc_map and bulk_modulus on a single GPU.
+
+        Returns
+        -------
+        tuple[float, float]
+            (c_min_val, c_max_val) of the sound speed.
+
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        c_gpu = cp.asarray(self.medium.sound_speed, dtype=cp.float64)
+        c_min_val = float(c_gpu.min())
+        c_max_val = float(c_gpu.max())
+
+        # dc_map: rint(sound_speed + 1e-9) - rint(c_min) + 1
+        c_tmp = c_gpu.copy()
+        c_min_rounded = float(matlab_round(c_min_val))
+        offset = -c_min_rounded + 1
+        c_tmp += 1e-9
+        cp.rint(c_tmp, out=c_tmp)
+        c_tmp += offset
+        self._dc_map = cp.asnumpy(c_tmp.astype(cp.int32))
+        del c_tmp, c_gpu
+
+        # bulk_modulus: sound_speed^2 * density
+        c_f32 = cp.asarray(self.medium.sound_speed, dtype=cp.float32)
+        rho_f32 = cp.asarray(self.medium.density, dtype=cp.float32)
+        bulk = cp.multiply(c_f32 * c_f32, rho_f32)
+        self._precomputed_bulk_modulus = cp.asnumpy(bulk)
+        del c_f32, rho_f32, bulk
+        cp.get_default_memory_pool().free_all_blocks()
+
+        logger.debug("dc map and bulk modulus set (single GPU).")
+        return c_min_val, c_max_val
+
+    def _compute_dc_map_and_bulk_modulus_multi_gpu(
+        self,
+        n_gpus: int,
+    ) -> tuple[float, float]:
+        """Compute dc_map and bulk_modulus in parallel across multiple GPUs.
+
+        Each GPU processes a slice of the 3D array along axis 0.
+
+        Returns
+        -------
+        tuple[float, float]
+            (c_min_val, c_max_val) of the sound speed.
+
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed  # noqa: PLC0415
+
+        import cupy as cp  # noqa: PLC0415
+
+        sound_speed_np = self.medium.sound_speed
+        density_np = self.medium.density
+
+        # Ensure numpy for slicing
+        if not isinstance(sound_speed_np, np.ndarray):
+            sound_speed_np = cp.asnumpy(sound_speed_np)
+        if not isinstance(density_np, np.ndarray):
+            density_np = cp.asnumpy(density_np)
+
+        n_slabs = sound_speed_np.shape[0]
+        n_workers = min(n_gpus, n_slabs)
+        chunk_size = -(-n_slabs // n_workers)  # ceil division
+
+        logger.info(
+            "Computing dc_map and bulk_modulus using %d GPUs (of %d available).",
+            n_workers,
+            n_gpus,
+        )
+
+        # Phase 1+2: Each GPU computes local min/max, dc_map chunk, bulk_modulus chunk
+        dc_map_result = np.empty(sound_speed_np.shape, dtype=np.int32)
+        bulk_result = np.empty(sound_speed_np.shape, dtype=np.float32)
+        local_mins = np.empty(n_workers, dtype=np.float64)
+        local_maxs = np.empty(n_workers, dtype=np.float64)
+
+        def _process_on_device(worker_id: int, device_id: int) -> None:
+            start = worker_id * chunk_size
+            end = min(start + chunk_size, n_slabs)
+            with cp.cuda.Device(device_id):
+                c_chunk = cp.asarray(sound_speed_np[start:end], dtype=cp.float64)
+                local_mins[worker_id] = float(c_chunk.min())
+                local_maxs[worker_id] = float(c_chunk.max())
+
+                # dc_map for this chunk (offset applied after global min is known)
+                c_chunk += 1e-9
+                cp.rint(c_chunk, out=c_chunk)
+                dc_chunk_f64 = c_chunk  # reuse buffer
+                dc_map_result[start:end] = cp.asnumpy(dc_chunk_f64.astype(cp.int32))
+                del dc_chunk_f64
+
+                # bulk_modulus for this chunk
+                c_f32 = cp.asarray(sound_speed_np[start:end], dtype=cp.float32)
+                rho_f32 = cp.asarray(density_np[start:end], dtype=cp.float32)
+                bulk_chunk = cp.multiply(c_f32 * c_f32, rho_f32)
+                bulk_result[start:end] = cp.asnumpy(bulk_chunk)
+                del c_f32, rho_f32, bulk_chunk
+                cp.get_default_memory_pool().free_all_blocks()
+
+        with ThreadPoolExecutor(max_workers=n_workers) as executor:
+            futures = [executor.submit(_process_on_device, i, i % n_gpus) for i in range(n_workers)]
+            for future in as_completed(futures):
+                future.result()
+
+        c_min_val = float(local_mins.min())
+        c_max_val = float(local_maxs.max())
+
+        # Apply global offset to dc_map
+        c_min_rounded = int(matlab_round(c_min_val))
+        offset = np.int32(-c_min_rounded + 1)
+        dc_map_result += offset
+
+        self._dc_map = dc_map_result
+        self._precomputed_bulk_modulus = bulk_result
+
+        logger.debug("dc map and bulk modulus set (multi-GPU, %d devices).", n_workers)
+        return c_min_val, c_max_val
+
     def _init_pending_writes(self) -> None:
         """Initialize the pending writes list for batch I/O."""
         self._pending_writes: list[tuple] = []
@@ -804,10 +988,15 @@ def _save_variables_into_dat_file(
         relaxation_param_map_dict_for_fw2: dict[str, NDArray[np.float64]],
         dim: int,
     ) -> None:
+        k_map = (
+            self._precomputed_bulk_modulus
+            if self._precomputed_bulk_modulus is not None
+            else self.medium.bulk_modulus
+        )
         self._save_maps(
             simulation_dir,
             c_map=self.medium.sound_speed,
-            k_map=self.medium.bulk_modulus,
+            k_map=k_map,
             rho_map=self.medium.density,
             beta_map=self.medium.beta,
         )
@@ -815,6 +1004,7 @@ def _save_variables_into_dat_file(
         self._save_step_params(simulation_dir)
         self._save_coords_params(simulation_dir)
         self._save_d_params(simulation_dir, dim)
+        self._save_sparse_grid_params(simulation_dir)
 
         if self.use_isotropic_relaxation:
             rename_dict = {
@@ -869,10 +1059,15 @@ def _save_variables_into_dat_file_exponential_attenuation(
         simulation_dir: Path,
         dim: int,
     ) -> None:
+        k_map = (
+            self._precomputed_bulk_modulus
+            if self._precomputed_bulk_modulus is not None
+            else self.medium.bulk_modulus
+        )
         self._save_maps(
             simulation_dir,
             c_map=self.medium.sound_speed,
-            k_map=self.medium.bulk_modulus,
+            k_map=k_map,
             rho_map=self.medium.density,
             beta_map=self.medium.beta,
             alpha_exp_map=self.medium.alpha_exp,
@@ -881,6 +1076,7 @@ def _save_variables_into_dat_file_exponential_attenuation(
         self._save_step_params(simulation_dir)
         self._save_coords_params(simulation_dir)
         self._save_d_params(simulation_dir, dim)
+        self._save_sparse_grid_params(simulation_dir)
 
     def _build_symbolic_links_for_dat_files(self, src_dir: Path, dst_dir: Path) -> None:
         var_name_list = [
@@ -903,6 +1099,9 @@ def _build_symbolic_links_for_dat_files(self, src_dir: Path, dst_dir: Path) -> N
             "ncoordszero",
             "nTic",
             "modT",
+            "modX",
+            "modY",
+            "pml_thickness",
             "d",
             "dmap",
             "ndmap",
@@ -935,6 +1134,8 @@ def _build_symbolic_links_for_dat_files(self, src_dir: Path, dst_dir: Path) -> N
                     "bpmly2",
                 ],
             )
+        if self.is_3d:
+            var_name_list.append("modZ")
         if self.is_3d and not self.use_isotropic_relaxation:
             var_name_list.extend(
                 [
@@ -1047,6 +1248,10 @@ def _save_coords_params(self, simulation_dir: Path) -> None:
         n_sources_add = getattr(self.source, "n_sources_add", 0)
         if n_sources_add > 0:
             var_list.append(("ncoords_add", n_sources_add))
+        for _vel_suffix in ("u", "v", "w"):
+            _n_vel = getattr(self.source, f"n_sources_{_vel_suffix}", 0)
+            if _n_vel > 0:
+                var_list.append((f"ncoords_{_vel_suffix}", _n_vel))
         if self.is_3d:
             var_list.extend(
                 [
@@ -1057,6 +1262,22 @@ def _save_coords_params(self, simulation_dir: Path) -> None:
             save_path = simulation_dir / f"{var_name}.dat"
             self._queue_v_abs_write(np.int32, save_path, var)
 
+    def _save_sparse_grid_params(self, simulation_dir: Path) -> None:
+        """Write sparse-grid parameters when the sensor is in sparse-grid mode.
+
+        modX / modY / modZ and pml_thickness are only written when the sensor
+        was constructed with mod_x/mod_y (i.e. sensor.is_sparse_grid is True).
+        In standard coordinate/mask mode these files are not produced, preserving
+        backward compatibility with older binaries that do not expect them.
+        """
+        if not self.sensor.is_sparse_grid:
+            return
+        self._queue_v_abs_write(np.int32, simulation_dir / "modX.dat", self.sensor.mod_x)
+        self._queue_v_abs_write(np.int32, simulation_dir / "modY.dat", self.sensor.mod_y)
+        if self.is_3d:
+            self._queue_v_abs_write(np.int32, simulation_dir / "modZ.dat", self.sensor.mod_z)
+        self._queue_v_abs_write(np.int32, simulation_dir / "pml_thickness.dat", self.pml_thickness)
+
     def _save_d_params(
         self,
         simulation_dir: Path,
@@ -1151,6 +1372,13 @@ def _write_matrix(
 
         dtype = np.dtype(var_type)
 
+        # Transfer CuPy arrays to CPU for file writing
+        if not isinstance(variable_mat, np.ndarray):
+            try:
+                variable_mat = variable_mat.get()  # CuPy → numpy
+            except AttributeError:
+                variable_mat = np.asarray(variable_mat)
+
         # Fast path: no conversion, no reorder.
         if variable_mat.dtype == dtype and variable_mat.flags.c_contiguous:
             variable_mat.tofile(save_path)  # writes in C order
diff --git a/fullwave/solver/launcher.py b/fullwave/solver/launcher.py
index 80da170..06a6dcb 100644
--- a/fullwave/solver/launcher.py
+++ b/fullwave/solver/launcher.py
@@ -31,6 +31,7 @@ def __init__(
         use_gpu: bool = True,
         cuda_device_id: str | int | list | None = None,
         save_gpu_memory: bool = False,
+        verify_gpu: bool = True,
     ) -> None:
         """Initialize a FullwaveLauncher instance.
 
@@ -62,6 +63,11 @@ def __init__(
             depending on the hardware and the simulation settings.
             useful in 3D simulations with large grid sizes
             where GPU memory is a limiting factor.
+        verify_gpu : bool, optional
+            Whether to verify that the specified CUDA devices exist on the system.
+            Defaults to True. Set to False when generating input files only
+            (``generate_input_only=True``) on a machine that may not have
+            the target GPUs available.
 
         """
         self._path_fullwave_simulation_bin = path_fullwave_simulation_bin
@@ -69,7 +75,10 @@ def __init__(
         assert self._path_fullwave_simulation_bin.exists(), error_msg
         self.is_3d = is_3d
         self.use_gpu = use_gpu
-        self.cuda_device_id = self._configure_cuda_device_id(cuda_device_id)
+        self.cuda_device_id = self._configure_cuda_device_id(
+            cuda_device_id,
+            verify_gpu=verify_gpu,
+        )
         self.save_gpu_memory = save_gpu_memory
         logger.debug("Launcher instance created.")
 
@@ -152,13 +161,20 @@ def _verify_cuda_devices_exist(device_id_str: str) -> None:
                 raise ValueError(message)
 
     @staticmethod
-    def _configure_cuda_device_id(cuda_device_id: str | int | list | None) -> str:
+    def _configure_cuda_device_id(
+        cuda_device_id: str | int | list | None,
+        *,
+        verify_gpu: bool = True,
+    ) -> str:
         """Verify and assign the CUDA device ID.
 
         Parameters
         ----------
         cuda_device_id : str | int | None
             The CUDA device ID to verify and assign.
+        verify_gpu : bool, optional
+            Whether to verify that the specified CUDA devices exist.
+            Defaults to True.
 
         Returns
         -------
@@ -167,7 +183,8 @@ def _configure_cuda_device_id(cuda_device_id: str | int | list | None) -> str:
 
         """
         output = Launcher._parse_cuda_device_id(cuda_device_id)
-        Launcher._verify_cuda_devices_exist(output)
+        if verify_gpu:
+            Launcher._verify_cuda_devices_exist(output)
         return output
 
     def run(
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index bf8efc7..4466a24 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -1,46 +1,96 @@
 """Perfectly Matched Layer (PML) setup for Fullwave."""
 
+from __future__ import annotations
+
 import concurrent.futures
 import logging
 from collections import OrderedDict
 from dataclasses import dataclass, field
 from functools import cached_property
-from itertools import starmap
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 import matplotlib.pyplot as plt
 import numexpr as ne
 import numpy as np
-from joblib import Parallel, delayed
-from numpy.typing import NDArray
 
 import fullwave
+
+if TYPE_CHECKING:
+    from types import ModuleType
+
+    from numpy.typing import NDArray
 from fullwave.solver.utils import initialize_relaxation_param_dict
 from fullwave.utils import check_functions, plot_utils
 
 logger = logging.getLogger("__main__." + __name__)
 
+_CUPY_AVAILABLE: bool | None = None
+
+
+def _check_cupy() -> bool:
+    """Return True if CuPy is importable; result is cached after the first call."""
+    global _CUPY_AVAILABLE  # noqa: PLW0603
+    if _CUPY_AVAILABLE is None:
+        try:
+            import cupy  # noqa: F401, PLC0415
+
+            _CUPY_AVAILABLE = True
+        except ImportError:
+            _CUPY_AVAILABLE = False
+    return _CUPY_AVAILABLE
+
+
+def _get_array_module(*, use_gpu: bool) -> ModuleType:
+    """Return ``cupy`` when *use_gpu* is True and CuPy is available, else ``numpy``."""
+    if use_gpu and _check_cupy():
+        import cupy  # noqa: PLC0415
 
-def _smooth_transition_function_part(x: NDArray[np.float64]) -> NDArray[np.float64]:
-    return np.where(x > 0, np.exp(-1 / (x + 1e-20)), 0)
+        return cupy
+    return np
 
 
-def _smooth_transition_function(x: NDArray[np.float64]) -> NDArray[np.float64]:
-    return _smooth_transition_function_part(x) / (
-        _smooth_transition_function_part(x) + _smooth_transition_function_part(1 - x)
+def _smooth_transition_function_part(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,
+) -> NDArray[np.float64]:
+    """Smooth bump helper (works with numpy or cupy)."""
+    return xp.where(x > 0, xp.exp(-1 / (x + 1e-20)), 0)
+
+
+def _smooth_transition_function(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,
+) -> NDArray[np.float64]:
+    """Smooth transition function (works with numpy or cupy)."""
+    return _smooth_transition_function_part(x, xp=xp) / (
+        _smooth_transition_function_part(x, xp=xp) + _smooth_transition_function_part(1 - x, xp=xp)
     )
 
 
-def _linear_transition_function(x: NDArray[np.float64]) -> NDArray[np.float64]:
+def _linear_transition_function(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,  # noqa: ARG001
+) -> NDArray[np.float64]:
+    """Linear transition function."""
     return x
 
 
-def _n_th_deg_polynomial_function(x: NDArray[np.float64], n: int = 2) -> NDArray[np.float64]:
+def _n_th_deg_polynomial_function(
+    x: NDArray[np.float64],
+    n: int = 2,
+    xp: ModuleType = np,  # noqa: ARG001
+) -> NDArray[np.float64]:
+    """N-th degree polynomial transition function."""
     return x**n
 
 
-def _cosine_transition_function(x: NDArray[np.float64]) -> NDArray[np.float64]:
-    return 0.5 * (1 - np.cos(np.pi * x))
+def _cosine_transition_function(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,
+) -> NDArray[np.float64]:
+    """Cosine transition function (works with numpy or cupy)."""
+    return 0.5 * (1 - xp.cos(xp.pi * x))
 
 
 def _obtain_relax_var_rename_dict(
@@ -115,7 +165,7 @@ class PMLBuilder:
     pml_mask_x: NDArray[np.float64] = field(init=False)
     pml_mask_y: NDArray[np.float64] = field(init=False)
 
-    def __init__(  # noqa: PLR0915
+    def __init__(  # noqa: PLR0912
         self,
         grid: fullwave.Grid,
         medium: fullwave.Medium,
@@ -126,6 +176,7 @@ def __init__(  # noqa: PLR0915
         n_pml_layer: int = 40,
         n_transition_layer: int = 40,
         use_isotropic_relaxation: bool = False,
+        use_gpu: bool = False,
         # pml_alpha_target: float = 1.1,
         # pml_alpha_power_target: float = 1.6,
         # pml_strength_factor: float = 2.0,
@@ -160,6 +211,9 @@ def __init__(  # noqa: PLR0915
             This option omits the anisotropic relaxation mechanisms to model the attenuation.
             We usually recommend using isotropic relaxation mechanisms
             unless the anisotropic attenuation is required for the simulation.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated PML computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_instance(
@@ -186,6 +240,15 @@ def __init__(  # noqa: PLR0915
         self.is_3d = grid.is_3d
         self.use_isotropic_relaxation = use_isotropic_relaxation
 
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("PMLBuilder: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "PMLBuilder: use_gpu=True but CuPy is not available. Falling back to CPU (numpy)."
+            )
+
         self.m_spatial_order = m_spatial_order
         self.n_pml_layer = n_pml_layer
         self.n_transition_layer = n_transition_layer
@@ -221,95 +284,72 @@ def __init__(  # noqa: PLR0915
 
         logger.debug("building extended medium for pml...")
         if isinstance(self.medium_org, fullwave.MediumRelaxationMaps):
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future_sound_speed = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.sound_speed,
-                )
-                future_density = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.density,
-                )
-                future_beta = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.beta,
-                )
-                future_alpha_coeff = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_coeff,
-                )
-                future_alpha_power = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_power,
-                )
-                future_relaxation_param_dict = {
-                    key: executor.submit(self._extend_map_for_pml, value)
-                    for key, value in self.medium_org.relaxation_param_dict.items()
-                }
-
-                extended_sound_speed = future_sound_speed.result()
-                extended_density = future_density.result()
-                extended_beta = future_beta.result()
-                extended_alpha_coeff = future_alpha_coeff.result()
-                extended_alpha_power = future_alpha_power.result()
-                extended_relaxation_param_dict = {
-                    key: future.result() for key, future in future_relaxation_param_dict.items()
-                }
+            base_attrs = ["sound_speed", "density", "beta"]
+            relax_attrs = list(self.medium_org.relaxation_param_dict.keys())
+
+            if self.xp is not np:
+                # Pass CuPy arrays directly — multi-GPU extension uses D2D copy (NVLink)
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in base_attrs]
+                named_arrays += [
+                    (key, self.medium_org.relaxation_param_dict[key]) for key in relax_attrs
+                ]
+                extended = self._extend_arrays_gpu(named_arrays)
+                # Free original GPU arrays to reclaim memory
+                self._ensure_numpy_medium_arrays(base_attrs)
+                for key in relax_attrs:
+                    import cupy as cp  # noqa: PLC0415
+
+                    val = self.medium_org.relaxation_param_dict[key]
+                    if not isinstance(val, np.ndarray):
+                        self.medium_org.relaxation_param_dict[key] = cp.asnumpy(val)
+                cp.get_default_memory_pool().free_all_blocks()
+            else:
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in base_attrs] + [
+                    (key, self.medium_org.relaxation_param_dict[key]) for key in relax_attrs
+                ]
+                extended = self._extend_arrays_cpu(named_arrays)
 
+            extended_relaxation_param_dict = {key: extended[key] for key in relax_attrs}
+            # Extended arrays are numpy — skip re-upload to GPU to avoid
+            # wasting PCIe bandwidth. CPU numexpr handles subsequent computation.
             self.extended_medium = fullwave.MediumRelaxationMaps(
                 grid=self.extended_grid,
-                sound_speed=extended_sound_speed,
-                density=extended_density,
-                beta=extended_beta,
-                alpha_coeff=extended_alpha_coeff,
-                alpha_power=extended_alpha_power,
+                sound_speed=extended["sound_speed"],
+                density=extended["density"],
+                beta=extended["beta"],
                 relaxation_param_dict=extended_relaxation_param_dict,
                 air_coords=self.medium_org.air_coords + self.num_boundary_points,
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
                 n_jobs=self.medium_org.n_jobs,
                 dtype=getattr(self.medium_org, "dtype", np.float64),
+                use_gpu=False,
             )
         else:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future_sound_speed = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.sound_speed,
-                )
-                future_density = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.density,
-                )
-                future_beta = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.beta,
-                )
-                future_alpha_coeff = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_coeff,
-                )
-                future_alpha_power = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_power,
-                )
+            attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
+            if self.xp is not np:
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
+                extended = self._extend_arrays_gpu(named_arrays)
+                self._ensure_numpy_medium_arrays(attr_names)
+            else:
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
+                extended = self._extend_arrays_cpu(named_arrays)
 
-                extended_sound_speed = future_sound_speed.result()
-                extended_density = future_density.result()
-                extended_beta = future_beta.result()
-                extended_alpha_coeff = future_alpha_coeff.result()
-                extended_alpha_power = future_alpha_power.result()
+            # Extended arrays are numpy — skip re-upload to GPU to avoid
+            # wasting PCIe bandwidth. CPU numexpr handles subsequent computation.
             self.extended_medium = fullwave.Medium(
                 grid=self.extended_grid,
-                sound_speed=extended_sound_speed,
-                density=extended_density,
-                beta=extended_beta,
-                alpha_coeff=extended_alpha_coeff,
-                alpha_power=extended_alpha_power,
+                sound_speed=extended["sound_speed"],
+                density=extended["density"],
+                beta=extended["beta"],
+                alpha_coeff=extended["alpha_coeff"],
+                alpha_power=extended["alpha_power"],
                 air_coords=self.medium_org.air_coords + self.num_boundary_points,
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
                 path_relaxation_parameters_database=self.medium_org.path_relaxation_parameters_database,
                 attenuation_builder=self.medium_org.attenuation_builder,
                 n_jobs=self.medium_org.n_jobs,
                 dtype=getattr(self.medium_org, "dtype", np.float64),
+                use_gpu=False,
             )
         logger.debug("building extended medium for pml...done")
 
@@ -322,24 +362,55 @@ def __init__(  # noqa: PLR0915
             if getattr(self.source_org, "incoords_add", None) is not None
             else None
         )
+        incoords_u_ext = (
+            self.source_org.incoords_u + self.num_boundary_points
+            if getattr(self.source_org, "incoords_u", None) is not None
+            else None
+        )
+        incoords_v_ext = (
+            self.source_org.incoords_v + self.num_boundary_points
+            if getattr(self.source_org, "incoords_v", None) is not None
+            else None
+        )
+        incoords_w_ext = (
+            self.source_org.incoords_w + self.num_boundary_points
+            if getattr(self.source_org, "incoords_w", None) is not None
+            else None
+        )
         self.extended_source = fullwave.Source(
             p0=self.source_org.p0,
             coords=self.source_org.incoords + self.num_boundary_points,
             grid_shape=extended_grid_shape,
             p0_additive=self.source_org.p0_additive,
             coords_additive=incoords_add_ext,
+            u0=getattr(self.source_org, "u0", None),
+            coords_u=incoords_u_ext,
+            v0=getattr(self.source_org, "v0", None),
+            coords_v=incoords_v_ext,
+            w0=getattr(self.source_org, "w0", None),
+            coords_w=incoords_w_ext,
         )
         logger.debug("building extended source for pml...done")
 
         logger.debug("building extended sensor for pml...")
-        extended_sensor_grid_shape = tuple(
-            s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
-        )
-        self.extended_sensor = fullwave.Sensor(
-            coords=self.sensor_org.outcoords + self.num_boundary_points,
-            grid_shape=extended_sensor_grid_shape,
-            sampling_modulus_time=self.sensor_org.sampling_modulus_time,
-        )
+        if self.sensor_org.is_sparse_grid:
+            # Sparse-grid sensor: no explicit coordinates to shift.
+            # Pass mod values through; the binary generates positions at run time.
+            self.extended_sensor = fullwave.Sensor(
+                mod_x=self.sensor_org.mod_x,
+                mod_y=self.sensor_org.mod_y,
+                mod_z=self.sensor_org.mod_z,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
+        else:
+            extended_sensor_grid_shape = tuple(
+                s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
+            )
+            self.extended_sensor = fullwave.Sensor(
+                coords=self.sensor_org.outcoords + self.num_boundary_points,
+                grid_shape=extended_sensor_grid_shape,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
         logger.debug("building extended sensor for pml...done")
         if self.is_3d:
             self.pml_mask_x, self.pml_mask_y, self.pml_mask_z = self._localize_pml_region()
@@ -427,22 +498,30 @@ def _extend_map_for_pml(
         *,
         fill_edge: bool = True,
     ) -> NDArray[np.float64 | np.int64 | np.bool_]:
-        """Fast version using pre-allocation and direct assignment instead of np.pad."""
+        """Fast version using pre-allocation and direct assignment instead of np.pad.
+
+        When ``self.use_gpu`` is True and CuPy is available, the computation
+        runs on the GPU and the result is returned as a numpy array.
+        """
+        xp = self.xp
         pad = self.num_boundary_points
 
+        # Ensure array is on the correct device (no-op if already there)
+        input_gpu = xp.asarray(input_map)
+
         # Pre-allocate output array with correct dtype
         if self.is_3d:
-            nx, ny, nz = input_map.shape
-            output = np.empty((nx + 2 * pad, ny + 2 * pad, nz + 2 * pad), dtype=input_map.dtype)
+            nx, ny, nz = input_gpu.shape
+            output = xp.empty((nx + 2 * pad, ny + 2 * pad, nz + 2 * pad), dtype=input_gpu.dtype)
 
             # Fill center with original data (single copy)
-            output[pad : pad + nx, pad : pad + ny, pad : pad + nz] = input_map
+            output[pad : pad + nx, pad : pad + ny, pad : pad + nz] = input_gpu
 
             if fill_edge:
                 # Fill edges efficiently using broadcasting
                 # X boundaries
-                output[:pad, pad : pad + ny, pad : pad + nz] = input_map[0:1, :, :]
-                output[pad + nx :, pad : pad + ny, pad : pad + nz] = input_map[-1:, :, :]
+                output[:pad, pad : pad + ny, pad : pad + nz] = input_gpu[0:1, :, :]
+                output[pad + nx :, pad : pad + ny, pad : pad + nz] = input_gpu[-1:, :, :]
 
                 # Y boundaries (now includes X corners)
                 output[:, :pad, pad : pad + nz] = output[:, pad : pad + 1, pad : pad + nz]
@@ -464,16 +543,16 @@ def _extend_map_for_pml(
                 output[:, :, :pad] = 0
                 output[:, :, pad + nz :] = 0
         else:  # 2D case
-            nx, ny = input_map.shape
-            output = np.empty((nx + 2 * pad, ny + 2 * pad), dtype=input_map.dtype)
+            nx, ny = input_gpu.shape
+            output = xp.empty((nx + 2 * pad, ny + 2 * pad), dtype=input_gpu.dtype)
 
             # Fill center
-            output[pad : pad + nx, pad : pad + ny] = input_map
+            output[pad : pad + nx, pad : pad + ny] = input_gpu
 
             if fill_edge:
                 # Fill edges
-                output[:pad, pad : pad + ny] = input_map[0:1, :]
-                output[pad + nx :, pad : pad + ny] = input_map[-1:, :]
+                output[:pad, pad : pad + ny] = input_gpu[0:1, :]
+                output[pad + nx :, pad : pad + ny] = input_gpu[-1:, :]
                 output[:, :pad] = output[:, pad : pad + 1]
                 output[:, pad + ny :] = output[:, pad + ny - 1 : pad + ny]
             else:
@@ -484,6 +563,146 @@ def _extend_map_for_pml(
 
         return output
 
+    def _extend_arrays_gpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+    ) -> dict[str, NDArray]:
+        """Extend medium arrays using multi-GPU, single-GPU, or CPU fallback.
+
+        Parameters
+        ----------
+        named_arrays : list of (name, numpy_array) pairs
+            Arrays to extend. Must be numpy arrays (CPU).
+
+        Returns
+        -------
+        dict[str, NDArray]
+            Extended arrays as numpy arrays.
+
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        n_gpus = cp.cuda.runtime.getDeviceCount()
+        logger.info("CUDA devices available: %d", n_gpus)
+
+        # Strategy 1: Multi-GPU parallel
+        if n_gpus > 1:
+            n_workers = min(n_gpus, len(named_arrays))
+            try:
+                logger.info(
+                    "Extending %d medium arrays using %d GPUs (of %d available).",
+                    len(named_arrays),
+                    n_workers,
+                    n_gpus,
+                )
+                return self._extend_arrays_multi_gpu(named_arrays, n_gpus)
+            except Exception:
+                logger.warning("Multi-GPU extension failed. Falling back to sequential single-GPU.")
+
+        # Strategy 2: Sequential single-GPU (extend one, free, repeat)
+        try:
+            logger.info("Extending %d medium arrays sequentially on GPU 0.", len(named_arrays))
+            return self._extend_arrays_sequential_gpu(named_arrays)
+        except Exception:
+            logger.warning("Single-GPU extension failed. Falling back to CPU.")
+
+        # Strategy 3: CPU
+        logger.info("Extending %d medium arrays on CPU.", len(named_arrays))
+        return self._extend_arrays_cpu(named_arrays)
+
+    def _extend_arrays_multi_gpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+        n_gpus: int,
+    ) -> dict[str, NDArray]:
+        """Extend arrays in parallel, each on a different GPU.
+
+        Each thread sets its own CUDA device, transfers data, extends,
+        and returns the result as a numpy array.
+        Accepts both numpy and CuPy input arrays. CuPy arrays are copied
+        device-to-device via NVLink when available (faster than PCIe).
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        def extend_on_device(
+            args: tuple[str, NDArray, int],
+        ) -> tuple[str, NDArray]:
+            name, arr, device_id = args
+            with cp.cuda.Device(device_id):
+                # cp.asarray handles both numpy (H2D) and CuPy from
+                # another device (D2D via NVLink when available)
+                arr_local = cp.asarray(arr)
+                result_gpu = self._extend_map_for_pml(arr_local)
+                result_np = cp.asnumpy(result_gpu)
+                del arr_local, result_gpu
+                cp.get_default_memory_pool().free_all_blocks()
+                return name, result_np
+
+        items = [(name, arr, i % n_gpus) for i, (name, arr) in enumerate(named_arrays)]
+
+        results = {}
+        with concurrent.futures.ThreadPoolExecutor(max_workers=min(n_gpus, len(items))) as executor:
+            futures = [executor.submit(extend_on_device, item) for item in items]
+            for future in concurrent.futures.as_completed(futures):
+                name, result = future.result()
+                results[name] = result
+        return results
+
+    def _extend_arrays_sequential_gpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+    ) -> dict[str, NDArray]:
+        """Extend arrays one at a time on GPU, freeing after each."""
+        import cupy as cp  # noqa: PLC0415
+
+        results = {}
+        pool = cp.get_default_memory_pool()
+        for name, arr_np in named_arrays:
+            result_gpu = self._extend_map_for_pml(arr_np)
+            results[name] = cp.asnumpy(result_gpu)
+            del result_gpu
+            pool.free_all_blocks()
+        return results
+
+    def _extend_arrays_cpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+    ) -> dict[str, NDArray]:
+        """Extend arrays on CPU using ThreadPoolExecutor."""
+        orig_xp = self.xp
+        self.xp = np
+        try:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = {
+                    name: executor.submit(self._extend_map_for_pml, arr)
+                    for name, arr in named_arrays
+                }
+                return {name: future.result() for name, future in futures.items()}
+        finally:
+            self.xp = orig_xp
+
+    def _ensure_numpy_medium_arrays(
+        self,
+        attr_names: list[str],
+    ) -> list[tuple[str, NDArray]]:
+        """Convert medium arrays to numpy and free GPU memory.
+
+        Returns list of (name, numpy_array) pairs.
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        named_arrays = []
+        for name in attr_names:
+            arr = getattr(self.medium_org, name)
+            if not isinstance(arr, np.ndarray):
+                arr_np = cp.asnumpy(arr)
+                setattr(self.medium_org, name, arr_np)
+            else:
+                arr_np = arr
+            named_arrays.append((name, arr_np))
+        cp.get_default_memory_pool().free_all_blocks()
+        return named_arrays
+
     def _localize_pml_region(self) -> tuple[NDArray[np.float64], ...]:
         if self.is_3d:
             n_x_extended, n_y_extended, n_z_extended = self.extended_medium.sound_speed.shape
@@ -558,31 +777,38 @@ def _localize_pml_region(self) -> tuple[NDArray[np.float64], ...]:
 
         return pml_mask_x, pml_mask_y
 
-    @staticmethod
     def _calc_a_and_b(
+        self,
         d_x: float | NDArray[np.float64],
         kappa_x: float | NDArray[np.float64],
         alpha_x: float | NDArray[np.float64],
         dt: float | NDArray[np.float64],
         output_dtype: np.dtype | None = None,
     ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-        d_x = np.asarray(d_x, dtype=np.float64)
-        kappa_x = np.asarray(kappa_x, dtype=np.float64)
-        alpha_x = np.asarray(alpha_x, dtype=np.float64)
-        dt = np.asarray(dt, dtype=np.float64)
+        xp = self.xp
+        use_gpu = xp is not np
 
-        eps = np.finfo(np.float64).eps  # noqa: F841
+        d_x = xp.asarray(d_x, dtype=xp.float64)
+        kappa_x = xp.asarray(kappa_x, dtype=xp.float64)
+        alpha_x = xp.asarray(alpha_x, dtype=xp.float64)
+        dt = xp.asarray(dt, dtype=xp.float64)
 
-        # b = exp(-(dx/kappa_x + alpha_x) * dt)
-        b = ne.evaluate("exp(-(d_x/kappa_x + alpha_x) * dt)")
+        eps = xp.finfo(xp.float64).eps
 
-        # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
-        denom = ne.evaluate("kappa_x*(d_x + kappa_x*alpha_x) + eps")  # noqa: F841
-
-        # a = dx/denom*(b - 1)
-        a = ne.evaluate("d_x/denom*(b - 1)")
-
-        if output_dtype is not None and output_dtype != np.float64:
+        if use_gpu:
+            b = xp.exp(-(d_x / kappa_x + alpha_x) * dt)
+            denom = kappa_x * (d_x + kappa_x * alpha_x) + eps
+            a = d_x / denom * (b - 1)
+        else:
+            eps_local = eps  # noqa: F841
+            # b = exp(-(dx/kappa_x + alpha_x) * dt)
+            b = ne.evaluate("exp(-(d_x/kappa_x + alpha_x) * dt)")
+            # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
+            denom = ne.evaluate("kappa_x*(d_x + kappa_x*alpha_x) + eps_local")
+            # a = dx/denom*(b - 1)
+            a = ne.evaluate("d_x/denom*(b - 1)")
+
+        if output_dtype is not None and output_dtype != xp.float64:
             a = a.astype(output_dtype, copy=False)
             b = b.astype(output_dtype, copy=False)
 
@@ -803,21 +1029,42 @@ def _compute_one(
 
         items = list(rename_dict.items())
 
-        results = Parallel(n_jobs=self.medium_org.n_jobs, backend="threading")(
-            delayed(_compute_one)(
-                key_fw2,
-                key_py,
-                relaxation_param_dict,
-                alpha_target_higher_nu,
-                d_target_higher_nu,
-                alpha_target_pml,
-                d_target_pml,
-                n_polynomial,
-                self.is_3d,
-                self._apply_transition_and_pml,
-            )
-            for key_fw2, key_py in items
-        )
+        if self.xp is not np:
+            # GPU path: run sequentially to avoid CuPy multi-thread CUDA context issues
+            results = [
+                _compute_one(
+                    key_fw2,
+                    key_py,
+                    relaxation_param_dict,
+                    alpha_target_higher_nu,
+                    d_target_higher_nu,
+                    alpha_target_pml,
+                    d_target_pml,
+                    n_polynomial,
+                    self.is_3d,
+                    self._apply_transition_and_pml,
+                )
+                for key_fw2, key_py in items
+            ]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [
+                    executor.submit(
+                        _compute_one,
+                        key_fw2,
+                        key_py,
+                        relaxation_param_dict,
+                        alpha_target_higher_nu,
+                        d_target_higher_nu,
+                        alpha_target_pml,
+                        d_target_pml,
+                        n_polynomial,
+                        self.is_3d,
+                        self._apply_transition_and_pml,
+                    )
+                    for key_fw2, key_py in items
+                ]
+                results = [f.result() for f in futures]
         out_dict = dict(results)
 
         logger.debug("Calculating PML a and b coefficients...")
@@ -845,13 +1092,13 @@ def _worker(
             # Return keys + values so parent can update dict safely
             return (f"a_pml_{axis}{nu}", a, f"b_pml_{axis}{nu}", b)
 
-        results = Parallel(
-            n_jobs=self.medium_org.n_jobs,  # use all cores
-            backend="loky",  # process-based; safe default for Python code
-            prefer="processes",
-        )(
-            starmap(delayed(_worker), tasks),
-        )
+        if self.xp is not np:
+            # GPU path: run sequentially
+            results = [_worker(nu, axis) for nu, axis in tasks]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
+                results = [f.result() for f in futures]
 
         for a_key, a_val, b_key, b_val in results:
             out_dict[a_key] = a_val
@@ -867,7 +1114,7 @@ def _worker(
 
         return extended_medium
 
-    def _apply_pml_3d(  # noqa: PLR0915
+    def _apply_pml_3d(
         self,
         extended_medium: fullwave.MediumRelaxationMaps,
         theoritical_reflection_coefficient: float,
@@ -1088,21 +1335,43 @@ def _compute_one(
             raise ValueError(error_msg)
 
         items = list(rename_dict.items())
-        results = Parallel(n_jobs=self.medium_org.n_jobs, backend="threading")(
-            delayed(_compute_one)(
-                key_fw2,
-                key_py,
-                relaxation_param_dict,
-                alpha_target_higher_nu,
-                d_target_higher_nu,
-                alpha_target_pml,
-                d_target_pml,
-                n_polynomial,
-                self.is_3d,
-                self._apply_transition_and_pml,
-            )
-            for key_fw2, key_py in items
-        )
+
+        if self.xp is not np:
+            # GPU path: run sequentially to avoid CuPy multi-thread CUDA context issues
+            results = [
+                _compute_one(
+                    key_fw2,
+                    key_py,
+                    relaxation_param_dict,
+                    alpha_target_higher_nu,
+                    d_target_higher_nu,
+                    alpha_target_pml,
+                    d_target_pml,
+                    n_polynomial,
+                    self.is_3d,
+                    self._apply_transition_and_pml,
+                )
+                for key_fw2, key_py in items
+            ]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [
+                    executor.submit(
+                        _compute_one,
+                        key_fw2,
+                        key_py,
+                        relaxation_param_dict,
+                        alpha_target_higher_nu,
+                        d_target_higher_nu,
+                        alpha_target_pml,
+                        d_target_pml,
+                        n_polynomial,
+                        self.is_3d,
+                        self._apply_transition_and_pml,
+                    )
+                    for key_fw2, key_py in items
+                ]
+                results = [f.result() for f in futures]
         out_dict = dict(results)
 
         logger.debug("Calculating PML a and b coefficients...")
@@ -1131,13 +1400,13 @@ def _worker(
             # Return keys + values so parent can update dict safely
             return (f"a_pml_{axis}{nu}", a, f"b_pml_{axis}{nu}", b)
 
-        results = Parallel(
-            n_jobs=self.medium_org.n_jobs,  # use all cores
-            backend="loky",  # process-based; safe default for Python code
-            prefer="processes",
-        )(
-            starmap(delayed(_worker), tasks),
-        )
+        if self.xp is not np:
+            # GPU path: run sequentially
+            results = [_worker(nu, axis) for nu, axis in tasks]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
+                results = [f.result() for f in futures]
 
         for a_key, a_val, b_key, b_val in results:
             out_dict[a_key] = a_val
@@ -1153,7 +1422,7 @@ def _worker(
 
         return extended_medium
 
-    def _apply_transition_and_pml(  # noqa: C901, PLR0912, PLR0915
+    def _apply_transition_and_pml(  # noqa: PLR0912
         self,
         input_array: NDArray[np.float64],
         value_target: float,
@@ -1210,7 +1479,9 @@ def _apply_transition_and_pml(  # noqa: C901, PLR0912, PLR0915
             layer_offset = 0
 
         # Compute transition function once
-        transition_linspace = np.linspace(0, 1, layer_thickness + 1)
+        xp = self.xp
+        use_gpu = xp is not np
+        transition_linspace = xp.linspace(0, 1, layer_thickness + 1)
         transition_map = {
             "smooth": _smooth_transition_function,
             "linear": _linear_transition_function,
@@ -1229,9 +1500,10 @@ def _apply_transition_and_pml(  # noqa: C901, PLR0912, PLR0915
             transition_function = transition_map[transition_type](
                 transition_linspace,
                 n=n_polynomial,
+                xp=xp,
             )
         else:
-            transition_function = transition_map[transition_type](transition_linspace)
+            transition_function = transition_map[transition_type](transition_linspace, xp=xp)
 
         n_axis_extended = array_shape[axis]
         m_offset = self.m_spatial_order + layer_offset
@@ -1240,10 +1512,18 @@ def _apply_transition_and_pml(  # noqa: C901, PLR0912, PLR0915
         up_end = m_offset + layer_thickness
         down_start = n_axis_extended - m_offset - layer_thickness - 1
 
+        # Transfer to GPU if needed
+        if use_gpu:
+            input_array = xp.asarray(input_array)
+
         # Move axis to 0 for uniform processing
-        working_array = np.moveaxis(input_array, axis, 0)
-        # make working_array writeable
-        working_array.setflags(write=True)
+        working_array = xp.moveaxis(input_array, axis, 0)
+        if not use_gpu:
+            # make working_array writeable (numpy-specific)
+            working_array.setflags(write=True)
+        else:
+            # CuPy arrays are always writeable; ensure contiguous copy
+            working_array = working_array.copy()
 
         # Apply boundary conditions
         working_array[: m_offset + layer_thickness] = value_target
@@ -1276,7 +1556,7 @@ def _apply_transition_and_pml(  # noqa: C901, PLR0912, PLR0915
         working_array[down_start:down_end] = down_vals - trans_down * (down_vals - value_target)
 
         # Move axis back
-        return np.moveaxis(working_array, 0, axis)
+        return xp.moveaxis(working_array, 0, axis)
 
     @staticmethod
     def _calc_time_constants(
@@ -1443,6 +1723,7 @@ def __init__(
         *,
         m_spatial_order: int = 8,
         n_pml_layer: int = 40,
+        use_gpu: bool = False,
         # n_transition_layer: int = 40,
         # pml_alpha_target: float = 1.1,
         # pml_alpha_power_target: float = 1.6,
@@ -1468,6 +1749,9 @@ def __init__(
             see Pinton, G. (2021) http://arxiv.org/abs/2106.11476 for more detail.
         n_pml_layer : int, optional
             PML layer thickness (default is 40).
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated PML computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
         n_transition_layer : int, optional
             Number of transition layers (default is 40).
         pml_alpha_target : float, optional
@@ -1507,6 +1791,16 @@ def __init__(
         self.sensor_org = sensor
         self.is_3d = grid.is_3d
 
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("PMLBuilderExponentialAttenuation: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "PMLBuilderExponentialAttenuation: use_gpu=True but CuPy is not available. "
+                "Falling back to CPU (numpy)."
+            )
+
         self.m_spatial_order = m_spatial_order
         self.n_pml_layer = n_pml_layer
         # self.n_transition_layer = n_transition_layer
@@ -1542,47 +1836,31 @@ def __init__(
         )
 
         logger.debug("building extended medium for pml...")
-        # run _extend_map_for_pml in parallel for all medium properties since it is a bottleneck
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            future_sound_speed = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.sound_speed,
-            )
-            future_density = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.density,
-            )
-            future_beta = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.beta,
-            )
-            future_alpha_coeff = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.alpha_coeff,
-            )
-            future_alpha_power = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.alpha_power,
-            )
-
-            extended_sound_speed = future_sound_speed.result()
-            extended_density = future_density.result()
-            extended_beta = future_beta.result()
-            extended_alpha_coeff = future_alpha_coeff.result()
-            extended_alpha_power = future_alpha_power.result()
+        attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
+        if self.xp is not np:
+            named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
+            extended = self._extend_arrays_gpu(named_arrays)
+            # Free original GPU arrays and replace with numpy to reclaim GPU memory
+            self._ensure_numpy_medium_arrays(attr_names)
+        else:
+            named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
+            extended = self._extend_arrays_cpu(named_arrays)
 
+        # Extended arrays are numpy — skip re-upload to GPU to avoid
+        # wasting PCIe bandwidth. CPU numexpr handles subsequent computation.
         self.extended_medium = fullwave.Medium(
             grid=self.extended_grid,
-            sound_speed=extended_sound_speed,
-            density=extended_density,
-            beta=extended_beta,
-            alpha_coeff=extended_alpha_coeff,
-            alpha_power=extended_alpha_power,
+            sound_speed=extended["sound_speed"],
+            density=extended["density"],
+            beta=extended["beta"],
+            alpha_coeff=extended["alpha_coeff"],
+            alpha_power=extended["alpha_power"],
             air_coords=self.medium_org.air_coords + self.num_boundary_points,
             n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
             path_relaxation_parameters_database=self.medium_org.path_relaxation_parameters_database,
             attenuation_builder=self.medium_org.attenuation_builder,
             dtype=getattr(self.medium_org, "dtype", np.float64),
+            use_gpu=False,
         )
         logger.debug("Extended medium for PML built successfully.")
 
@@ -1594,21 +1872,50 @@ def __init__(
             if getattr(self.source_org, "incoords_add", None) is not None
             else None
         )
+        incoords_u_ext = (
+            self.source_org.incoords_u + self.num_boundary_points
+            if getattr(self.source_org, "incoords_u", None) is not None
+            else None
+        )
+        incoords_v_ext = (
+            self.source_org.incoords_v + self.num_boundary_points
+            if getattr(self.source_org, "incoords_v", None) is not None
+            else None
+        )
+        incoords_w_ext = (
+            self.source_org.incoords_w + self.num_boundary_points
+            if getattr(self.source_org, "incoords_w", None) is not None
+            else None
+        )
         self.extended_source = fullwave.Source(
             p0=self.source_org.p0,
             coords=self.source_org.incoords + self.num_boundary_points,
             grid_shape=extended_grid_shape,
             p0_additive=self.source_org.p0_additive,
             coords_additive=incoords_add_ext,
+            u0=getattr(self.source_org, "u0", None),
+            coords_u=incoords_u_ext,
+            v0=getattr(self.source_org, "v0", None),
+            coords_v=incoords_v_ext,
+            w0=getattr(self.source_org, "w0", None),
+            coords_w=incoords_w_ext,
         )
-        extended_sensor_grid_shape = tuple(
-            s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
-        )
-        self.extended_sensor = fullwave.Sensor(
-            coords=self.sensor_org.outcoords + self.num_boundary_points,
-            grid_shape=extended_sensor_grid_shape,
-            sampling_modulus_time=self.sensor_org.sampling_modulus_time,
-        )
+        if self.sensor_org.is_sparse_grid:
+            self.extended_sensor = fullwave.Sensor(
+                mod_x=self.sensor_org.mod_x,
+                mod_y=self.sensor_org.mod_y,
+                mod_z=self.sensor_org.mod_z,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
+        else:
+            extended_sensor_grid_shape = tuple(
+                s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
+            )
+            self.extended_sensor = fullwave.Sensor(
+                coords=self.sensor_org.outcoords + self.num_boundary_points,
+                grid_shape=extended_sensor_grid_shape,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
         logger.debug("Extended source and sensor for PML built successfully.")
 
         logger.debug("Localizing PML region...")
@@ -1652,6 +1959,8 @@ def run(self, *, use_pml: bool = True) -> fullwave.MediumExponentialAttenuation:
             extended_medium: fullwave.MediumExponentialAttenuation = (
                 self.extended_medium.build_exponential()
             )
+            # Free the intermediate Medium's GPU arrays — only alpha_exp is needed
+            self._free_extended_medium_gpu()
             logger.debug("Extended medium for PML built successfully.")
             if self.is_3d:
                 logger.debug("Applying 3D PML to the extended medium...")
@@ -1668,11 +1977,29 @@ def run(self, *, use_pml: bool = True) -> fullwave.MediumExponentialAttenuation:
         extended_medium: fullwave.MediumExponentialAttenuation = (
             self.extended_medium.build_exponential()
         )
+        self._free_extended_medium_gpu()
         logger.debug("Extended medium built successfully without applying PML.")
         return extended_medium
 
-    @staticmethod
-    def _mask_body_2d(nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
+    def _free_extended_medium_gpu(self) -> None:
+        """Free GPU arrays from extended_medium that are no longer needed.
+
+        After build_exponential(), the Medium's alpha_coeff, alpha_power,
+        sound_speed, density, and beta are duplicated in the returned
+        MediumExponentialAttenuation. Free them to reclaim GPU memory.
+        """
+        if self.xp is np:
+            return
+        import cupy as cp  # noqa: PLC0415
+
+        medium = self.extended_medium
+        for attr in ("sound_speed", "density", "beta", "alpha_coeff", "alpha_power"):
+            val = getattr(medium, attr, None)
+            if val is not None and not isinstance(val, np.ndarray):
+                setattr(medium, attr, cp.asnumpy(val))
+        cp.get_default_memory_pool().free_all_blocks()
+
+    def _mask_body_2d(self, nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
         """Create a mask for the PML region.
 
         Parameters
@@ -1691,28 +2018,37 @@ def _mask_body_2d(nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
             Interior (body) region is 1, PML boundary approaches 0.
 
         """
+        xp = self.xp
+        use_gpu = xp is not np
 
         def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
-            d = np.zeros(n, dtype=np.float32)
+            d = xp.zeros(n, dtype=xp.float32)
             if n_body <= 0:
                 return d
-            d[:n_body] = np.arange(n_body, 0, -1, dtype=np.float32)
-            d[-n_body:] = np.arange(1, n_body + 1, dtype=np.float32)
+            d[:n_body] = xp.arange(n_body, 0, -1, dtype=xp.float32)
+            d[-n_body:] = xp.arange(1, n_body + 1, dtype=xp.float32)
             return d
 
-        rx = edge_distance_1d(nx, n_body)[:, None]  # noqa: F841
-        ry = edge_distance_1d(ny, n_body)[None, :]  # noqa: F841
+        rx = edge_distance_1d(nx, n_body)[:, None]
+        ry = edge_distance_1d(ny, n_body)[None, :]
 
-        mask_sq = ne.evaluate("rx*rx + ry*ry")
+        if use_gpu:
+            mask_sq = rx * rx + ry * ry
+            mmax = float(xp.sqrt(mask_sq.max()))
+            if mmax > 0.0:
+                mask_sq = mask_sq / (mmax * mmax)
+            return xp.maximum(1 - xp.sqrt(mask_sq), 0)
 
+        rx_np = rx  # noqa: F841
+        ry_np = ry  # noqa: F841
+        mask_sq = ne.evaluate("rx_np*rx_np + ry_np*ry_np")
         mmax = float(np.sqrt(mask_sq.max()))
         if mmax > 0.0:
             mask_sq = ne.evaluate("mask_sq / (mmax*mmax)")
+        result = ne.evaluate("1 - sqrt(mask_sq)")
+        return np.maximum(result, 0)
 
-        return ne.evaluate("1 - sqrt(mask_sq)")
-
-    @staticmethod
-    def _mask_body_3d(nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]:
+    def _mask_body_3d(self, nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]:
         """Create a mask for the PML region.
 
         Parameters
@@ -1732,21 +2068,33 @@ def _mask_body_3d(nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]
             A 3D numpy array representing the PML mask.
 
         """
+        xp = self.xp
+        use_gpu = xp is not np
 
         def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
-            d = np.zeros(n, dtype=np.float32)
+            d = xp.zeros(n, dtype=xp.float32)
             if n_body <= 0:
                 return d
-            d[:n_body] = np.arange(n_body, 0, -1, dtype=np.float32)
-            d[-n_body:] = np.arange(1, n_body + 1, dtype=np.float32)
+            d[:n_body] = xp.arange(n_body, 0, -1, dtype=xp.float32)
+            d[-n_body:] = xp.arange(1, n_body + 1, dtype=xp.float32)
             return d
 
-        rx = edge_distance_1d(nx, n_body)[:, None, None]  # noqa: F841
-        ry = edge_distance_1d(ny, n_body)[None, :, None]  # noqa: F841
-        rz = edge_distance_1d(nz, n_body)[None, None, :]  # noqa: F841
+        rx = edge_distance_1d(nx, n_body)[:, None, None]
+        ry = edge_distance_1d(ny, n_body)[None, :, None]
+        rz = edge_distance_1d(nz, n_body)[None, None, :]
 
+        if use_gpu:
+            mask_sq = rx * rx + ry * ry + rz * rz
+            mmax = float(xp.sqrt(mask_sq.max()))
+            if mmax > 0.0:
+                mask_sq = mask_sq / (mmax * mmax)
+            return xp.maximum(1 - xp.sqrt(mask_sq), 0)
+
+        rx_np = rx  # noqa: F841
+        ry_np = ry  # noqa: F841
+        rz_np = rz  # noqa: F841
         # 1) compute squared distance with numexpr (no reduction here)
-        mask_sq = ne.evaluate("rx*rx + ry*ry + rz*rz")  # shape (nx, ny, nz), float32
+        mask_sq = ne.evaluate("rx_np*rx_np + ry_np*ry_np + rz_np*rz_np")
 
         # 2) reduction done by NumPy, then scalar sqrt
         mmax = float(np.sqrt(mask_sq.max()))
@@ -1755,7 +2103,8 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
             mask_sq = ne.evaluate("mask_sq / (mmax*mmax)")
 
         # 4) final sqrt elementwise
-        return ne.evaluate("1 - sqrt(mask_sq)")
+        result = ne.evaluate("1 - sqrt(mask_sq)")
+        return np.maximum(result, 0)
 
     def _apply_pml_3d(
         self,
@@ -1767,6 +2116,10 @@ def _apply_pml_3d(
             nz=extended_medium.alpha_exp.shape[2],
             n_body=self.num_boundary_points,
         )
+        if isinstance(extended_medium.alpha_exp, np.ndarray) and not isinstance(a_mask, np.ndarray):
+            import cupy as cp  # noqa: PLC0415
+
+            a_mask = cp.asnumpy(a_mask)
         extended_medium.alpha_exp *= a_mask
         return extended_medium
 
@@ -1779,5 +2132,9 @@ def _apply_pml_2d(
             ny=extended_medium.alpha_exp.shape[1],
             n_body=self.num_boundary_points,
         )
+        if isinstance(extended_medium.alpha_exp, np.ndarray) and not isinstance(a_mask, np.ndarray):
+            import cupy as cp  # noqa: PLC0415
+
+            a_mask = cp.asnumpy(a_mask)
         extended_medium.alpha_exp *= a_mask
         return extended_medium
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 0f8b283..fdc548f 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -1,5 +1,6 @@
 """solver module."""
 
+import gc
 import logging
 import time
 from pathlib import Path
@@ -15,6 +16,7 @@
     MemoryTempfile,
     check_functions,
 )
+from fullwave.utils.signal_filter import apply_filter
 
 from .binary_manager import ensure_binary
 from .cuda_utils import get_cuda_architecture, retrieve_cuda_version
@@ -308,7 +310,7 @@ class Solver:
     generates the required input files, and runs the simulation executable.
     """
 
-    def __init__(  # noqa: PLR0912, PLR0915, C901
+    def __init__(  # noqa: PLR0912
         self,
         work_dir: Path,
         grid: fullwave.Grid,
@@ -328,6 +330,8 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
         use_isotropic_relaxation: bool = True,
         cuda_device_id: str | int | list | None = None,
         save_gpu_memory: bool = False,
+        verify_gpu: bool = True,
+        use_gpu_pml: bool = False,
     ) -> None:
         """Initialize a Solver instance for the fullwave simulation.
 
@@ -405,6 +409,11 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
             example 2: 2 for using GPU 2 or "2" as a string.
         save_gpu_memory : bool, optional
             Whether to save GPU memory by using ICMAT_MEMORY_SAVING flag in the simulation.
+        use_gpu_pml : bool, optional
+            Whether to use CuPy for GPU-accelerated PML computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
+            This accelerates the PML array padding and transition computations
+            using the GPU, which is especially beneficial for large 3D grids.
             The simulation does not load initial conditions into GPU memory and
             it loads the slice of the wavefield needed for the current time step
             from CPU memory at each time step.
@@ -414,6 +423,11 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
             depending on the hardware and the simulation settings.
             useful in 3D simulations with large grid sizes
             where GPU memory is a limiting factor.
+        verify_gpu : bool, optional
+            Whether to verify that the specified CUDA devices exist on the system.
+            Defaults to True. Set to False when generating input files only
+            (``generate_input_only=True``) on a machine that may not have
+            the target GPUs available.
 
         Raises
         ------
@@ -547,6 +561,7 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
 
         self.path_fullwave_simulation_bin = path_fullwave_simulation_bin
         self.cuda_device_id = cuda_device_id
+        self.use_gpu_pml = use_gpu_pml
 
         self.fullwave_launcher = Launcher(
             path_fullwave_simulation_bin,
@@ -554,6 +569,7 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
             use_gpu=self.use_gpu,
             cuda_device_id=self.cuda_device_id,
             save_gpu_memory=self.save_gpu_memory,
+            verify_gpu=verify_gpu,
         )
 
         if use_exponential_attenuation:
@@ -564,6 +580,7 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
                 sensor=self.sensor,
                 m_spatial_order=m_spatial_order,
                 n_pml_layer=pml_layer_thickness_px,
+                use_gpu=use_gpu_pml,
             )
         else:
             self.pml_builder = PMLBuilder(
@@ -575,8 +592,27 @@ def __init__(  # noqa: PLR0912, PLR0915, C901
                 n_pml_layer=pml_layer_thickness_px,
                 n_transition_layer=n_transition_layer,
                 use_isotropic_relaxation=use_isotropic_relaxation,
+                use_gpu=use_gpu_pml,
             )
 
+    @staticmethod
+    def _release_gpu_memory_pools() -> None:
+        """Release all CuPy GPU memory pool blocks back to CUDA.
+
+        Call ``gc.collect()`` first so that Python releases references to
+        CuPy arrays, then drain both the device and pinned memory pools.
+        This prevents stale allocations from causing memory pressure when
+        subsequent operations allocate large GPU arrays.
+        """
+        gc.collect()
+        try:
+            import cupy as cp  # noqa: PLC0415
+
+            cp.get_default_memory_pool().free_all_blocks()
+            cp.get_default_pinned_memory_pool().free_all_blocks()
+        except ImportError:
+            pass
+
     @staticmethod
     def _check_input(
         grid: fullwave.Grid,
@@ -638,10 +674,86 @@ def _check_input(
         error_msg = f"{path_fullwave_simulation_bin} does not exist"
         assert path_fullwave_simulation_bin.exists(), error_msg
 
+    @staticmethod
+    def _validate_filter_params(
+        highpass_cutoff_mhz: float | None,
+        bandpass_cutoff_mhz: tuple[float, float] | None,
+        *,
+        load_results: bool,
+    ) -> None:
+        """Validate high-pass / band-pass filter arguments passed to run().
+
+        Raises
+        ------
+        ValueError
+            If both filter options are set simultaneously, or if a filter is
+            requested without ``load_results=True``.
+
+        """
+        if highpass_cutoff_mhz is not None and bandpass_cutoff_mhz is not None:
+            error_msg = (
+                "highpass_cutoff_mhz and bandpass_cutoff_mhz cannot both be specified. "
+                "Use highpass_cutoff_mhz for a simple high-pass filter or "
+                "bandpass_cutoff_mhz for a band-pass filter."
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+        if (highpass_cutoff_mhz is not None or bandpass_cutoff_mhz is not None) and (
+            not load_results
+        ):
+            error_msg = (
+                "Filtering requires load_results=True. "
+                "Set load_results=True or disable the filter options."
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+    @staticmethod
+    def _apply_output_filter(
+        result: NDArray[np.float64],
+        dt: float,
+        highpass_cutoff_mhz: float | None,
+        bandpass_cutoff_mhz: tuple[float, float] | None,
+    ) -> NDArray[np.float64]:
+        """Apply the optional frequency filter to the reshaped sensor output.
+
+        Parameters
+        ----------
+        result : NDArray[np.float64]
+            Sensor data shaped ``[n_sensors, n_t]``.
+        dt : float
+            Grid time step in seconds.
+        highpass_cutoff_mhz : float | None
+            High-pass edge in MHz, or ``None``.
+        bandpass_cutoff_mhz : tuple[float, float] | None
+            ``(f_low_mhz, f_high_mhz)`` band-pass edges, or ``None``.
+
+        Returns
+        -------
+        NDArray[np.float64]
+            Filtered (or unchanged) sensor data.
+
+        """
+        if highpass_cutoff_mhz is not None:
+            logger.info("Applying high-pass filter at %.4g MHz...", highpass_cutoff_mhz)
+            return apply_filter(result, dt, f_low_hz=highpass_cutoff_mhz * 1e6)
+        if bandpass_cutoff_mhz is not None:
+            f_low_hz = bandpass_cutoff_mhz[0] * 1e6
+            f_high_hz = bandpass_cutoff_mhz[1] * 1e6
+            logger.info(
+                "Applying band-pass filter %.4g-%.4g MHz...",
+                bandpass_cutoff_mhz[0],
+                bandpass_cutoff_mhz[1],
+            )
+            return apply_filter(result, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz)
+        return result
+
     @staticmethod
     def _reshape_sensor_data(
         raw_sensor_output: NDArray[np.float64],
         sensor: fullwave.Sensor,
+        *,
+        n_t: int | None = None,
     ) -> NDArray[np.float64]:
         """Reshape the raw sensor output data.
 
@@ -651,12 +763,21 @@ def _reshape_sensor_data(
             The raw sensor output data from the simulation. [nt*ncoordsout, 1]
         sensor: fullwave.Sensor
             The sensor object used in the simulation.
+        n_t: int | None
+            Number of time steps in the extended grid.  Required for sparse-grid
+            sensors because n_sensors is not known at Python time.
 
         Returns
         -------
         NDArray[np.float64]: The reshaped sensor output data. [ncoordsout, nt]
 
         """
+        if sensor.is_sparse_grid:
+            if n_t is None:
+                msg = "n_t is required to reshape sparse-grid sensor output"
+                raise ValueError(msg)
+            n_t_recorded = -(-n_t // sensor.sampling_modulus_time)  # ceiling division
+            return raw_sensor_output.reshape(n_t_recorded, -1).T
         return raw_sensor_output.reshape(-1, sensor.n_sensors).T
 
     def run(
@@ -670,6 +791,9 @@ def run(
         load_results: bool = True,
         generate_input_only: bool = False,
         release_after_write: bool = False,
+        highpass_cutoff_mhz: float | None = None,
+        bandpass_cutoff_mhz: tuple[float, float] | None = None,
+        gpu_memory_estimate: bool = True,
     ) -> NDArray[np.float64] | Path:
         r"""Run the fullwave simulation and return the result as a NumPy array.
 
@@ -729,6 +853,21 @@ def run(
             If True, the memory used by the input files will be released after writing them to disk.
             This is useful when run_on_memory is True to free up memory space for the simulation
             or when the input files are large. Default is False.
+        highpass_cutoff_mhz : float | None
+            Apply a high-pass filter to the sensor recordings after the simulation.
+            Removes low-frequency PML drift by attenuating frequencies below this value (in MHz).
+            Uses a cosine (Hann) taper to avoid Gibbs ringing.
+            Cannot be combined with ``bandpass_cutoff_mhz``.
+            Requires ``load_results=True``.  Default is ``None`` (no filtering).
+        bandpass_cutoff_mhz : tuple[float, float] | None
+            Apply a band-pass filter ``(f_low_mhz, f_high_mhz)`` to the sensor recordings
+            after the simulation.  Retains only frequencies inside the specified band.
+            Uses cosine (Hann) tapers on both edges.
+            Cannot be combined with ``highpass_cutoff_mhz``.
+            Requires ``load_results=True``.  Default is ``None`` (no filtering).
+        gpu_memory_estimate : bool
+            Whether to estimate GPU memory usage before running the simulation.
+            Default is True. If True, it estimates the GPU memory usage.
 
         Returns
         -------
@@ -746,6 +885,8 @@ def run(
             Static map simulations require input files to be stored on a disk.
             run_on_memory, on the other hand, removes the input files
             after the simulation is complete.
+            Also raised if both ``highpass_cutoff_mhz`` and ``bandpass_cutoff_mhz`` are given,
+            or if either filter option is set but ``load_results=False``.
 
         """
         # self._save_data_for_beamforming()
@@ -767,6 +908,12 @@ def run(
             logger.error(error_msg)
             raise ValueError(error_msg)
 
+        self._validate_filter_params(
+            highpass_cutoff_mhz,
+            bandpass_cutoff_mhz,
+            load_results=load_results,
+        )
+
         start_time = time.time()
         extended_medium = self.pml_builder.run(use_pml=self.use_pml)
         end_pml_builder_time = time.time()
@@ -783,30 +930,27 @@ def run(
             )
             logger.warning(warning_msg)
 
-        sensor_mask: NDArray[np.bool_]
         if record_whole_domain:
-            if self.is_3d:
-                sensor_mask = np.zeros(
-                    (
-                        self.pml_builder.extended_grid.nx,
-                        self.pml_builder.extended_grid.ny,
-                        self.pml_builder.extended_grid.nz,
-                    ),
-                    dtype=bool,
-                )
-            else:
-                sensor_mask = np.zeros(
-                    (self.pml_builder.extended_grid.nx, self.pml_builder.extended_grid.ny),
-                    dtype=bool,
-                )
-            sensor_mask[:, :] = True
+            mod_x = 1
+            mod_y = 1
+            mod_z = 1 if self.is_3d else None
+
             sensor = fullwave.Sensor(
-                mask=sensor_mask,
+                mod_x=mod_x,
+                mod_y=mod_y,
+                mod_z=mod_z,
                 sampling_modulus_time=sampling_modulus_time_whole_domain,
             )
         else:
             sensor = self.pml_builder.extended_sensor
 
+        # pml_thickness = PML + transition layers on each side, excluding ghost cells.
+        # Used by the binary to locate the interior domain when building a sparse sensor grid.
+        if record_whole_domain:
+            pml_thickness = 0
+        else:
+            pml_thickness = self.pml_builder.num_boundary_points - self.pml_builder.m_spatial_order
+
         start_input_file_writer_time = time.time()
         input_file_writer = InputFileWriter(
             work_dir=self.work_dir,
@@ -818,6 +962,8 @@ def run(
             use_exponential_attenuation=self.use_exponential_attenuation,
             use_isotropic_relaxation=self.use_isotropic_relaxation,
             release_after_write=release_after_write,
+            pml_thickness=pml_thickness,
+            use_gpu=self.use_gpu_pml,
         )
         simulation_dir = input_file_writer.run(
             simulation_dir_name,
@@ -831,13 +977,19 @@ def run(
         )
         logger.debug(message)
 
+        if gpu_memory_estimate:
+            self._estimate_gpu_memory(sensor)
+
         if generate_input_only:
             logger.info(
                 "Input data generation completed in %s. Skipping simulation execution.",
                 simulation_dir,
             )
+            self._release_gpu_memory_pools()
             return simulation_dir
 
+        self._release_gpu_memory_pools()
+
         sim_result = self.fullwave_launcher.run(
             simulation_dir,
             load_results=load_results,
@@ -850,6 +1002,7 @@ def run(
             result = self._reshape_sensor_data(
                 sim_result,
                 sensor=sensor,
+                n_t=self.pml_builder.extended_grid.nt,
             )
             end_loading_time = time.time()
             message = (
@@ -857,11 +1010,261 @@ def run(
                 f"{end_loading_time - start_loading_time:.2e} seconds."
             )
             logger.info(message)
-            return result
+
+            return self._apply_output_filter(
+                result,
+                self.grid.dt,
+                highpass_cutoff_mhz,
+                bandpass_cutoff_mhz,
+            )
         # if load_results is False, return the raw result
         # which is a list of file names
         return sim_result
 
+    def _estimate_gpu_memory(
+        self,
+        sensor: fullwave.Sensor,
+    ) -> None:
+        """Estimate and log GPU memory usage per device.
+
+        Provides a pre-launch estimate so users can verify that the simulation
+        fits in GPU memory before the binary starts allocating.
+
+        Parameters
+        ----------
+        sensor : fullwave.Sensor
+            The sensor that will actually be written to the input files (may
+            differ from ``self.sensor`` when ``record_whole_domain=True``).
+
+        """
+        # show that this is an experimental feature
+        logger.info("Estimating GPU memory usage... (experimental feature, may be inaccurate)")
+        device_ids = self.fullwave_launcher.cuda_device_id.split(",")
+        n_gpus = len(device_ids)
+
+        grid = self.pml_builder.extended_grid
+        source = self.pml_builder.extended_source
+        medium = self.pml_builder.extended_medium
+
+        depth = grid.nx
+        lateral = grid.ny * grid.nz if self.is_3d else grid.ny
+        halo_depth = 8
+
+        float_bytes = 4
+        int_bytes = 4
+
+        c_map = medium.sound_speed
+        c_range = int(np.rint(c_map.max()) - np.rint(c_map.min()))
+        n_deriv_levels = 1 if c_range == 0 else c_range + 1
+
+        n_source_timesteps = source.icmat.shape[1]
+
+        gb = 1024.0**3
+
+        base_depth = depth // n_gpus
+        remainder = depth % n_gpus
+
+        for rank, dev_id in enumerate(device_ids):
+            depth_this = base_depth + (1 if rank < remainder else 0)
+
+            if n_gpus == 1:
+                n_halo_sides = 0
+            elif rank == 0 or rank == n_gpus - 1:
+                n_halo_sides = 1
+            else:
+                n_halo_sides = 2
+            local_depth = depth_this + n_halo_sides * halo_depth
+            slab = local_depth * lateral
+
+            n_sources = max(source.n_sources // n_gpus, 0)
+            n_sensors = max(sensor.n_sensors // n_gpus, 0)
+            n_air_local = max(medium.n_air // n_gpus, 0)
+
+            if self.use_exponential_attenuation:
+                total = self._mem_exponential(
+                    slab,
+                    n_deriv_levels,
+                    n_sources,
+                    n_source_timesteps,
+                    save_gpu_memory=self.save_gpu_memory,
+                    n_sensors=n_sensors,
+                    float_bytes=float_bytes,
+                    int_bytes=int_bytes,
+                    is_3d=self.is_3d,
+                )
+            else:
+                total = self._mem_relaxation(
+                    slab,
+                    n_deriv_levels,
+                    n_sources,
+                    n_source_timesteps,
+                    save_gpu_memory=self.save_gpu_memory,
+                    n_air=n_air_local,
+                    n_sensors=n_sensors,
+                    n_relax=self.n_relax_mechanisms,
+                    float_bytes=float_bytes,
+                    int_bytes=int_bytes,
+                    is_3d=self.is_3d,
+                )
+
+            mode = "exponential" if self.use_exponential_attenuation else "relaxation"
+            saving = ", save_gpu_memory=True" if self.save_gpu_memory else ""
+            logger.info(
+                "GPU memory estimate [GPU %s] (%s mode%s): "
+                "%.2f GB  (depth=%d +%d halo, lateral=%d)",
+                dev_id.strip(),
+                mode,
+                saving,
+                total / gb,
+                depth_this,
+                n_halo_sides * halo_depth,
+                lateral,
+            )
+
+    @staticmethod
+    def _mem_exponential(
+        slab: int,
+        n_deriv_levels: int,
+        n_sources: int,
+        n_source_timesteps: int,
+        *,
+        save_gpu_memory: bool,
+        n_sensors: int,
+        float_bytes: int,
+        int_bytes: int,
+        is_3d: bool,
+    ) -> int:
+        """Return estimated GPU bytes for exponential-attenuation solver.
+
+        Parameters
+        ----------
+        slab : int
+            Grid points per GPU slab (local_depth * lateral).
+        n_deriv_levels : int
+            Number of derivative-map levels.
+        n_sources : int
+            Approximate source count on this GPU.
+        n_source_timesteps : int
+            Number of source time steps.
+        save_gpu_memory : bool
+            Whether memory-saving mode is active.
+        n_sensors : int
+            Approximate sensor count on this GPU.
+        float_bytes : int
+            Bytes per float (4).
+        int_bytes : int
+            Bytes per int (4).
+        is_3d: bool,
+            Whether the simulation is 3D (affects sensor memory).
+
+        Returns
+        -------
+        int
+            Total estimated bytes.
+
+        """
+        fb = float_bytes
+        ib = int_bytes
+        ndim = 3 if is_3d else 2
+        n_fields = 4 if is_3d else 3  # p, u, [v], w
+
+        # wave fields: n_fields pairs x 2 time levels
+        mem = n_fields * 2 * slab * fb
+        # material: rho + K + beta + a_exp
+        mem += 4 * slab * fb
+        # derivative maps (dmap + dcmap)
+        mem += 9 * 2 * n_deriv_levels * fb + slab * ib
+        # source (icmat + coords)
+        if n_sources > 0:
+            mem += n_sources * fb if save_gpu_memory else n_sources * n_source_timesteps * fb
+            mem += ndim * n_sources * ib
+        # sensor (genoutframe + coordsout_local + p_idx_array)
+        if n_sensors > 0:
+            mem += n_sensors * fb
+            mem += (ndim + 1) * n_sensors * ib
+            mem += n_sensors * ib
+        return mem
+
+    @staticmethod
+    def _mem_relaxation(
+        slab: int,
+        n_deriv_levels: int,
+        n_sources: int,
+        n_source_timesteps: int,
+        *,
+        save_gpu_memory: bool,
+        n_air: int,
+        n_sensors: int,
+        n_relax: int,
+        float_bytes: int,
+        int_bytes: int,
+        is_3d: bool,
+    ) -> int:
+        """Return estimated GPU bytes for relaxation (power-law) solver.
+
+        Parameters
+        ----------
+        slab : int
+            Grid points per GPU slab (local_depth * lateral).
+        n_deriv_levels : int
+            Number of derivative-map levels.
+        n_sources : int
+            Approximate source count on this GPU.
+        n_source_timesteps : int
+            Number of source time steps.
+        save_gpu_memory : bool
+            Whether memory-saving mode is active.
+        n_air : int
+            Number of zero-pressure (air) coordinates on this GPU.
+        n_sensors : int
+            Approximate sensor count on this GPU.
+        n_relax : int
+            Number of relaxation mechanisms.
+        float_bytes : int
+            Bytes per float (4).
+        int_bytes : int
+            Bytes per int (4).
+        is_3d: bool,
+            Whether the simulation is 3D (affects sensor memory).
+
+        Returns
+        -------
+        int
+            Total estimated bytes.
+
+        """
+        fb = float_bytes
+        ib = int_bytes
+        ndim = 3 if is_3d else 2
+        n_fields = 4 if is_3d else 3  # p, u, [v], w
+
+        # wave fields: n_fields pairs x 2 time levels
+        mem = n_fields * 2 * slab * fb
+        # relaxation psi:
+        mem += 2 * (ndim * n_relax * 2 * slab * fb)
+        # material: rho + K + beta
+        mem += 3 * slab * fb
+        # kappa: 2 arrays (kappa_x1, kappa_x2)
+        mem += 2 * slab * fb
+        # PML: pml_x1 + pml_x2, each has 2 * n_relax arrays
+        mem += 2 * (2 * n_relax) * slab * fb
+        # (dmap + dcmap)
+        mem += 9 * 2 * n_deriv_levels * fb + slab * ib
+        # source (icmat + coords)
+        if n_sources > 0:
+            mem += n_sources * fb if save_gpu_memory else n_sources * n_source_timesteps * fb
+            mem += ndim * n_sources * ib
+
+        # air
+        if n_air > 0:
+            mem += ndim * n_air * ib
+        # sensor
+        if n_sensors > 0:
+            mem += n_sensors * fb
+            mem += (ndim + 1) * n_sensors * ib
+            mem += n_sensors * ib
+        return mem
+
     def print_info(self) -> None:
         """Print the Solver instance information."""
         print(str(self))
diff --git a/fullwave/solver/utils.py b/fullwave/solver/utils.py
index 9986bd6..adbe878 100644
--- a/fullwave/solver/utils.py
+++ b/fullwave/solver/utils.py
@@ -17,10 +17,12 @@ def load_dat_data(dat_file_path: Path, dtype: DTypeLike = np.float32) -> NDArray
         dat_file_path (Path): Path to the .dat file.
         dtype: Data type to use when reading the file.
 
-    Raises:
+    Raises
+    ------
         ValueError: if dat_file_path does not exist.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
     """
@@ -58,7 +60,8 @@ def load_dat_and_reshape(
         n_sensors: Number of sensors
         dtype: Data type to use when reading the file.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
     """
@@ -88,7 +91,8 @@ def initialize_relaxation_param_dict(
 ) -> dict[str, NDArray[np.float64]]:
     """Initialize a dictionary with relaxation parameters.
 
-    Returns:
+    Returns
+    -------
         dict[str, NDArray[np.float64]]: Dictionary of relaxation parameters.
 
     """
@@ -125,10 +129,12 @@ def load_data_with_time_step(
         dtype: Data type to use when reading the file.
         time_step: Time step index to load.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
-    Raises:
+    Raises
+    ------
         ValueError: if file_path does not exist.
 
     """
@@ -159,10 +165,12 @@ def load_data_with_sensor_index(
         sensor_index: Sensor index to load.
         dtype: Data type to use when reading the file.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
-    Raises:
+    Raises
+    ------
         ValueError: if file_path does not exist.
 
     """
diff --git a/fullwave/source.py b/fullwave/source.py
index bbe4dba..74ca2e8 100644
--- a/fullwave/source.py
+++ b/fullwave/source.py
@@ -23,8 +23,14 @@ class Source:
     grid_shape: tuple[int, ...]
     p0_additive: NDArray[np.float64] | None = None
     incoords_add: NDArray[np.int64] | None = None
-
-    def __init__(  # noqa: C901 PLR0912 PLR0915
+    u0: NDArray[np.float64] | None = None
+    incoords_u: NDArray[np.int64] | None = None
+    v0: NDArray[np.float64] | None = None
+    incoords_v: NDArray[np.int64] | None = None
+    w0: NDArray[np.float64] | None = None
+    incoords_w: NDArray[np.int64] | None = None
+
+    def __init__(  # noqa: PLR0912
         self,
         p0: NDArray[np.float64] | None = None,
         mask: NDArray[np.bool] | None = None,
@@ -37,6 +43,16 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
         coords_additive: NDArray[np.int64] | None = None,
         # ---
         grid_shape: tuple[int, ...] | None = None,
+        # --- velocity source components ---
+        u0: NDArray[np.float64] | None = None,
+        coords_u: NDArray[np.int64] | None = None,
+        mask_u: NDArray[np.bool] | None = None,
+        v0: NDArray[np.float64] | None = None,
+        coords_v: NDArray[np.int64] | None = None,
+        mask_v: NDArray[np.bool] | None = None,
+        w0: NDArray[np.float64] | None = None,
+        coords_w: NDArray[np.int64] | None = None,
+        mask_w: NDArray[np.bool] | None = None,
     ) -> None:
         """Source class for Fullwave.
 
@@ -44,8 +60,9 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
         ----------
         p0 : NDArray[np.float64] | None
             Time-varying pressure at each source position; shape [n_sources, nt].
-            If None, p0_additive must be provided (additive-only / soft initial condition):
-            icmat is written as zeros and only the additive term drives the source.
+            If None, either p0_additive or a velocity component (u0/v0/w0) must be provided.
+            When omitted with a velocity-only source, the pressure node list is empty
+            and icmat is written as a zero-row matrix.
         mask : NDArray[np.bool] | None
             binary matrix specifying the positions of the time varying pressure source distribution
             shape: [nx, ny] for 2D, [nx, ny, nz] for 3D.
@@ -54,7 +71,7 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
             Coordinate array of source positions (hard source); shape [n_sources, ndim].
             Must be provided together with grid_shape.
         grid_shape : tuple[int, ...] | None
-            Shape of the computational grid. Required when using coords input.
+            Shape of the computational grid. Required when using coords or velocity-only input.
         p0_additive : NDArray[np.float64] | None
             Optional additive (soft) source term; shape [n_sources_add, nt].
             When provided, positions come from coords_additive or mask_additive,
@@ -66,15 +83,36 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
             Coordinates for the additive source; shape [n_sources_add, ndim].
             If None and p0_additive is provided, defaults to primary incoords.
             Mutually exclusive with mask_additive.
+        u0 : NDArray[np.float64] | None
+            Time-varying velocity in the depth (x) direction; shape [n_sources_u, nt].
+        coords_u : NDArray[np.int64] | None
+            Coordinates for the u-velocity source; shape [n_sources_u, ndim].
+            Mutually exclusive with mask_u.
+        mask_u : NDArray[np.bool] | None
+            Boolean mask for u-velocity source positions. Mutually exclusive with coords_u.
+        v0 : NDArray[np.float64] | None
+            Time-varying velocity in the lateral (y) direction; shape [n_sources_v, nt].
+        coords_v : NDArray[np.int64] | None
+            Coordinates for the v-velocity source; shape [n_sources_v, ndim].
+            Mutually exclusive with mask_v.
+        mask_v : NDArray[np.bool] | None
+            Boolean mask for v-velocity source positions. Mutually exclusive with coords_v.
+        w0 : NDArray[np.float64] | None
+            Time-varying velocity in the elevational (z) direction; shape [n_sources_w, nt].
+        coords_w : NDArray[np.int64] | None
+            Coordinates for the w-velocity source; shape [n_sources_w, ndim].
+            Mutually exclusive with mask_w.
+        mask_w : NDArray[np.bool] | None
+            Boolean mask for w-velocity source positions. Mutually exclusive with coords_w.
 
         Raises
         ------
         ValueError
-            If grid_shape is missing when using coords or additive-only.
+            If grid_shape is missing when using coords, additive-only, or velocity-only.
             If both mask and coords, or both coords_additive and mask_additive, are provided.
-            If primary source positions cannot be resolved (need coords+grid_shape,
-            or mask, or additive-only args).
-            If both p0 and p0_additive are None.
+            If primary source positions cannot be resolved (need coords+grid_shape, mask,
+            additive-only args, or at least one velocity component with grid_shape).
+            If all of p0, p0_additive, u0, v0, and w0 are None.
             If p0 / p0_additive row counts do not match their coordinate arrays.
 
         """
@@ -110,17 +148,28 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
                 mask_add = np.atleast_2d(mask_additive)
                 self.grid_shape = mask_add.shape
                 self.incoords = map_to_coords(mask_add)
+        elif u0 is not None or v0 is not None or w0 is not None:
+            # Velocity-only: no hard pressure nodes; pressure arrays will be empty
+            if grid_shape is None:
+                error_msg = "grid_shape is required for velocity-only source"
+                raise ValueError(error_msg)
+            self.grid_shape = tuple(grid_shape)
+            self.incoords = np.empty((0, len(self.grid_shape)), dtype=np.int64)
         else:
             error_msg = (
                 "Provide (coords + grid_shape), or mask, or"
-                " (p0_additive + (coords_additive or mask_additive))"
+                " (p0_additive + (coords_additive or mask_additive)), or"
+                " a velocity component (u0/v0/w0) with grid_shape"
             )
             raise ValueError(
                 error_msg,
             )
 
-        if p0 is None and p0_additive is None:
-            error_msg = "At least one of p0 or p0_additive must be provided"
+        if p0 is None and p0_additive is None and u0 is None and v0 is None and w0 is None:
+            error_msg = (
+                "At least one of p0, p0_additive, or a velocity component"
+                " (u0/v0/w0) must be provided"
+            )
             raise ValueError(error_msg)
 
         # --- Resolve additive coords (for later use) ---
@@ -148,7 +197,7 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
                 )
                 raise ValueError(error_msg)
             self.p0_additive = np.atleast_2d(p0_additive) if p0_additive is not None else None
-        else:
+        elif p0_additive is not None:
             # Additive-only: icmat written as zeros
             p0_add = np.atleast_2d(p0_additive)
             n_add = _coords_add.shape[0] if _coords_add is not None else self.incoords.shape[0]
@@ -166,6 +215,12 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
                     dtype=np.float64,
                 )
             self.p0_additive = p0_add
+        else:
+            # Velocity-only: no pressure nodes; derive nt from the first velocity signal
+            _first_vel = next(s for s in (u0, v0, w0) if s is not None)
+            nt_vel = np.atleast_2d(_first_vel).shape[1]
+            self.p0 = np.zeros((0, nt_vel), dtype=np.float64)
+            self.p0_additive = None
 
         # --- Set incoords_add for writer/binary ---
         if self.p0_additive is not None:
@@ -173,6 +228,11 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
         else:
             self.incoords_add = None
 
+        # --- Resolve velocity source components ---
+        self.incoords_u, self.u0 = self._resolve_velocity_component("u", u0, coords_u, mask_u)
+        self.incoords_v, self.v0 = self._resolve_velocity_component("v", v0, coords_v, mask_v)
+        self.incoords_w, self.w0 = self._resolve_velocity_component("w", w0, coords_w, mask_w)
+
         self.is_3d = len(self.grid_shape) == 3
         super().__init__()
         self.__post_init__()
@@ -202,11 +262,64 @@ def __post_init__(self) -> None:
                 )
                 raise ValueError(error_msg)
 
+    @staticmethod
+    def _resolve_velocity_component(
+        name: str,
+        signal: NDArray | None,
+        coords: NDArray | None,
+        mask: NDArray | None,
+    ) -> tuple[NDArray | None, NDArray | None]:
+        """Resolve a velocity source component to (incoords, signal_arr).
+
+        Returns
+        -------
+        tuple[NDArray | None, NDArray | None]
+            (incoords, signal_arr) if the component is provided, else (None, None).
+
+        Raises
+        ------
+        ValueError
+            If coords and mask are both provided.
+            If signal is provided but neither coords nor mask are given.
+            If signal is None but coords/mask are provided.
+            If signal row count does not match the number of coordinate points.
+
+        """
+        if signal is None:
+            if coords is not None or mask is not None:
+                error_msg = f"coords_{name}/mask_{name} provided without {name}0 signal"
+                raise ValueError(error_msg)
+            return None, None
+        if coords is not None and mask is not None:
+            error_msg = f"coords_{name} and mask_{name} are mutually exclusive"
+            raise ValueError(error_msg)
+        if coords is None and mask is None:
+            error_msg = f"{name}0 signal provided but neither coords_{name} nor mask_{name} given"
+            raise ValueError(error_msg)
+        if coords is not None:
+            incoords = np.atleast_2d(coords).astype(np.int64, copy=False)
+        else:
+            incoords = map_to_coords(np.atleast_2d(mask))
+        signal_arr = np.atleast_2d(signal)
+        if signal_arr.shape[0] != incoords.shape[0]:
+            error_msg = (
+                f"{name}0 has {signal_arr.shape[0]} rows but "
+                f"coords_{name}/mask_{name} has {incoords.shape[0]} (must match)"
+            )
+            raise ValueError(error_msg)
+        return incoords, signal_arr
+
     def validate(self, grid_shape: NDArray[np.int64] | tuple) -> None:
         """Check if the source coordinates are consistent with the grid shape."""
         grid_shape = tuple(grid_shape) if isinstance(grid_shape, np.ndarray) else grid_shape
         assert self.grid_shape == grid_shape, f"{self.grid_shape} != {grid_shape}"
-        assert self.n_sources > 0 or self.n_sources_add > 0, "No active source found."
+        assert (
+            self.n_sources > 0
+            or self.n_sources_add > 0
+            or self.n_sources_u > 0
+            or self.n_sources_v > 0
+            or self.n_sources_w > 0
+        ), "No active source found."
         logger.debug("Source validated against grid shape.")
 
     @property
@@ -246,6 +359,21 @@ def n_sources_add(self) -> int:
             return 0
         return self.incoords_add.shape[0]
 
+    @property
+    def n_sources_u(self) -> int:
+        """Return the number of velocity-u (depth) source positions."""
+        return self.incoords_u.shape[0] if self.incoords_u is not None else 0
+
+    @property
+    def n_sources_v(self) -> int:
+        """Return the number of velocity-v (lateral) source positions."""
+        return self.incoords_v.shape[0] if self.incoords_v is not None else 0
+
+    @property
+    def n_sources_w(self) -> int:
+        """Return the number of velocity-w (elevational) source positions."""
+        return self.incoords_w.shape[0] if self.incoords_w is not None else 0
+
     def plot(
         self,
         export_path: Path | str | None = Path("./temp/temp.png"),
@@ -312,6 +440,13 @@ def __str__(self) -> str:
             lines.append(f"  p0_additive shape: {self.p0_additive.shape}")
         if self.incoords_add is not None:
             lines.append(f"  incoords_add: {self.incoords_add.shape[0]} points")
+        for comp, sig, n in (
+            ("u", self.u0, self.n_sources_u),
+            ("v", self.v0, self.n_sources_v),
+            ("w", self.w0, self.n_sources_w),
+        ):
+            if sig is not None:
+                lines.append(f"  {comp}0 shape: {sig.shape} ({n} points)")
         return "\n".join(lines) + "\n"
 
     def __repr__(self) -> str:
diff --git a/fullwave/transducer.py b/fullwave/transducer.py
index c0dc858..8678c08 100644
--- a/fullwave/transducer.py
+++ b/fullwave/transducer.py
@@ -22,7 +22,8 @@
 def _make_pos_int(val: float | tuple[float] | tuple[int]) -> NDArray[np.int64]:
     """Force value to be a positive integer.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: Array with positive integers.
 
     """
@@ -202,7 +203,7 @@ def __init__(
         ) = self._create_element_coords()
         logger.debug("TransducerGeometry instance created.")
 
-    def _init_dimensions(  # noqa: C901, PLR0912
+    def _init_dimensions(  # noqa: PLR0912
         self,
         grid: Grid,
         element_width_px: int | None,
@@ -313,7 +314,7 @@ def _init_positions(self, position_px: int, position_m: float) -> tuple[int, flo
             assert len(position_m) == 3, "position_m must have 3 elements for 3D transducer"
         return position_px, position_m
 
-    def _create_element_coords(  # noqa: PLR0912, PLR0915
+    def _create_element_coords(  # noqa: PLR0912
         self,
     ) -> tuple[NDArray[np.int64], NDArray[np.int64], NDArray[np.int64], NDArray[np.int64]]:
         """Build flat coordinate arrays for source and sensor pixels.
@@ -813,15 +814,15 @@ def set_signal(self, value: NDArray[np.float64]) -> None:
     def source_coords(self) -> NDArray[np.int64]:
         """Coordinates of active source pixels; shape [N_src, ndim]."""
         active_ids = np.where(self.active_source_elements)[0] + 1
-        mask = np.isin(self.transducer_geometry._source_ids, active_ids)  # noqa: SLF001
-        return self.transducer_geometry._source_coords[mask]  # noqa: SLF001
+        mask = np.isin(self.transducer_geometry._source_ids, active_ids)
+        return self.transducer_geometry._source_coords[mask]
 
     @property
     def sensor_coords(self) -> NDArray[np.int64]:
         """Coordinates of active sensor pixels; shape [N_snsr, ndim]."""
         active_ids = np.where(self.active_sensor_elements)[0] + 1
-        mask = np.isin(self.transducer_geometry._sensor_ids, active_ids)  # noqa: SLF001
-        return self.transducer_geometry._sensor_coords[mask]  # noqa: SLF001
+        mask = np.isin(self.transducer_geometry._sensor_ids, active_ids)
+        return self.transducer_geometry._sensor_coords[mask]
 
     @property
     def element_id_to_element_surface(self) -> dict[int, NDArray[np.int64]]:
@@ -986,8 +987,8 @@ def dict_source_index_to_location(self) -> dict[int, NDArray[np.int64]]:
     @property
     def element_id_to_element_center(self) -> dict[int, NDArray[np.int64]]:
         """Return the dictionary mapping element IDs to their center coordinates."""
-        coords = self.transducer_geometry._source_coords  # noqa: SLF001
-        ids = self.transducer_geometry._source_ids  # noqa: SLF001
+        coords = self.transducer_geometry._source_coords
+        ids = self.transducer_geometry._source_ids
         n = self.transducer_geometry.number_elements
         ndim = coords.shape[1] if len(coords) else (3 if self.is_3d else 2)
         out: dict[int, NDArray[np.int64]] = {}
@@ -1049,7 +1050,7 @@ def plot_source_mask(
 
         it plots whole transducer geometry including the inactive/active source and sensor elements.
         """
-        import matplotlib.pyplot as plt
+        import matplotlib.pyplot as plt  # noqa: PLC0415
 
         fig, ax = plt.subplots(1, 1, figsize=(10, 10))
         plot_mask = np.zeros(self.transducer_geometry.stored_grid_size)
@@ -1089,7 +1090,7 @@ def plot_sensor_mask(
 
         it plots whole transducer geometry including the inactive/active source and sensor elements.
         """
-        import matplotlib.pyplot as plt
+        import matplotlib.pyplot as plt  # noqa: PLC0415
 
         fig, ax = plt.subplots(1, 1, figsize=(10, 10))
         plot_mask = np.zeros(self.transducer_geometry.stored_grid_size)
diff --git a/fullwave/utils/__init__.py b/fullwave/utils/__init__.py
index 1af42f7..6175687 100644
--- a/fullwave/utils/__init__.py
+++ b/fullwave/utils/__init__.py
@@ -1,6 +1,6 @@
 """misc utils for fullwave package."""
 
-from . import pulse, relaxation_parameters, signal_process
+from . import pulse, relaxation_parameters, signal_filter, signal_process
 from .memory_tempfile import MemoryTempfile
 from .scatterer import (
     generate_resolution_based_scatterer,
@@ -14,5 +14,6 @@
     "generate_scatterer",
     "pulse",
     "relaxation_parameters",
+    "signal_filter",
     "signal_process",
 ]
diff --git a/fullwave/utils/check_functions.py b/fullwave/utils/check_functions.py
index b15d77f..0cc6f58 100644
--- a/fullwave/utils/check_functions.py
+++ b/fullwave/utils/check_functions.py
@@ -7,7 +7,8 @@
 def check_instance(target_var: Any, instances: list[Any] | Any) -> None:  # noqa: ANN401
     """Check whether target_var is an instance of the specified type.
 
-    Raises:
+    Raises
+    ------
         TypeError: If target_var is not an instance of instance.
 
     """
@@ -21,7 +22,8 @@ def check_instance(target_var: Any, instances: list[Any] | Any) -> None:  # noqa
 def check_path_exists(target_path: Path) -> None:
     """Check whether the provided target_path exists.
 
-    Raises:
+    Raises
+    ------
         ValueError: If target_path does not exist.
 
     """
@@ -37,7 +39,8 @@ def check_compatible_value(
 ) -> None:
     """Check whether the provided value is in the list of compatible values.
 
-    Raises:
+    Raises
+    ------
         ValueError: If value is not in compatible_values.
 
     """
diff --git a/fullwave/utils/coordinates.py b/fullwave/utils/coordinates.py
index 1a7798a..cf78427 100644
--- a/fullwave/utils/coordinates.py
+++ b/fullwave/utils/coordinates.py
@@ -16,7 +16,8 @@ def make_circle_idx(
         cen: The center of the circle.
         rad: The radius of the circle.
 
-    Returns:
+    Returns
+    -------
         mask: The mask of the circle.
 
     """
@@ -32,7 +33,8 @@ def map_to_coords(
 ) -> NDArray[np.int64]:
     """Map the mask map to coordinates.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: An array of coordinates corresponding to non-zero elements in the mask.
 
     """
@@ -103,7 +105,8 @@ def map_to_coords_with_sort(map_data: NDArray[np.int64]) -> NDArray[np.int64]:
     Args:
         map_data: The mask map.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: An array of coordinates corresponding to non-zero elements in the mask.
 
     """
@@ -124,7 +127,8 @@ def map_to_coordinates(
         is_3d: Whether the grid is 3D or not.
         sort: Whether to sort the coordinates by the first dimension.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: An array of coordinates corresponding to non-zero elements in the mask.
 
     """
diff --git a/fullwave/utils/memory_tempfile.py b/fullwave/utils/memory_tempfile.py
index 237362d..47b011c 100644
--- a/fullwave/utils/memory_tempfile.py
+++ b/fullwave/utils/memory_tempfile.py
@@ -31,7 +31,7 @@ class MemoryTempfile:
     and create temporary files and directories.
     """
 
-    def __init__(  # noqa: C901, PLR0912
+    def __init__(  # noqa: PLR0912
         self,
         *,
         preferred_paths: list | None = None,
@@ -182,7 +182,7 @@ def mkdtemp(
         self,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> str:
         """Create a temporary directory.
 
@@ -207,7 +207,7 @@ def mkstemp(
         self,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
         *,
         text: bool = False,
     ) -> tuple[int, str]:
@@ -241,7 +241,7 @@ def TemporaryDirectory(  # noqa: N802
         self,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> tempfile.TemporaryDirectory[str]:
         """Create a temporary directory and return a TemporaryDirectory object.
 
@@ -275,7 +275,7 @@ def SpooledTemporaryFile(  # noqa: N802
         newline: str | None = None,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> tempfile.SpooledTemporaryFile:
         """Create a spooled temporary file.
 
@@ -329,7 +329,7 @@ def NamedTemporaryFile(  # noqa: N802
         newline: str | None = None,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
         *,
         delete: bool = True,
     ) -> tempfile.NamedTemporaryFile:
@@ -379,7 +379,7 @@ def TemporaryFile(  # noqa: N802
         newline: str | None = None,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> tempfile.TemporaryFile:
         """Create and return a temporary file.
 
diff --git a/fullwave/utils/plot_utils.py b/fullwave/utils/plot_utils.py
index 6f63f1f..c353229 100644
--- a/fullwave/utils/plot_utils.py
+++ b/fullwave/utils/plot_utils.py
@@ -13,7 +13,7 @@
 from tqdm import tqdm
 
 
-def plot_array(  # noqa: C901, D417, PLR0912
+def plot_array(  # noqa: PLR0912
     x: NDArray[np.float64 | np.int64 | np.bool],
     aspect: float | None = None,
     vmin: float | None = None,
@@ -553,7 +553,7 @@ def plot_wave_propagation_animation(
     animation_data.save(export_name, writer="ffmpeg", dpi=dpi)
 
 
-def plot_wave_propagation_with_map(  # noqa: PLR0915, C901
+def plot_wave_propagation_with_map(
     propagation_map: NDArray[np.float64],
     c_map: NDArray[np.float64],
     rho_map: NDArray[np.float64],
@@ -868,7 +868,7 @@ def plot_wave_propagation_with_map(  # noqa: PLR0915, C901
     plt.close("all")
 
 
-def plot_wave_propagation_snapshot(  # noqa: PLR0915
+def plot_wave_propagation_snapshot(
     propagation_map: NDArray[np.float64],
     c_map: NDArray[np.float64],
     rho_map: NDArray[np.float64],
diff --git a/fullwave/utils/relaxation_parameters.py b/fullwave/utils/relaxation_parameters.py
index e820888..17a80c3 100644
--- a/fullwave/utils/relaxation_parameters.py
+++ b/fullwave/utils/relaxation_parameters.py
@@ -247,6 +247,91 @@ def _map_parameters_search(
     return output_flat.reshape(*spatial_shape, n_params)
 
 
+def _map_parameters_search_gpu(
+    input_tensor: NDArray[np.float64],
+    look_up_table: NDArray[np.float64],
+    alpha_list: NDArray[np.float64],
+    power_list: NDArray[np.float64],
+    invalid_matrix: NDArray[np.bool_],
+) -> NDArray[np.float64]:
+    """GPU version of _map_parameters_search using CuPy searchsorted + fancy indexing.
+
+    Parameters
+    ----------
+    input_tensor : cp.ndarray
+        Input tensor with shape (..., 2) where last dim is (alpha, power).
+    look_up_table : NDArray[np.float64]
+        Precomputed parameter table shape (B1, B2, 4 * n_relaxation + 2).
+    alpha_list : NDArray[np.float64]
+        List of alpha values for the lookup table.
+    power_list : NDArray[np.float64]
+        List of power values for the lookup table.
+    invalid_matrix : NDArray[np.bool_]
+        Matrix indicating invalid (alpha, power) combinations.
+
+    Returns
+    -------
+    cp.ndarray
+        Output tensor with shape (..., 4 * n_relaxation + 2).
+
+    """
+    import cupy as cp  # noqa: PLC0415
+
+    logger.debug("Mapping parameters using CuPy GPU kernel.")
+    time_start = time.time()
+
+    spatial_shape = input_tensor.shape[:-1]
+
+    # Transfer small LUT arrays to GPU (these are tiny, ~KB)
+    alpha_sorted = cp.asarray(alpha_list[0].round(10))
+    power_sorted = cp.asarray(power_list[0].round(10))
+    lut_gpu = cp.asarray(look_up_table)
+
+    # Flatten spatial dims
+    n_elements = int(cp.prod(cp.asarray(list(spatial_shape))))
+    input_flat = input_tensor.reshape(n_elements, 2)
+
+    # Searchsorted on GPU
+    alpha_indices = cp.searchsorted(alpha_sorted, input_flat[:, 0], side="left")
+    power_indices = cp.searchsorted(power_sorted, input_flat[:, 1], side="left")
+
+    # Clip to valid range
+    alpha_indices = cp.clip(alpha_indices, 0, len(alpha_sorted) - 1)
+    power_indices = cp.clip(power_indices, 0, len(power_sorted) - 1)
+
+    # LUT lookup via fancy indexing
+    output_flat = lut_gpu[alpha_indices, power_indices, :]
+
+    time_end = time.time()
+    logger.debug("CuPy GPU kernel time: %.4f seconds.", time_end - time_start)
+
+    # Check for invalid combinations (transfer only the boolean result)
+    invalid_gpu = cp.asarray(invalid_matrix)
+    has_invalid = bool(cp.any(invalid_gpu[alpha_indices, power_indices]))
+    if has_invalid:
+        invalid_flags = invalid_gpu[alpha_indices, power_indices].reshape(spatial_shape)
+        # Transfer only the small set of invalid points for the warning message
+        invalid_indices_np = cp.asnumpy(invalid_flags)
+        input_np = cp.asnumpy(input_tensor)
+        invalid_alpha_power = np.unique(
+            input_np[..., :2][np.where(invalid_indices_np)],
+            axis=0,
+        )
+        invalid_attenuation = ", ".join(
+            [f"({a:.4f}, {p:.4f})" for a, p in invalid_alpha_power],
+        )
+        message = (
+            "Warning: Some attenuation values correspond to invalid relaxation parameters. "
+            "This is due to the limitations of the precomputed lookup table. "
+            "Please change the attenuation values.\n"
+            f"Number of invalid points: {int(cp.sum(invalid_flags))}.\n"
+            f"Invalid attenuation values (alpha, power): {invalid_attenuation}\n"
+        )
+        logger.warning(message)
+
+    return output_flat.reshape(*spatial_shape, look_up_table.shape[2])
+
+
 def generate_relaxation_params(
     alpha_coeff: NDArray[np.float64],
     alpha_power: NDArray[np.float64],
@@ -361,6 +446,9 @@ def generate(
     ) -> dict[str, NDArray[np.float64]]:
         """Generate relaxation parameters based on attenuation values.
 
+        Dispatches to CuPy GPU path when inputs are CuPy arrays,
+        otherwise uses the Numba CPU path.
+
         Parameters
         ----------
         alpha_coeff : NDArray[np.float64]
@@ -374,32 +462,24 @@ def generate(
             A dictionary containing the computed relaxation parameters.
 
         """
-        if np.any(alpha_coeff < self.alpha_min) or np.any(alpha_power < self.power_min):
-            error_msg = (
-                "attenuation is out of range."
-                "the out-of-range values will be clipped to the min value."
-                f"alpha minimum: {self.alpha_min}, "
-                f"power minimum: {self.power_min}"
-            )
-            logger.warning(error_msg)
-        if np.any(alpha_coeff > self.alpha_max) or np.any(alpha_power > self.power_max):
-            error_msg = (
-                "attenuation is out of range."
-                "the out-of-range values will be clipped to the max value."
-                f"alpha maximum: {self.alpha_max}, "
-                f"power maximum: {self.power_max}"
-            )
-            logger.warning(error_msg)
+        use_gpu = not isinstance(alpha_coeff, np.ndarray)
+
+        if use_gpu:
+            return self._generate_gpu(alpha_coeff, alpha_power)
+        return self._generate_cpu(alpha_coeff, alpha_power)
+
+    def _generate_cpu(
+        self,
+        alpha_coeff: NDArray[np.float64],
+        alpha_power: NDArray[np.float64],
+    ) -> dict[str, NDArray[np.float64]]:
+        """CPU path using Numba fused kernel."""
+        self._warn_out_of_range(alpha_coeff, alpha_power, xp=np)
 
         alpha_coeff = np.clip(alpha_coeff, self.alpha_min, self.alpha_max)
         alpha_power = np.clip(alpha_power, self.power_min, self.power_max)
 
-        # # Normalize to [0, 1] for the lookup table
-        # alpha_coeff = (alpha_coeff - self.alpha_min) / (self.alpha_max - self.alpha_min)
-        # alpha_power = (alpha_power - self.power_min) / (self.power_max - self.power_min)
-
         input_data = np.stack([alpha_coeff, alpha_power], axis=-1)
-        # output = _map_parameters(input_data, self.look_up_table, self.alpha_list, self.power_list)
         output = _map_parameters_search(
             input_data,
             self.look_up_table,
@@ -412,3 +492,49 @@ def generate(
         for i, key in enumerate(relaxation_param_dict.keys()):
             relaxation_param_dict[key] = output[..., i]
         return relaxation_param_dict
+
+    def _generate_gpu(
+        self,
+        alpha_coeff: NDArray[np.float64],
+        alpha_power: NDArray[np.float64],
+    ) -> dict[str, NDArray[np.float64]]:
+        """GPU path using CuPy searchsorted + fancy indexing."""
+        import cupy as cp  # noqa: PLC0415
+
+        self._warn_out_of_range(alpha_coeff, alpha_power, xp=cp)
+
+        alpha_coeff = cp.clip(alpha_coeff, self.alpha_min, self.alpha_max)
+        alpha_power = cp.clip(alpha_power, self.power_min, self.power_max)
+
+        input_data = cp.stack([alpha_coeff, alpha_power], axis=-1)
+        output = _map_parameters_search_gpu(
+            input_data,
+            self.look_up_table,
+            self.alpha_list,
+            self.power_list,
+            self.invalid_matrix,
+        )
+
+        relaxation_param_dict = initialize_relaxation_param_dict(self.n_relaxation_mechanisms)
+        for i, key in enumerate(relaxation_param_dict.keys()):
+            relaxation_param_dict[key] = output[..., i]
+        return relaxation_param_dict
+
+    def _warn_out_of_range(self, alpha_coeff: NDArray, alpha_power: NDArray, *, xp: object) -> None:
+        """Log warnings if attenuation values are out of LUT range."""
+        if xp.any(alpha_coeff < self.alpha_min) or xp.any(alpha_power < self.power_min):
+            error_msg = (
+                "attenuation is out of range."
+                "the out-of-range values will be clipped to the min value."
+                f"alpha minimum: {self.alpha_min}, "
+                f"power minimum: {self.power_min}"
+            )
+            logger.warning(error_msg)
+        if xp.any(alpha_coeff > self.alpha_max) or xp.any(alpha_power > self.power_max):
+            error_msg = (
+                "attenuation is out of range."
+                "the out-of-range values will be clipped to the max value."
+                f"alpha maximum: {self.alpha_max}, "
+                f"power maximum: {self.power_max}"
+            )
+            logger.warning(error_msg)
diff --git a/fullwave/utils/signal_filter.py b/fullwave/utils/signal_filter.py
new file mode 100644
index 0000000..8264286
--- /dev/null
+++ b/fullwave/utils/signal_filter.py
@@ -0,0 +1,161 @@
+"""FFT-based frequency-domain filtering for sensor data.
+
+GPU backend: CuPy when available; falls back silently to NumPy.
+No new hard dependencies — CuPy is already listed under the ``examples`` optional extra.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+logger = logging.getLogger("__main__." + __name__)
+
+# Module-level cache to avoid repeated import overhead
+_CUPY_AVAILABLE: bool | None = None
+
+
+def _check_cupy() -> bool:
+    """Return True if CuPy is importable; result is cached after the first call."""
+    global _CUPY_AVAILABLE  # noqa: PLW0603
+    if _CUPY_AVAILABLE is None:
+        try:
+            import cupy  # noqa: F401, PLC0415
+
+            _CUPY_AVAILABLE = True
+        except ImportError:
+            _CUPY_AVAILABLE = False
+    return _CUPY_AVAILABLE
+
+
+def _build_frequency_mask(
+    n_fft: int,
+    dt: float,
+    f_low_hz: float | None = None,
+    f_high_hz: float | None = None,
+    taper_ratio: float = 0.1,
+) -> NDArray[np.float64]:
+    """Build a frequency-domain gain mask with cosine (Hann) tapers.
+
+    Parameters
+    ----------
+    n_fft : int
+        FFT length (number of time samples before zero-padding, i.e. ``n_t``).
+    dt : float
+        Simulation time step in seconds.
+    f_low_hz : float | None
+        High-pass cut-off frequency in Hz.  Frequencies below this value are
+        attenuated.  The mask transitions smoothly from 0 to 1 in a window of
+        width ``f_low_hz * taper_ratio`` centred at ``f_low_hz``.
+    f_high_hz : float | None
+        Low-pass cut-off frequency in Hz.  Frequencies above this value are
+        attenuated.  The mask transitions smoothly from 1 to 0 in a window of
+        width ``f_high_hz * taper_ratio`` centred at ``f_high_hz``.
+    taper_ratio : float
+        Fractional width of each cosine taper relative to its centre frequency.
+        Default is 0.1 (10 %).
+
+    Returns
+    -------
+    NDArray[np.float64]
+        Frequency-domain gain mask of shape ``[n_fft // 2 + 1]``.
+
+    """
+    freqs = np.fft.rfftfreq(n_fft, d=dt)
+    mask = np.ones(len(freqs), dtype=np.float64)
+
+    if f_low_hz is not None:
+        half_width = f_low_hz * taper_ratio / 2.0
+        f_start = f_low_hz - half_width
+        f_end = f_low_hz + half_width
+        width = f_end - f_start  # == f_low_hz * taper_ratio
+
+        in_taper = (freqs >= f_start) & (freqs <= f_end)
+        below_taper = freqs < f_start
+
+        mask[below_taper] = 0.0
+        mask[in_taper] = 0.5 * (1.0 - np.cos(np.pi * (freqs[in_taper] - f_start) / width))
+
+    if f_high_hz is not None:
+        half_width = f_high_hz * taper_ratio / 2.0
+        f_start = f_high_hz - half_width
+        f_end = f_high_hz + half_width
+        width = f_end - f_start  # == f_high_hz * taper_ratio
+
+        in_taper = (freqs >= f_start) & (freqs <= f_end)
+        above_taper = freqs > f_end
+
+        lp_taper = np.ones(len(freqs), dtype=np.float64)
+        lp_taper[in_taper] = 0.5 * (1.0 + np.cos(np.pi * (freqs[in_taper] - f_start) / width))
+        lp_taper[above_taper] = 0.0
+        mask *= lp_taper
+
+    return mask
+
+
+def apply_filter(
+    data: NDArray[np.float64],
+    dt: float,
+    f_low_hz: float | None = None,
+    f_high_hz: float | None = None,
+    taper_ratio: float = 0.1,
+    *,
+    use_gpu: bool = True,
+) -> NDArray[np.float64]:
+    """Apply a frequency-domain filter to sensor data.
+
+    The filter is built as a cosine-tapered gain mask (see :func:`_build_frequency_mask`).
+    When CuPy is available and ``use_gpu=True``, the FFT operations run on the GPU
+    for maximum throughput; otherwise NumPy is used transparently.
+
+    Parameters
+    ----------
+    data : NDArray[np.float64]
+        Sensor time traces, shape ``[n_sensors, n_t]``.
+    dt : float
+        Simulation time step in seconds.
+    f_low_hz : float | None
+        High-pass edge frequency in Hz.  Pass ``None`` to skip high-passing.
+    f_high_hz : float | None
+        Low-pass edge frequency in Hz.  Pass ``None`` to skip low-passing.
+    taper_ratio : float
+        Fractional taper width relative to each cut-off frequency.  Default 0.1.
+    use_gpu : bool
+        If ``True`` (default), attempt to use CuPy for GPU-accelerated FFTs.
+        Falls back to NumPy silently if CuPy is unavailable.
+
+    Returns
+    -------
+    NDArray[np.float64]
+        Filtered data, same shape as ``data``.
+
+    """
+    n_t = data.shape[1]
+    mask = _build_frequency_mask(
+        n_t,
+        dt,
+        f_low_hz=f_low_hz,
+        f_high_hz=f_high_hz,
+        taper_ratio=taper_ratio,
+    )
+
+    if use_gpu and _check_cupy():
+        import cupy as cp  # noqa: PLC0415
+
+        logger.debug("apply_filter: using CuPy GPU backend")
+        data_gpu = cp.asarray(data, dtype=cp.float64)
+        mask_gpu = cp.asarray(mask, dtype=cp.float64)
+        spec = cp.fft.rfft(data_gpu, axis=1)
+        spec *= mask_gpu[cp.newaxis, :]
+        filtered = cp.fft.irfft(spec, n=n_t, axis=1)
+        return cp.asnumpy(filtered)
+
+    logger.debug("apply_filter: using NumPy CPU backend")
+    spec = np.fft.rfft(data, axis=1)
+    spec *= mask[np.newaxis, :]
+    return np.fft.irfft(spec, n=n_t, axis=1)
diff --git a/pyproject.toml b/pyproject.toml
index 2351fce..9365b50 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.3" # Update via bump-my-version, not manually
+version = "1.2.6-dev8" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -12,7 +12,6 @@ dependencies = [
     "opencv-python-headless>=4.12.0.88",
     "tomli>=2.3.0",
     "numba>=0.63.1",
-    "joblib>=1.5.3",
     "numexpr>=2.14.1",
 ]
 authors = [{ name = "Masashi Sode" }, { name = "Gianmarco Pinton" }]
@@ -57,4 +56,13 @@ build-backend = "hatchling.build"
 packages = ["fullwave", "fullwave.utils"]
 
 [tool.hatch.build]
-exclude = ["tests", "figs", "examples", ".vscode", ".github", "fullwave/solver/bins/gpu", "fullwave/solver/bins/exponential_attenuation", "fullwave/solver/bins/_exponential_attenuation"]
+exclude = [
+    "tests",
+    "figs",
+    "examples",
+    ".vscode",
+    ".github",
+    "fullwave/solver/bins/gpu",
+    "fullwave/solver/bins/exponential_attenuation",
+    "fullwave/solver/bins/_exponential_attenuation",
+]
diff --git a/ruff.toml b/ruff.toml
index 955b704..f6b2d84 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -1,4 +1,8 @@
-# Exclude a variety of commonly ignored directories.
+# -------- Core options --------
+line-length = 100
+indent-width = 4
+target-version = "py312"
+
 exclude = [
     ".bzr",
     ".direnv",
@@ -28,87 +32,109 @@ exclude = [
     "venv",
 ]
 
-# Same as Black.
-line-length = 100
-indent-width = 4
-
-# Assume Python 3.13
-target-version = "py312"
-
 [lint]
-# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`)  codes by default.
-# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
-# McCabe complexity (`C901`) by default.
-select = ["ALL"]
-# select = ["E4", "E7", "E9", "F"]
+select = [
+    "E",      # pycodestyle errors
+    "F",      # Pyflakes
+    "W",      # pycodestyle warnings
+    "I",      # isort
+    "N",      # pep8-naming
+    "UP",     # pyupgrade
+    "B",      # bugbear (bug-prone patterns)
+    "SIM",    # simplify
+    "S",      # bandit (security)
+    "C4",     # comprehensions
+    "ARG",    # unused function args
+    "ANN",    # type annotations
+    "D",      # docstrings
+    "PL",     # pylint-like
+    "TC",     # type-checking import placement
+    "RET",    # return-value patterns
+    "RUF100", # unused-noqa
+    "FBT",    # boolean-trap
+    "NPY",    # numpy-specific
+]
 
-# ignore PLR0913 Too many arguments in function definition
-# ignore S101 Use of assert detected
-# ignore PLR2004 Use of Magic number detected
-# ignore S404 `subprocess` module is possibly insecure ()
-# ignore CPY001 Missing copyright notice at top of file
 ignore = [
-    "PLR0913",
-    "PLR0914",
-    "ERA001",
-    "S101",
-    "PLR2004",
-    "S404",
-    "CPY001",
-    "PLR1702",
-    "T201",
+    "PLR0913", # too many arguments
+    "PLR0914", # too many locals
+    "PLR1702", # too many branches
+    "PLR2004", # magic numbers
+    "PLR0915", # too many statements
+    "S101",    # assert allowed in non-prod / tests
+    "S404",    # subprocess warning (you already know what you're doing)
+    "CPY001",  # copyright header
+    "ERA001",  # commented-out code
+    "PLR0904",
 ]
 
-# Allow fix for all enabled rules (when `--fix`) is provided.
 fixable = ["ALL"]
 unfixable = []
 
-# Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
 [lint.per-file-ignores]
-
-# disable D103 (missing docstring in public function) and ANN201 (missing type annotation)
 "tests/**" = [
+    "D100",
     "D101",
+    "D102",
     "D103",
     "D104",
     "D107",
+    "ANN001",
+    "ANN002",
+    "ANN003",
     "ANN201",
     "ANN202",
     "ANN204",
+    "ARG001",
+    "RET504",
+    "S101",
+    "PLC0415", # `import` should be at the top-level of a file
+]
+
+"examples/**" = [
+    "D100",
+    "D101",
+    "D102",
+    "D103",
+    "D104",
+    "D107",
     "ANN001",
     "ANN002",
     "ANN003",
-    "ARG001",
-    "RET504",
+    "ANN201",
+    "ANN202",
+    "ANN204",
+]
+
+# Example: allow more imperative / exploratory style in notebooks.
+"notebooks/**" = [
     "D100",
-    "S101",
+    "D101",
+    "D102",
+    "D103",
+    "D104",
+    "D107",
+    "ANN001",
+    "ANN002",
+    "ANN003",
+    "ANN201",
+    "ANN202",
+    "ANN204",
 ]
 
+[lint.pydocstyle]
+convention = "numpy"
+
 [format]
 # Like Black, use double quotes for strings.
 quote-style = "double"
-
 # Like Black, indent with spaces, rather than tabs.
 indent-style = "space"
-
 # Like Black, respect magic trailing commas.
 skip-magic-trailing-comma = false
-
 # Like Black, automatically detect the appropriate line ending.
 line-ending = "auto"
-
-# Enable auto-formatting of code examples in docstrings. Markdown,
-# reStructuredText code/literal blocks and doctests are all supported.
-#
-# This is currently disabled by default, but it is planned for this
-# to be opt-out in the future.
 docstring-code-format = true
-
-# Set the line length limit used when formatting code snippets in
-# docstrings.
-#
-# This only has an effect when the `docstring-code-format` setting is
-# enabled.
-docstring-code-line-length = "dynamic"
+docstring-code-line-length = 120
diff --git a/tests/medium_builder/presets/test_domain_background.py b/tests/medium_builder/presets/test_domain_background.py
index fd01c28..5de3e80 100644
--- a/tests/medium_builder/presets/test_domain_background.py
+++ b/tests/medium_builder/presets/test_domain_background.py
@@ -76,14 +76,14 @@ def named_material():
 
 def test_setup_base_geometry(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    geo = bd._setup_base_geometry()  # noqa: SLF001
+    geo = bd._setup_base_geometry()
     assert geo.shape == (grid.nx, grid.ny)
     np.testing.assert_array_equal(geo, np.ones((grid.nx, grid.ny)))
 
 
 def test_setup_sound_speed_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    sound_speed = bd._setup_sound_speed()  # noqa: SLF001
+    sound_speed = bd._setup_sound_speed()
     expected = np.ones((grid.nx, grid.ny)) * default_material.sound_speed
     np.testing.assert_array_almost_equal(sound_speed, expected)
 
@@ -94,7 +94,7 @@ def test_setup_sound_speed_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    sound_speed = bd._setup_sound_speed()  # noqa: SLF001
+    sound_speed = bd._setup_sound_speed()
     expected_value = named_material.custom["sound_speed"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(sound_speed, expected)
@@ -102,7 +102,7 @@ def test_setup_sound_speed_named(grid, named_material):
 
 def test_setup_density_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    density = bd._setup_density()  # noqa: SLF001
+    density = bd._setup_density()
     expected = np.ones((grid.nx, grid.ny)) * default_material.density
     np.testing.assert_array_almost_equal(density, expected)
 
@@ -113,7 +113,7 @@ def test_setup_density_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    density = bd._setup_density()  # noqa: SLF001
+    density = bd._setup_density()
     expected_value = named_material.custom["density"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(density, expected)
@@ -121,7 +121,7 @@ def test_setup_density_named(grid, named_material):
 
 def test_setup_alpha_coeff_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    alpha_coeff = bd._setup_alpha_coeff()  # noqa: SLF001
+    alpha_coeff = bd._setup_alpha_coeff()
     expected = np.ones((grid.nx, grid.ny)) * default_material.alpha_coeff
     np.testing.assert_array_almost_equal(alpha_coeff, expected)
 
@@ -132,7 +132,7 @@ def test_setup_alpha_coeff_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    alpha_coeff = bd._setup_alpha_coeff()  # noqa: SLF001
+    alpha_coeff = bd._setup_alpha_coeff()
     expected_value = named_material.custom["alpha_coeff"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(alpha_coeff, expected)
@@ -140,7 +140,7 @@ def test_setup_alpha_coeff_named(grid, named_material):
 
 def test_setup_alpha_power_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    alpha_power = bd._setup_alpha_power()  # noqa: SLF001
+    alpha_power = bd._setup_alpha_power()
     expected = np.ones((grid.nx, grid.ny)) * default_material.alpha_power
     np.testing.assert_array_almost_equal(alpha_power, expected)
 
@@ -151,7 +151,7 @@ def test_setup_alpha_power_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    alpha_power = bd._setup_alpha_power()  # noqa: SLF001
+    alpha_power = bd._setup_alpha_power()
     expected_value = named_material.custom["alpha_power"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(alpha_power, expected)
@@ -159,7 +159,7 @@ def test_setup_alpha_power_named(grid, named_material):
 
 def test_setup_beta_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    beta = bd._setup_beta()  # noqa: SLF001
+    beta = bd._setup_beta()
     expected = np.ones((grid.nx, grid.ny)) * default_material.beta
     np.testing.assert_array_almost_equal(beta, expected)
 
@@ -170,7 +170,7 @@ def test_setup_beta_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    beta = bd._setup_beta()  # noqa: SLF001
+    beta = bd._setup_beta()
     expected_value = named_material.custom["beta"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(beta, expected)
diff --git a/tests/solver/test_gpu_memory_estimate.py b/tests/solver/test_gpu_memory_estimate.py
new file mode 100644
index 0000000..d9ebed5
--- /dev/null
+++ b/tests/solver/test_gpu_memory_estimate.py
@@ -0,0 +1,445 @@
+from pathlib import Path
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+
+import fullwave
+from fullwave.solver import solver as solver_module
+
+_FAKE_BINARY = (
+    Path(__file__).parent.parent.parent
+    / "fullwave"
+    / "solver"
+    / "bins"
+    / "gpu"
+    / "2d"
+    / "num_relax=2"
+    / "fullwave2_2d_2_relax_multi_gpu_cuda129"
+)
+
+
+def _build_solver(
+    tmp_path,
+    *,
+    save_gpu_memory=False,
+    cuda_device_id=None,
+):
+    domain_size = (1e-3, 1e-3)
+    f0 = 1e6
+    c0 = 1540
+    duration = domain_size[0] / c0 * 2
+
+    grid = fullwave.Grid(
+        domain_size=domain_size,
+        f0=f0,
+        duration=duration,
+        c0=c0,
+    )
+
+    shape = (grid.nx, grid.ny)
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=c0 * np.ones(shape),
+        density=1000 * np.ones(shape),
+        alpha_coeff=0.5 * np.ones(shape),
+        alpha_power=1.0 * np.ones(shape),
+        beta=np.zeros(shape),
+        use_isotropic_relaxation=True,
+    )
+
+    p_mask = np.zeros(shape, dtype=bool)
+    p_mask[grid.nx // 2, :] = True
+    source = fullwave.Source(np.ones((p_mask.sum(), grid.nt)), p_mask)
+
+    sensor_mask = np.ones(shape, dtype=bool)
+    sensor = fullwave.Sensor(mask=sensor_mask)
+
+    return fullwave.Solver(
+        work_dir=tmp_path / "work",
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        use_isotropic_relaxation=True,
+        save_gpu_memory=save_gpu_memory,
+        cuda_device_id=cuda_device_id,
+        verify_gpu=False,
+    )
+
+
+def _gpu_log_calls(mock_logger):
+    """Return only per-GPU log calls (skip the experimental feature message)."""
+    return [
+        call for call in mock_logger.info.call_args_list if "GPU memory estimate" in str(call[0][0])
+    ]
+
+
+@pytest.fixture(autouse=True)
+def _patch_binary(monkeypatch):
+    monkeypatch.setattr(
+        solver_module,
+        "_retrieve_fullwave_simulation_path",
+        lambda **kwargs: _FAKE_BINARY,  # noqa: ARG005
+    )
+
+
+def test_single_gpu_relaxation_logs_info(tmp_path):
+    solver = _build_solver(tmp_path)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    assert len(gpu_calls) == 1
+    fmt = gpu_calls[0][0][0]
+    mode_arg = gpu_calls[0][0][2]
+    assert "GPU memory estimate" in fmt
+    assert mode_arg == "relaxation"
+
+
+def test_single_gpu_no_halo(tmp_path):
+    solver = _build_solver(tmp_path)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    # halo argument: dev_id, mode, saving, GB, depth, halo, lateral
+    halo_value = gpu_calls[0][0][6]
+    assert halo_value == 0
+
+
+def test_estimate_is_positive(tmp_path):
+    solver = _build_solver(tmp_path)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    gb_value = gpu_calls[0][0][4]
+    assert gb_value > 0
+
+
+def test_save_gpu_memory_reduces_estimate(tmp_path):
+    solver_no_save = _build_solver(tmp_path, save_gpu_memory=False)
+    solver_save = _build_solver(tmp_path, save_gpu_memory=True)
+
+    with patch("fullwave.solver.solver.logger") as mock_no_save:
+        solver_no_save._estimate_gpu_memory(solver_no_save.pml_builder.extended_sensor)
+    gb_no_save = _gpu_log_calls(mock_no_save)[0][0][4]
+
+    with patch("fullwave.solver.solver.logger") as mock_save:
+        solver_save._estimate_gpu_memory(solver_save.pml_builder.extended_sensor)
+    gb_save = _gpu_log_calls(mock_save)[0][0][4]
+
+    assert gb_save <= gb_no_save
+
+
+def test_save_gpu_memory_label_in_log(tmp_path):
+    solver = _build_solver(tmp_path, save_gpu_memory=True)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    saving_arg = gpu_calls[0][0][3]
+    assert "save_gpu_memory" in saving_arg
+
+
+def test_multi_gpu_two_devices_logs_per_device(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1])
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    assert len(gpu_calls) == 2
+
+
+def test_multi_gpu_two_devices_each_has_one_halo_side(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1])
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    for call in gpu_calls:
+        halo_value = call[0][6]
+        assert halo_value == 8  # 1 side * 8 ghost cells
+
+
+def test_multi_gpu_three_devices_middle_has_two_halo_sides(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1, 2])
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    assert len(gpu_calls) == 3
+    assert gpu_calls[0][0][6] == 8  # first GPU: 1 halo side
+    assert gpu_calls[1][0][6] == 16  # middle GPU: 2 halo sides
+    assert gpu_calls[2][0][6] == 8  # last GPU: 1 halo side
+
+
+def test_multi_gpu_splits_depth(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1])
+    sensor = solver.pml_builder.extended_sensor
+    total_depth = solver.pml_builder.extended_grid.nx
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gpu_calls = _gpu_log_calls(mock_logger)
+    depth_gpu0 = gpu_calls[0][0][5]
+    depth_gpu1 = gpu_calls[1][0][5]
+    assert depth_gpu0 + depth_gpu1 == total_depth
+
+
+# ---------------------------------------------------------------------------
+# Direct formula-verification tests for _mem_exponential / _mem_relaxation
+# ---------------------------------------------------------------------------
+
+# Shared test parameters
+_SLAB = 1000
+_NDLEV = 5
+_NSRC = 100
+_NTIC = 200
+_NSEN = 50
+_NRELAX = 2
+_NAIR = 30
+_FB = 4  # float_bytes
+_IB = 4  # int_bytes
+
+
+class TestMemExponentialFormula:
+    """Verify _mem_exponential returns byte counts matching the C formulas."""
+
+    def test_3d_matches_c_formula(self):
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields (p, u, v, w) x 2 time levels
+            + 4 * _SLAB * _FB  # material: rho, K, beta, a_exp
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat (no save)
+            + 3 * _NSRC * _IB  # source coords (3D)
+            + _NSEN * _FB  # genoutframe
+            + (3 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+    def test_2d_matches_c_formula(self):
+        expected = (
+            3 * 2 * _SLAB * _FB  # fields (p, u, w) x 2 time levels
+            + 4 * _SLAB * _FB  # material
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat
+            + 2 * _NSRC * _IB  # source coords (2D)
+            + _NSEN * _FB  # genoutframe
+            + (2 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=False,
+        )
+        assert result == expected
+
+    def test_save_gpu_memory(self):
+        """save_gpu_memory uses single-slice icmat (n_src x fb) instead of full."""
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=True,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        result_no_save = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        # Difference should be exactly the saved icmat bytes
+        saved = _NSRC * _NTIC * _FB - _NSRC * _FB
+        assert result_no_save - result == saved
+
+    def test_no_sources_no_sensors(self):
+        """Source and sensor terms are zero when counts are 0."""
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields
+            + 4 * _SLAB * _FB  # material
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+        )
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=0,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=0,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+
+class TestMemRelaxationFormula:
+    """Verify _mem_relaxation returns byte counts matching the C formulas."""
+
+    def test_3d_matches_c_formula(self):
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields (p, u, v, w) x 2 time levels
+            + 2 * (3 * _NRELAX * 2 * _SLAB * _FB)  # psi arrays
+            + 3 * _SLAB * _FB  # material: rho, K, beta
+            + 2 * _SLAB * _FB  # kappa
+            + 2 * (2 * _NRELAX) * _SLAB * _FB  # PML
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat (no save)
+            + 3 * _NSRC * _IB  # source coords (3D)
+            + 3 * _NAIR * _IB  # air coords (3D)
+            + _NSEN * _FB  # genoutframe
+            + (3 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+    def test_2d_matches_c_formula(self):
+        expected = (
+            3 * 2 * _SLAB * _FB  # fields (p, u, w) x 2 time levels
+            + 2 * (2 * _NRELAX * 2 * _SLAB * _FB)  # psi arrays (ndim=2)
+            + 3 * _SLAB * _FB  # material
+            + 2 * _SLAB * _FB  # kappa
+            + 2 * (2 * _NRELAX) * _SLAB * _FB  # PML
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat
+            + 2 * _NSRC * _IB  # source coords (2D)
+            + 2 * _NAIR * _IB  # air coords (2D)
+            + _NSEN * _FB  # genoutframe
+            + (2 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=False,
+        )
+        assert result == expected
+
+    def test_no_air_no_sources_no_sensors(self):
+        """Conditional terms are zero when counts are 0."""
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields
+            + 2 * (3 * _NRELAX * 2 * _SLAB * _FB)  # psi
+            + 3 * _SLAB * _FB  # material
+            + 2 * _SLAB * _FB  # kappa
+            + 2 * (2 * _NRELAX) * _SLAB * _FB  # PML
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+        )
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=0,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=0,
+            n_sensors=0,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+    def test_save_gpu_memory(self):
+        """save_gpu_memory uses single-slice icmat (n_src x fb) instead of full."""
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=True,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        result_no_save = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        saved = _NSRC * _NTIC * _FB - _NSRC * _FB
+        assert result_no_save - result == saved
diff --git a/tests/solver/test_input_file_writer.py b/tests/solver/test_input_file_writer.py
index 5da42ad..abca35e 100644
--- a/tests/solver/test_input_file_writer.py
+++ b/tests/solver/test_input_file_writer.py
@@ -44,6 +44,10 @@ def create_dummy_objects():
         outcoords=np.array([[1, 2], [3, 4]], dtype=np.int64),
         sampling_modulus_time=0.5,
         n_sensors=2,
+        mod_x=0,
+        mod_y=0,
+        mod_z=0,
+        is_sparse_grid=False,
     )
     return grid, medium, source, sensor
 
@@ -242,6 +246,246 @@ def test_run_with_incoords_add_writes_icc_add_and_ncoords_add(
     assert ncoords_add_val == 2
 
 
+def _source_with_velocity(**vel_attrs):
+    """Return a dummy source SimpleNamespace with optional velocity attributes."""
+    src = SimpleNamespace(
+        icmat=np.array([[1, 2], [3, 4]], dtype=np.float64),
+        incoords=np.array([[1, 2], [3, 4]], dtype=np.int64),
+        n_sources=2,
+        p0_additive=None,
+        incoords_add=None,
+        n_sources_add=0,
+        u0=None,
+        incoords_u=None,
+        n_sources_u=0,
+        v0=None,
+        incoords_v=None,
+        n_sources_v=0,
+        w0=None,
+        incoords_w=None,
+        n_sources_w=0,
+    )
+    for k, v in vel_attrs.items():
+        setattr(src, k, v)
+    return src
+
+
+def _run_writer(work_dir, bin_file, source, monkeypatch, sim_dir_name="sim_vel"):
+    """Build writer, bypass validation, run, and return sim_path."""
+    grid, medium, _, sensor = create_dummy_objects()
+    monkeypatch.setattr(check_functions, "check_path_exists", lambda x: None)  # noqa: ARG005
+    monkeypatch.setattr(check_functions, "check_instance", lambda inst, cls: None)  # noqa: ARG005
+    writer = InputFileWriter(
+        work_dir,
+        grid,
+        medium,
+        source,
+        sensor,
+        path_fullwave_simulation_bin=bin_file,
+        validate_input=False,
+    )
+    return Path(writer.run(sim_dir_name, is_static_map=False, recalculate_pml=True))
+
+
+class TestVelocitySourceWriter:
+    """Tests that InputFileWriter emits correct files for velocity source components."""
+
+    def test_u0_writes_icmat_u_and_icc_u(self, work_and_bin, monkeypatch):
+        """u0 + incoords_u → icmat_u.dat and icc_u.dat are created."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[0.5, 1.0]], dtype=np.float64)
+        incoords_u = np.array([[2, 3]], dtype=np.int64)
+        source = _source_with_velocity(u0=u0, incoords_u=incoords_u, n_sources_u=1)
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        assert (sim_path / "icmat_u.dat").exists(), "icmat_u.dat not created"
+        assert (sim_path / "icc_u.dat").exists(), "icc_u.dat not created"
+
+    def test_u0_content(self, work_and_bin, monkeypatch):
+        """icmat_u.dat contains u0 data; icc_u.dat contains incoords_u."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[0.5, 1.0]], dtype=np.float64)  # shape (1, 2) — transpose is still (2, 1)
+        incoords_u = np.array([[2, 3]], dtype=np.int64)
+        source = _source_with_velocity(u0=u0, incoords_u=incoords_u, n_sources_u=1)
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        # icmat_u.dat is written as np.transpose(u0) in float32
+        icmat_u_data = np.fromfile(sim_path / "icmat_u.dat", dtype=np.float32)
+        np.testing.assert_array_almost_equal(icmat_u_data, u0.astype(np.float32).ravel())
+
+        # icc_u.dat written as int32
+        icc_u_data = np.fromfile(sim_path / "icc_u.dat", dtype=np.int32)
+        np.testing.assert_array_equal(icc_u_data, incoords_u.ravel())
+
+    def test_ncoords_u_dat_written_with_correct_count(self, work_and_bin, monkeypatch):
+        """ncoords_u.dat is written with the correct integer count."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[0.5, 1.0], [0.2, 0.3]], dtype=np.float64)
+        incoords_u = np.array([[0, 1], [1, 0]], dtype=np.int64)
+        source = _source_with_velocity(u0=u0, incoords_u=incoords_u, n_sources_u=2)
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        ncoords_u_path = sim_path / "ncoords_u.dat"
+        assert ncoords_u_path.exists(), "ncoords_u.dat not created"
+        val = np.fromfile(ncoords_u_path, dtype=np.int32)
+        assert int(val) == 2
+
+    def test_all_three_velocity_components_write_files(self, work_and_bin, monkeypatch):
+        """All three u/v/w components produce their respective files."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[1.0, 2.0]], dtype=np.float64)
+        v0 = np.array([[3.0, 4.0]], dtype=np.float64)
+        w0 = np.array([[5.0, 6.0]], dtype=np.float64)
+        source = _source_with_velocity(
+            u0=u0,
+            incoords_u=np.array([[0, 1]], dtype=np.int64),
+            n_sources_u=1,
+            v0=v0,
+            incoords_v=np.array([[1, 0]], dtype=np.int64),
+            n_sources_v=1,
+            w0=w0,
+            incoords_w=np.array([[0, 0]], dtype=np.int64),
+            n_sources_w=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        for suffix in ("u", "v", "w"):
+            assert (sim_path / f"icmat_{suffix}.dat").exists(), f"icmat_{suffix}.dat missing"
+            assert (sim_path / f"icc_{suffix}.dat").exists(), f"icc_{suffix}.dat missing"
+            assert (sim_path / f"ncoords_{suffix}.dat").exists(), f"ncoords_{suffix}.dat missing"
+
+    def test_no_velocity_source_no_velocity_files(self, work_and_bin, monkeypatch):
+        """When no velocity source is set, no velocity-related files are created."""
+        work_dir, bin_file = work_and_bin
+        _, _, source, _ = create_dummy_objects()
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        for suffix in ("u", "v", "w"):
+            assert not (sim_path / f"icmat_{suffix}.dat").exists()
+            assert not (sim_path / f"icc_{suffix}.dat").exists()
+            assert not (sim_path / f"ncoords_{suffix}.dat").exists()
+
+    def test_only_v_component_writes_only_v_files(self, work_and_bin, monkeypatch):
+        """Only v present → only icmat_v/icc_v/ncoords_v written; u and w absent."""
+        work_dir, bin_file = work_and_bin
+        v0 = np.array([[7.0, 8.0]], dtype=np.float64)
+        source = _source_with_velocity(
+            v0=v0,
+            incoords_v=np.array([[3, 3]], dtype=np.int64),
+            n_sources_v=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        assert (sim_path / "icmat_v.dat").exists()
+        assert (sim_path / "icc_v.dat").exists()
+        assert (sim_path / "ncoords_v.dat").exists()
+        assert not (sim_path / "icmat_u.dat").exists()
+        assert not (sim_path / "icc_u.dat").exists()
+        assert not (sim_path / "ncoords_u.dat").exists()
+        assert not (sim_path / "icmat_w.dat").exists()
+        assert not (sim_path / "icc_w.dat").exists()
+        assert not (sim_path / "ncoords_w.dat").exists()
+
+    def test_velocity_only_no_pressure_files_are_empty(self, work_and_bin, monkeypatch):
+        """Velocity-only source (n_sources=0) writes empty icmat/icc and correct ncoords=0."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[1.0, 2.0]], dtype=np.float64)
+        source = _source_with_velocity(
+            # override pressure fields to match a velocity-only source
+            icmat=np.zeros((0, 2), dtype=np.float64),
+            incoords=np.empty((0, 2), dtype=np.int64),
+            n_sources=0,
+            u0=u0,
+            incoords_u=np.array([[0, 1]], dtype=np.int64),
+            n_sources_u=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        # ncoords.dat should be 0
+        ncoords_val = np.fromfile(sim_path / "ncoords.dat", dtype=np.int32)
+        assert int(ncoords_val) == 0
+
+        # icmat_u.dat and icc_u.dat should exist with correct content
+        assert (sim_path / "icmat_u.dat").exists()
+        assert (sim_path / "ncoords_u.dat").exists()
+        ncoords_u_val = np.fromfile(sim_path / "ncoords_u.dat", dtype=np.int32)
+        assert int(ncoords_u_val) == 1
+
+    def test_all_five_components_written_and_content_correct(
+        self,
+        work_and_bin,
+        monkeypatch,
+    ):
+        """p, additive, u, v, and w all set → all files written with correct binary content."""
+        work_dir, bin_file = work_and_bin
+
+        p0 = np.array([[1.0, 2.0]], dtype=np.float64)
+        p0_additive = np.array([[0.1, 0.2]], dtype=np.float64)
+        u0 = np.array([[3.0, 4.0]], dtype=np.float64)
+        v0 = np.array([[5.0, 6.0]], dtype=np.float64)
+        w0 = np.array([[7.0, 8.0]], dtype=np.float64)
+
+        incoords = np.array([[0, 0]], dtype=np.int64)
+        incoords_add = np.array([[1, 0]], dtype=np.int64)
+        incoords_u = np.array([[0, 1]], dtype=np.int64)
+        incoords_v = np.array([[1, 1]], dtype=np.int64)
+        incoords_w = np.array([[2, 0]], dtype=np.int64)
+
+        source = SimpleNamespace(
+            icmat=p0,
+            incoords=incoords,
+            n_sources=1,
+            p0_additive=p0_additive,
+            incoords_add=incoords_add,
+            n_sources_add=1,
+            u0=u0,
+            incoords_u=incoords_u,
+            n_sources_u=1,
+            v0=v0,
+            incoords_v=incoords_v,
+            n_sources_v=1,
+            w0=w0,
+            incoords_w=incoords_w,
+            n_sources_w=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        # --- pressure (hard) ---
+        icmat_data = np.fromfile(sim_path / "icmat.dat", dtype=np.float32)
+        np.testing.assert_array_almost_equal(icmat_data, p0.astype(np.float32).ravel())
+        icc_data = np.fromfile(sim_path / "icc.dat", dtype=np.int32)
+        np.testing.assert_array_equal(icc_data, incoords.ravel())
+
+        # --- additive (soft) ---
+        icmat_add_data = np.fromfile(sim_path / "icmat_add.dat", dtype=np.float32)
+        np.testing.assert_array_almost_equal(icmat_add_data, p0_additive.astype(np.float32).ravel())
+        icc_add_data = np.fromfile(sim_path / "icc_add.dat", dtype=np.int32)
+        np.testing.assert_array_equal(icc_add_data, incoords_add.ravel())
+        ncoords_add_val = np.fromfile(sim_path / "ncoords_add.dat", dtype=np.int32)
+        assert int(ncoords_add_val) == 1
+
+        # --- velocity u, v, w ---
+        for suffix, sig, icc in (
+            ("u", u0, incoords_u),
+            ("v", v0, incoords_v),
+            ("w", w0, incoords_w),
+        ):
+            icmat_vel = np.fromfile(sim_path / f"icmat_{suffix}.dat", dtype=np.float32)
+            np.testing.assert_array_almost_equal(icmat_vel, sig.astype(np.float32).ravel())
+            icc_vel = np.fromfile(sim_path / f"icc_{suffix}.dat", dtype=np.int32)
+            np.testing.assert_array_equal(icc_vel, icc.ravel())
+            ncoords_vel = np.fromfile(sim_path / f"ncoords_{suffix}.dat", dtype=np.int32)
+            assert int(ncoords_vel) == 1
+
+
 def _dc_map_original(c_map: np.ndarray) -> np.ndarray:
     """Original _set_dc_map logic before in-place optimization."""
     return matlab_round(c_map) - matlab_round(c_map.min()) + 1
diff --git a/tests/solver/test_launcher.py b/tests/solver/test_launcher.py
index b7f97b1..f52dac7 100644
--- a/tests/solver/test_launcher.py
+++ b/tests/solver/test_launcher.py
@@ -127,7 +127,7 @@ def test_configure_cuda_device_id_none(monkeypatch):
     )
 
     # Test with None input
-    result = Launcher._configure_cuda_device_id(None)  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id(None)
     assert result == "0"
 
 
@@ -141,7 +141,7 @@ def test_configure_cuda_device_id_int(monkeypatch):
         selective_run_wrapper(monkeypatch=monkeypatch, gpu_ids=gpu_ids),
     )
     # Test with integer input
-    result = Launcher._configure_cuda_device_id(0)  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id(0)
     assert result == "0"
 
 
@@ -156,7 +156,7 @@ def test_configure_cuda_device_id_string(monkeypatch):
     )
 
     # Test with string input
-    result = Launcher._configure_cuda_device_id("1")  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id("1")
     assert result == "1"
 
 
@@ -170,7 +170,7 @@ def test_configure_cuda_device_id_list(monkeypatch):
         selective_run_wrapper(monkeypatch=monkeypatch, gpu_ids=gpu_ids),
     )
     # Test with list input
-    result = Launcher._configure_cuda_device_id([0, 1, 2])  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id([0, 1, 2])
     assert result == "0,1,2"
 
 
@@ -185,7 +185,7 @@ def test_configure_cuda_device_id_negative_int(monkeypatch):
     )
     # Test with negative integer
     with pytest.raises(ValueError, match="CUDA device ID must be a non-negative integer"):
-        Launcher._configure_cuda_device_id(-1)  # noqa: SLF001
+        Launcher._configure_cuda_device_id(-1)
 
 
 def test_configure_cuda_device_id_invalid_string(monkeypatch):
@@ -202,7 +202,7 @@ def test_configure_cuda_device_id_invalid_string(monkeypatch):
         ValueError,
         match="CUDA device ID string must represent a non-negative integer",
     ):
-        Launcher._configure_cuda_device_id("invalid")  # noqa: SLF001
+        Launcher._configure_cuda_device_id("invalid")
 
 
 def test_configure_cuda_device_id_list_with_negative(monkeypatch):
@@ -220,7 +220,7 @@ def test_configure_cuda_device_id_list_with_negative(monkeypatch):
         ValueError,
         match="All CUDA device IDs in the list must be non-negative integers",
     ):
-        Launcher._configure_cuda_device_id([0, -1, 2])  # noqa: SLF001
+        Launcher._configure_cuda_device_id([0, -1, 2])
 
 
 def test_configure_cuda_device_id_invalid_type(monkeypatch):
@@ -238,7 +238,7 @@ def test_configure_cuda_device_id_invalid_type(monkeypatch):
         ValueError,
         match="CUDA device ID must be an integer, string, list, or None",
     ):
-        Launcher._configure_cuda_device_id(math.pi)  # noqa: SLF001
+        Launcher._configure_cuda_device_id(math.pi)
 
 
 def test_launcher_init_invalid_bin_path(monkeypatch):
diff --git a/tests/solver/test_signal_filter.py b/tests/solver/test_signal_filter.py
new file mode 100644
index 0000000..8738a66
--- /dev/null
+++ b/tests/solver/test_signal_filter.py
@@ -0,0 +1,261 @@
+"""Unit tests for fullwave.utils.signal_filter — no GPU required."""
+
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+import fullwave
+from fullwave.solver.solver import Solver
+from fullwave.utils.signal_filter import _build_frequency_mask, apply_filter
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_signal(dt: float, n_t: int, frequencies_hz: list[float]) -> np.ndarray:
+    """Sum of pure sinusoids, shape [1, n_t]."""
+    t = np.arange(n_t) * dt
+    sig = sum(np.sin(2 * np.pi * f * t) for f in frequencies_hz)
+    return np.asarray(sig, dtype=np.float64)[np.newaxis, :]
+
+
+# ---------------------------------------------------------------------------
+# _build_frequency_mask
+# ---------------------------------------------------------------------------
+
+
+def test_mask_shape():
+    n_fft = 512
+    dt = 1e-8
+    mask = _build_frequency_mask(n_fft, dt)
+    assert mask.shape == (n_fft // 2 + 1,)
+
+
+def test_mask_all_ones_when_no_cutoff():
+    mask = _build_frequency_mask(256, 1e-8)
+    np.testing.assert_array_equal(mask, np.ones(256 // 2 + 1))
+
+
+def test_mask_highpass_dc_zero():
+    """DC bin (index 0) must be zero for any high-pass cutoff > 0."""
+    mask = _build_frequency_mask(256, 1e-8, f_low_hz=1e6)
+    assert mask[0] == 0.0
+
+
+def test_mask_highpass_passband_one():
+    """Bins well above the cutoff should be ≈ 1."""
+    dt = 1e-8
+    n_fft = 1024
+    f_low_hz = 1e6
+    mask = _build_frequency_mask(n_fft, dt, f_low_hz=f_low_hz)
+    freqs = np.fft.rfftfreq(n_fft, d=dt)
+    passband = freqs > f_low_hz * 1.1
+    np.testing.assert_allclose(mask[passband], 1.0, atol=1e-10)
+
+
+def test_mask_bandpass_range():
+    """Bins well inside the band ≈ 1; bins well outside ≈ 0."""
+    dt = 1e-8
+    n_fft = 1024
+    f_low_hz = 1e6
+    f_high_hz = 5e6
+    mask = _build_frequency_mask(n_fft, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz)
+    freqs = np.fft.rfftfreq(n_fft, d=dt)
+
+    passband = (freqs > f_low_hz * 1.2) & (freqs < f_high_hz * 0.8)
+    stopband_low = freqs < f_low_hz * 0.8
+    stopband_high = freqs > f_high_hz * 1.2
+
+    np.testing.assert_allclose(mask[passband], 1.0, atol=1e-10)
+    np.testing.assert_allclose(mask[stopband_low], 0.0, atol=1e-10)
+    np.testing.assert_allclose(mask[stopband_high], 0.0, atol=1e-10)
+
+
+# ---------------------------------------------------------------------------
+# apply_filter — functional tests (CPU only via use_gpu=False)
+# ---------------------------------------------------------------------------
+
+
+def test_no_filter_passthrough():
+    """With no cutoffs the output must equal the input (up to float rounding)."""
+    dt = 1e-8
+    n_t = 512
+    rng = np.random.default_rng(0)
+    data = rng.standard_normal((8, n_t))
+    result = apply_filter(data, dt, use_gpu=False)
+    np.testing.assert_allclose(result, data, atol=1e-10)
+
+
+def test_highpass_removes_dc():
+    """DC + sine: after high-pass the DC component should vanish, sine should survive."""
+    dt = 1e-8
+    n_t = 2048
+    f_signal_hz = 3e6  # 3 MHz — well above the 0.5 MHz cutoff
+    f_low_hz = 0.5e6
+
+    t = np.arange(n_t) * dt
+    dc = 5.0 * np.ones(n_t)
+    sine = np.sin(2 * np.pi * f_signal_hz * t)
+    data = (dc + sine)[np.newaxis, :]
+
+    result = apply_filter(data, dt, f_low_hz=f_low_hz, use_gpu=False)
+
+    # DC (mean) should be nearly zero
+    assert abs(result[0].mean()) < 0.05
+
+    # Sine amplitude should be close to 1.0 — check via RMS
+    rms = np.sqrt(np.mean(result[0] ** 2))
+    assert 0.6 < rms < 1.1, f"Expected RMS ~0.7, got {rms}"
+
+
+def test_bandpass_filter_gain_matches_mask():
+    """Gold-standard performance test: measured per-frequency gain must match the mask.
+
+    The mask returned by ``_build_frequency_mask`` is the specification.  We feed
+    pure sinusoids at two frequencies (one well inside the band, one well outside),
+    run the filter, then recover the actual gain at each frequency via FFT.  The
+    measured gain must agree with the mask value at that bin to within 1 %.
+    """
+    dt = 1e-8
+    n_t = 8192  # large n for clean spectral resolution
+    f_inband_hz = 3e6  # 3 MHz — flat passband, mask ≈ 1
+    f_outband_hz = 0.2e6  # 0.2 MHz — well into stopband, mask ≈ 0
+
+    f_low_hz = 1e6
+    f_high_hz = 5e6
+
+    t = np.arange(n_t) * dt
+    data = (np.sin(2 * np.pi * f_inband_hz * t) + np.sin(2 * np.pi * f_outband_hz * t))[
+        np.newaxis,
+        :,
+    ]
+
+    result = apply_filter(data, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz, use_gpu=False)
+
+    # --- gold standard: mask values at the exact test frequencies ---
+    freqs = np.fft.rfftfreq(n_t, d=dt)
+    mask = _build_frequency_mask(n_t, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz)
+
+    idx_inband = int(np.argmin(np.abs(freqs - f_inband_hz)))
+    idx_outband = int(np.argmin(np.abs(freqs - f_outband_hz)))
+
+    expected_gain_inband = mask[idx_inband]  # should be ≈ 1.0
+    expected_gain_outband = mask[idx_outband]  # should be ≈ 0.0
+
+    # --- measured gains via amplitude spectrum ---
+    input_spec = np.abs(np.fft.rfft(data[0]))
+    output_spec = np.abs(np.fft.rfft(result[0]))
+
+    measured_gain_inband = output_spec[idx_inband] / input_spec[idx_inband]
+    measured_gain_outband = output_spec[idx_outband] / input_spec[idx_outband]
+
+    tol = 0.01  # 1 % absolute tolerance on gain
+
+    assert abs(measured_gain_inband - expected_gain_inband) < tol, (
+        f"In-band gain {measured_gain_inband:.4f} deviates from mask {expected_gain_inband:.4f}"
+    )
+    assert abs(measured_gain_outband - expected_gain_outband) < tol, (
+        f"Out-of-band gain {measured_gain_outband:.4f} deviates from mask "
+        f"{expected_gain_outband:.4f}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Validation via solver.run() — no GPU, no binary needed
+# ---------------------------------------------------------------------------
+
+
+def test_highpass_and_bandpass_raises():
+    """solver.run() must raise ValueError when both filter params are set."""
+    grid = MagicMock(spec=fullwave.Grid)
+    grid.is_3d = False
+    grid.ppw = 4
+    grid.nx = 100
+    grid.ny = 100
+    grid.nz = 1
+    grid.dt = 1e-8
+
+    medium = MagicMock(spec=fullwave.Medium)
+    medium.n_relaxation_mechanisms = 2
+    medium.use_isotropic_relaxation = True
+
+    source = MagicMock(spec=fullwave.Source)
+    sensor = MagicMock(spec=fullwave.Sensor)
+
+    fake_bin = MagicMock()
+    fake_bin.exists.return_value = True
+
+    # Bypass __init__ entirely, just test the validation in run()
+    solver = object.__new__(Solver)
+    solver.run_on_memory = False
+    solver.work_dir = MagicMock()
+    solver.grid = grid
+    solver.medium = medium
+    solver.is_3d = False
+    solver.use_gpu = False
+    solver.use_exponential_attenuation = False
+    solver.use_isotropic_relaxation = True
+    solver.n_relax_mechanisms = 2
+    solver.source = source
+    solver.sensor = sensor
+    solver.transducer = None
+    solver.path_fullwave_simulation_bin = fake_bin
+    solver.cuda_device_id = None
+    solver.use_pml = True
+    solver.save_gpu_memory = False
+
+    pml_builder = MagicMock()
+    solver.pml_builder = pml_builder
+    solver.fullwave_launcher = MagicMock()
+
+    with pytest.raises(ValueError, match="cannot both be specified"):
+        solver.run(
+            highpass_cutoff_mhz=0.5,
+            bandpass_cutoff_mhz=(1.0, 5.0),
+        )
+
+
+def test_filter_requires_load_results():
+    """Passing a filter option with load_results=False must raise ValueError."""
+    grid = MagicMock(spec=fullwave.Grid)
+    grid.is_3d = False
+    grid.ppw = 4
+    grid.nx = 100
+    grid.ny = 100
+    grid.nz = 1
+    grid.dt = 1e-8
+
+    medium = MagicMock(spec=fullwave.Medium)
+    medium.n_relaxation_mechanisms = 2
+    medium.use_isotropic_relaxation = True
+
+    source = MagicMock(spec=fullwave.Source)
+    sensor = MagicMock(spec=fullwave.Sensor)
+
+    solver = object.__new__(Solver)
+    solver.run_on_memory = False
+    solver.work_dir = MagicMock()
+    solver.grid = grid
+    solver.medium = medium
+    solver.is_3d = False
+    solver.use_gpu = False
+    solver.use_exponential_attenuation = False
+    solver.use_isotropic_relaxation = True
+    solver.n_relax_mechanisms = 2
+    solver.source = source
+    solver.sensor = sensor
+    solver.transducer = None
+    solver.path_fullwave_simulation_bin = MagicMock()
+    solver.cuda_device_id = None
+    solver.use_pml = True
+    solver.save_gpu_memory = False
+    solver.pml_builder = MagicMock()
+    solver.fullwave_launcher = MagicMock()
+
+    with pytest.raises(ValueError, match="load_results=True"):
+        solver.run(
+            highpass_cutoff_mhz=0.5,
+            load_results=False,
+        )
diff --git a/tests/test_cupy_equivalence.py b/tests/test_cupy_equivalence.py
new file mode 100644
index 0000000..ac64f54
--- /dev/null
+++ b/tests/test_cupy_equivalence.py
@@ -0,0 +1,926 @@
+"""Tests verifying that CuPy (GPU) and NumPy (CPU) paths produce identical results.
+
+Every test in this module is automatically skipped when CuPy is not installed
+or when no CUDA device is available.
+"""
+
+import numpy as np
+import pytest
+
+import fullwave.medium as medium_module
+from fullwave.medium import Medium, MediumExponentialAttenuation, MediumRelaxationMaps
+from fullwave.solver.input_file_writer import InputFileWriter
+from fullwave.solver.pml_builder import PMLBuilder, PMLBuilderExponentialAttenuation
+from fullwave.solver.utils import initialize_relaxation_param_dict
+from fullwave.utils.relaxation_parameters import RelaxationParametersGenerator
+
+# ---------------------------------------------------------------------------
+# Skip the entire module when CuPy / CUDA is unavailable
+# ---------------------------------------------------------------------------
+try:
+    import cupy as cp
+
+    cp.cuda.runtime.getDeviceCount()  # raises if no device
+    _CUPY_AVAILABLE = True
+except Exception:
+    _CUPY_AVAILABLE = False
+
+pytestmark = pytest.mark.skipif(not _CUPY_AVAILABLE, reason="CuPy or CUDA device not available")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+class DummyGrid2D:
+    def __init__(self, nx, ny, dt=1e-8, f0=1e6, c0=1540.0, ppw=12, cfl=0.4):
+        self.nx = nx
+        self.ny = ny
+        self.nz = 1
+        self.dx = c0 / (f0 * ppw)
+        self.dy = self.dx
+        self.dz = self.dx
+        self.dt = dt
+        self.f0 = f0
+        self.c0 = c0
+        self.ppw = ppw
+        self.cfl = cfl
+        self.duration = dt * 100
+        self.omega = 2.0 * np.pi * f0
+        self.is_3d = False
+
+
+class DummyGrid3D:
+    def __init__(self, nx, ny, nz, dt=1e-8, f0=1e6, c0=1540.0, ppw=12, cfl=0.4):
+        self.nx = nx
+        self.ny = ny
+        self.nz = nz
+        self.dx = c0 / (f0 * ppw)
+        self.dy = self.dx
+        self.dz = self.dx
+        self.dt = dt
+        self.f0 = f0
+        self.c0 = c0
+        self.ppw = ppw
+        self.cfl = cfl
+        self.duration = dt * 100
+        self.omega = 2.0 * np.pi * f0
+        self.is_3d = True
+
+
+def _to_np(arr):
+    """Convert CuPy array to numpy; no-op for numpy arrays."""
+    if isinstance(arr, np.ndarray):
+        return arr
+    return arr.get()
+
+
+def _dummy_check_functions():
+    return type(
+        "dummy",
+        (),
+        {
+            "check_instance": lambda *_args: None,
+            "check_path_exists": lambda *_args: None,
+            "check_compatible_value": lambda *_args: None,
+        },
+    )()
+
+
+def _get_relaxation_dict(shape, n_relaxation_mechanisms=2):
+    base = initialize_relaxation_param_dict(n_relaxation_mechanisms=n_relaxation_mechanisms)
+    rng = np.random.default_rng(42)
+    return {key: rng.uniform(0.5, 2.0, size=shape) for key in base}
+
+
+# ---------------------------------------------------------------------------
+# Medium tests
+# ---------------------------------------------------------------------------
+class TestMediumCupyEquivalence:
+    """Compare CPU vs GPU for Medium methods."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_medium_pair(self, grid_shape, grid):
+        rng = np.random.default_rng(123)
+        sound_speed = rng.uniform(1400, 1600, grid_shape)
+        density = rng.uniform(900, 1100, grid_shape)
+        alpha_coeff = rng.uniform(0.1, 1.0, grid_shape)
+        alpha_power = rng.uniform(1.0, 2.0, grid_shape)
+        beta = rng.uniform(0.5, 1.5, grid_shape)
+
+        cpu = Medium(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_coeff.copy(),
+            alpha_power.copy(),
+            beta.copy(),
+            use_gpu=False,
+        )
+        gpu = Medium(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_coeff.copy(),
+            alpha_power.copy(),
+            beta.copy(),
+            use_gpu=True,
+        )
+        return cpu, gpu
+
+    def test_db_mhz_cm_to_a_exp_2d(self):
+        shape = (32, 32)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_medium_pair(shape, grid)
+
+        cpu_result = cpu._db_mhz_cm_to_a_exp(cpu.alpha_coeff)
+        gpu_result = gpu._db_mhz_cm_to_a_exp(gpu.alpha_coeff)
+
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-12)
+
+    def test_db_mhz_cm_to_a_exp_3d(self):
+        shape = (16, 16, 16)
+        grid = DummyGrid3D(nx=shape[0], ny=shape[1], nz=shape[2])
+        cpu, gpu = self._make_medium_pair(shape, grid)
+
+        cpu_result = cpu._db_mhz_cm_to_a_exp(cpu.alpha_coeff)
+        gpu_result = gpu._db_mhz_cm_to_a_exp(gpu.alpha_coeff)
+
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-12)
+
+
+class TestMediumRelaxationMapsCupyEquivalence:
+    """Compare CPU vs GPU for MediumRelaxationMaps methods."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_pair(self, grid_shape, grid):
+        rng = np.random.default_rng(456)
+        sound_speed = rng.uniform(1400, 1600, grid_shape)
+        density = rng.uniform(900, 1100, grid_shape)
+        beta = rng.uniform(0.5, 1.5, grid_shape)
+        relax = _get_relaxation_dict(grid_shape)
+
+        cpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=False,
+        )
+        gpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=True,
+        )
+        return cpu, gpu
+
+    def test_relaxation_param_dict_2d(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        for key in cpu.relaxation_param_dict:
+            np.testing.assert_allclose(
+                _to_np(gpu.relaxation_param_dict[key]),
+                cpu.relaxation_param_dict[key],
+                rtol=1e-10,
+                err_msg=f"relaxation_param_dict[{key}] mismatch",
+            )
+
+    def test_relaxation_param_dict_for_fw2_2d(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        for key in cpu.relaxation_param_dict_for_fw2:
+            np.testing.assert_allclose(
+                _to_np(gpu.relaxation_param_dict_for_fw2[key]),
+                cpu.relaxation_param_dict_for_fw2[key],
+                rtol=1e-10,
+                err_msg=f"relaxation_param_dict_for_fw2[{key}] mismatch",
+            )
+
+    def test_calc_a_and_b(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        rng = np.random.default_rng(789)
+        dx = rng.uniform(0.01, 0.5, shape)
+        kappa = rng.uniform(0.5, 3.0, shape)
+        alpha = rng.uniform(0.01, 0.5, shape)
+        dt = 1e-8
+
+        a_cpu, b_cpu = cpu._calc_a_and_b(dx, kappa, alpha, dt)
+        a_gpu, b_gpu = gpu._calc_a_and_b(dx, kappa, alpha, dt)
+
+        np.testing.assert_allclose(_to_np(a_gpu), a_cpu, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(b_gpu), b_cpu, rtol=1e-12)
+
+
+# ---------------------------------------------------------------------------
+# PMLBuilder tests
+# ---------------------------------------------------------------------------
+class TestPMLBuilderCupyEquivalence:
+    """Compare CPU vs GPU for PMLBuilder._extend_map_for_pml and _apply_transition_and_pml."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_pml_pair(self, grid, medium, source, sensor, **kwargs):
+        cpu = PMLBuilder(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+            **kwargs,
+        )
+        gpu = PMLBuilder(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+            **kwargs,
+        )
+        return cpu, gpu
+
+    @pytest.fixture()
+    def setup_2d(self):
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(0.01, 0.01),
+            f0=1e6,
+            duration=1e-6,
+            c0=1540.0,
+            ppw=12,
+            cfl=0.4,
+        )
+        rng = np.random.default_rng(111)
+        shape = (grid.nx, grid.ny)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        alpha_coeff = rng.uniform(0.1, 1.0, shape)
+        alpha_power = rng.uniform(1.0, 2.0, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+
+        medium = fullwave.Medium(
+            grid=grid,
+            sound_speed=sound_speed,
+            density=density,
+            alpha_coeff=alpha_coeff,
+            alpha_power=alpha_power,
+            beta=beta,
+        )
+        src_coords = np.array([[grid.nx // 2, y] for y in range(grid.ny)])
+        source = fullwave.Source(
+            p0=np.ones((src_coords.shape[0], grid.nt)),
+            coords=src_coords,
+            grid_shape=shape,
+        )
+        sen_coords = np.array([[grid.nx // 2 + 5, y] for y in range(grid.ny)])
+        sensor = fullwave.Sensor(
+            coords=sen_coords,
+            grid_shape=shape,
+        )
+        return grid, medium, source, sensor
+
+    def test_extend_map_for_pml_2d_fill_edge(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        rng = np.random.default_rng(222)
+        arr = rng.uniform(0.0, 100.0, (grid.nx, grid.ny))
+
+        cpu_result = cpu._extend_map_for_pml(arr.copy(), fill_edge=True)
+        gpu_result = gpu._extend_map_for_pml(arr.copy(), fill_edge=True)
+
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-14)
+
+    def test_extend_map_for_pml_2d_zero_fill(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        rng = np.random.default_rng(333)
+        arr = rng.uniform(0.0, 100.0, (grid.nx, grid.ny))
+
+        cpu_result = cpu._extend_map_for_pml(arr.copy(), fill_edge=False)
+        gpu_result = gpu._extend_map_for_pml(arr.copy(), fill_edge=False)
+
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-14)
+
+    def test_apply_transition_and_pml_2d(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        ext_shape = cpu.extended_medium.sound_speed.shape
+        rng = np.random.default_rng(444)
+        arr = rng.uniform(0.1, 5.0, ext_shape)
+
+        for axis in [0, 1]:
+            for transition_type in ["smooth", "linear", "cosine", "polynomial"]:
+                cpu_result = cpu._apply_transition_and_pml(
+                    arr.copy(),
+                    value_target=0.0,
+                    array_shape=ext_shape,
+                    axis=axis,
+                    transition_type=transition_type,
+                    is_3d=False,
+                )
+                gpu_result = gpu._apply_transition_and_pml(
+                    arr.copy(),
+                    value_target=0.0,
+                    array_shape=ext_shape,
+                    axis=axis,
+                    transition_type=transition_type,
+                    is_3d=False,
+                )
+                np.testing.assert_allclose(
+                    _to_np(gpu_result),
+                    cpu_result,
+                    rtol=1e-12,
+                    err_msg=f"axis={axis}, transition_type={transition_type}",
+                )
+
+    def test_calc_a_and_b(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        rng = np.random.default_rng(555)
+        shape = cpu.extended_medium.sound_speed.shape
+        dx = rng.uniform(0.01, 0.5, shape)
+        kappa = rng.uniform(0.5, 3.0, shape)
+        alpha = rng.uniform(0.01, 0.5, shape)
+        dt = grid.dt
+
+        a_cpu, b_cpu = cpu._calc_a_and_b(dx, kappa, alpha, dt)
+        a_gpu, b_gpu = gpu._calc_a_and_b(dx, kappa, alpha, dt)
+
+        np.testing.assert_allclose(_to_np(a_gpu), a_cpu, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(b_gpu), b_cpu, rtol=1e-12)
+
+    def test_extended_medium_identical(self, setup_2d):
+        """The extended medium (after __init__) should be identical CPU vs GPU."""
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.sound_speed),
+            cpu.extended_medium.sound_speed,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.density),
+            cpu.extended_medium.density,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.alpha_coeff),
+            cpu.extended_medium.alpha_coeff,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.alpha_power),
+            cpu.extended_medium.alpha_power,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.beta),
+            cpu.extended_medium.beta,
+            rtol=1e-14,
+        )
+
+
+class TestPMLBuilderExponentialAttenuationCupyEquivalence:
+    """Compare CPU vs GPU for PMLBuilderExponentialAttenuation."""
+
+    @pytest.fixture()
+    def setup_2d(self):
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(0.01, 0.01),
+            f0=1e6,
+            duration=1e-6,
+            c0=1540.0,
+            ppw=12,
+            cfl=0.4,
+        )
+        rng = np.random.default_rng(666)
+        shape = (grid.nx, grid.ny)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        alpha_coeff = rng.uniform(0.1, 1.0, shape)
+        alpha_power = rng.uniform(1.0, 2.0, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+
+        medium = fullwave.Medium(
+            grid=grid,
+            sound_speed=sound_speed,
+            density=density,
+            alpha_coeff=alpha_coeff,
+            alpha_power=alpha_power,
+            beta=beta,
+        )
+        src_coords = np.array([[grid.nx // 2, y] for y in range(grid.ny)])
+        source = fullwave.Source(
+            p0=np.ones((src_coords.shape[0], grid.nt)),
+            coords=src_coords,
+            grid_shape=shape,
+        )
+        sen_coords = np.array([[grid.nx // 2 + 5, y] for y in range(grid.ny)])
+        sensor = fullwave.Sensor(
+            coords=sen_coords,
+            grid_shape=shape,
+        )
+        return grid, medium, source, sensor
+
+    def test_mask_body_2d(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+        )
+        gpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+        )
+
+        nx, ny = cpu.extended_medium.sound_speed.shape[:2]
+        cpu_mask = cpu._mask_body_2d(nx, ny, cpu.num_boundary_points)
+        gpu_mask = gpu._mask_body_2d(nx, ny, gpu.num_boundary_points)
+
+        np.testing.assert_allclose(_to_np(gpu_mask), cpu_mask, rtol=1e-5, atol=1e-7)
+
+    def test_extended_medium_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+        )
+        gpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+        )
+
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.sound_speed),
+            cpu.extended_medium.sound_speed,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.density),
+            cpu.extended_medium.density,
+            rtol=1e-14,
+        )
+
+    def test_run_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu_builder = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+        )
+        gpu_builder = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+        )
+
+        cpu_result = cpu_builder.run(use_pml=True)
+        gpu_result = gpu_builder.run(use_pml=True)
+
+        np.testing.assert_allclose(
+            _to_np(gpu_result.sound_speed),
+            cpu_result.sound_speed,
+            rtol=1e-12,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu_result.density),
+            cpu_result.density,
+            rtol=1e-12,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu_result.alpha_exp),
+            cpu_result.alpha_exp,
+            rtol=1e-5,
+            atol=1e-7,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu_result.beta),
+            cpu_result.beta,
+            rtol=1e-12,
+        )
+
+
+class TestMediumExponentialAttenuationCupyEquivalence:
+    """Compare CPU vs GPU for MediumExponentialAttenuation."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_pair(self, grid_shape, grid):
+        rng = np.random.default_rng(999)
+        sound_speed = rng.uniform(1400, 1600, grid_shape)
+        density = rng.uniform(900, 1100, grid_shape)
+        alpha_exp = rng.uniform(0.9, 1.0, grid_shape)
+        beta = rng.uniform(0.5, 1.5, grid_shape)
+
+        cpu = MediumExponentialAttenuation(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_exp.copy(),
+            beta.copy(),
+            use_gpu=False,
+        )
+        gpu = MediumExponentialAttenuation(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_exp.copy(),
+            beta.copy(),
+            use_gpu=True,
+        )
+        return cpu, gpu
+
+    def test_bulk_modulus_2d(self):
+        shape = (32, 32)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        np.testing.assert_allclose(_to_np(gpu.bulk_modulus), cpu.bulk_modulus, rtol=1e-12)
+
+    def test_bulk_modulus_3d(self):
+        shape = (16, 16, 16)
+        grid = DummyGrid3D(nx=shape[0], ny=shape[1], nz=shape[2])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        np.testing.assert_allclose(_to_np(gpu.bulk_modulus), cpu.bulk_modulus, rtol=1e-12)
+
+
+class TestMediumRelaxationMapsBulkModulusCupyEquivalence:
+    """Compare CPU vs GPU for MediumRelaxationMaps.bulk_modulus."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def test_bulk_modulus_2d(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        rng = np.random.default_rng(456)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+        relax = _get_relaxation_dict(shape)
+
+        cpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=False,
+        )
+        gpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=True,
+        )
+        np.testing.assert_allclose(_to_np(gpu.bulk_modulus), cpu.bulk_modulus, rtol=1e-12)
+
+
+class TestInputFileWriterCupyEquivalence:
+    """Compare CPU vs GPU for InputFileWriter._set_dc_map and dim calc."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    @pytest.fixture()
+    def setup_2d(self):
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(1e-2, 1e-2),
+            f0=1e6,
+            duration=1e-5,
+            c0=1540.0,
+            ppw=6,
+            cfl=0.4,
+        )
+        shape = (grid.nx, grid.ny)
+
+        rng = np.random.default_rng(777)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        alpha_exp = rng.uniform(0.9, 1.0, shape)
+        beta = np.zeros(shape)
+
+        medium = fullwave.MediumExponentialAttenuation(
+            grid,
+            sound_speed,
+            density,
+            alpha_exp,
+            beta,
+            use_gpu=False,
+        )
+        return grid, medium
+
+    def test_dim_calc(self, setup_2d):
+        grid, medium = setup_2d
+        # CPU dim
+        cpu_dim = int(
+            np.rint(medium.sound_speed.max()) - np.rint(medium.sound_speed.min()),
+        )
+        # GPU dim
+        import cupy as cp
+
+        c_gpu = cp.asarray(medium.sound_speed)
+        gpu_dim = int(cp.rint(c_gpu.max()) - cp.rint(c_gpu.min()))
+
+        assert cpu_dim == gpu_dim
+
+    def test_dc_map(self, setup_2d, tmp_path):
+        import fullwave
+
+        grid, medium = setup_2d
+
+        src_coords = np.array([[grid.nx // 2, grid.ny // 2]])
+        source = fullwave.Source(
+            p0=np.ones((1, 10)),
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+        sensor = fullwave.Sensor(
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+
+        cpu_writer = InputFileWriter(
+            work_dir=tmp_path / "cpu",
+            grid=grid,
+            medium=medium,
+            source=source,
+            sensor=sensor,
+            validate_input=False,
+            use_exponential_attenuation=True,
+            use_gpu=False,
+        )
+        gpu_writer = InputFileWriter(
+            work_dir=tmp_path / "gpu",
+            grid=grid,
+            medium=medium,
+            source=source,
+            sensor=sensor,
+            validate_input=False,
+            use_exponential_attenuation=True,
+            use_gpu=True,
+        )
+        np.testing.assert_array_equal(_to_np(gpu_writer._dc_map), cpu_writer._dc_map)
+
+    def test_precomputed_bulk_modulus(self, setup_2d, tmp_path):
+        import fullwave
+
+        grid, medium = setup_2d
+
+        src_coords = np.array([[grid.nx // 2, grid.ny // 2]])
+        source = fullwave.Source(
+            p0=np.ones((1, 10)),
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+        sensor = fullwave.Sensor(
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+
+        gpu_writer = InputFileWriter(
+            work_dir=tmp_path / "gpu",
+            grid=grid,
+            medium=medium,
+            source=source,
+            sensor=sensor,
+            validate_input=False,
+            use_exponential_attenuation=True,
+            use_gpu=True,
+        )
+        # GPU path should precompute bulk_modulus
+        assert gpu_writer._precomputed_bulk_modulus is not None
+
+        # Compare with medium.bulk_modulus (CPU reference)
+        expected = np.multiply(
+            medium.sound_speed.astype(np.float32) ** 2,
+            medium.density.astype(np.float32),
+        )
+        np.testing.assert_allclose(
+            gpu_writer._precomputed_bulk_modulus,
+            expected,
+            rtol=1e-5,
+        )
+
+
+class TestPMLBuilderRelaxationCupyEquivalence:
+    """Compare CPU vs GPU for PMLBuilder (multiple relaxation path)."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    @pytest.fixture()
+    def setup_2d(self):
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(1e-2, 1e-2),
+            f0=1e6,
+            duration=1e-5,
+            c0=1540.0,
+            ppw=6,
+            cfl=0.4,
+        )
+        shape = (grid.nx, grid.ny)
+
+        rng = np.random.default_rng(321)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+        relax = _get_relaxation_dict(shape)
+
+        medium = fullwave.MediumRelaxationMaps(
+            grid,
+            sound_speed,
+            density,
+            beta,
+            relax,
+            use_gpu=False,
+        )
+        src_coords = np.array([[grid.nx // 2, i] for i in range(grid.ny)])
+        sen_coords = np.array([[0, i] for i in range(grid.ny)])
+        source = fullwave.Source(
+            p0=np.ones((src_coords.shape[0], 10)),
+            coords=src_coords,
+            grid_shape=shape,
+        )
+        sensor = fullwave.Sensor(
+            coords=sen_coords,
+            grid_shape=shape,
+        )
+        return grid, medium, source, sensor
+
+    def test_extended_medium_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu = PMLBuilder(grid, medium, source, sensor, use_gpu=False)
+        gpu = PMLBuilder(grid, medium, source, sensor, use_gpu=True)
+
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.sound_speed),
+            cpu.extended_medium.sound_speed,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu.extended_medium.density),
+            cpu.extended_medium.density,
+            rtol=1e-14,
+        )
+        for key in cpu.extended_medium.relaxation_param_dict:
+            np.testing.assert_allclose(
+                _to_np(gpu.extended_medium.relaxation_param_dict[key]),
+                cpu.extended_medium.relaxation_param_dict[key],
+                rtol=1e-10,
+                err_msg=f"extended relaxation_param_dict[{key}] mismatch",
+            )
+
+    def test_run_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu_builder = PMLBuilder(grid, medium, source, sensor, use_gpu=False)
+        gpu_builder = PMLBuilder(grid, medium, source, sensor, use_gpu=True)
+
+        cpu_result = cpu_builder.run(use_pml=True)
+        gpu_result = gpu_builder.run(use_pml=True)
+
+        np.testing.assert_allclose(
+            _to_np(gpu_result.sound_speed),
+            cpu_result.sound_speed,
+            rtol=1e-12,
+        )
+        np.testing.assert_allclose(
+            _to_np(gpu_result.density),
+            cpu_result.density,
+            rtol=1e-12,
+        )
+        for key in cpu_result.relaxation_param_dict_for_fw2:
+            np.testing.assert_allclose(
+                _to_np(gpu_result.relaxation_param_dict_for_fw2[key]),
+                cpu_result.relaxation_param_dict_for_fw2[key],
+                rtol=1e-10,
+                err_msg=f"run() relaxation_param_dict_for_fw2[{key}] mismatch",
+            )
+
+
+class TestRelaxationParametersGeneratorCupyEquivalence:
+    """Compare CPU (Numba) vs GPU (CuPy) for RelaxationParametersGenerator."""
+
+    @pytest.fixture()
+    def generator(self):
+        from pathlib import Path
+
+        db_path = (
+            Path(__file__).parent.parent
+            / "fullwave"
+            / "solver"
+            / "bins"
+            / "database"
+            / "relaxation_params_database_num_relax=2_20260113_0957.mat"
+        )
+        if not db_path.exists():
+            pytest.skip(f"LUT database not found at {db_path}")
+        return RelaxationParametersGenerator(n_relaxation_mechanisms=2, path_database=db_path)
+
+    def test_generate_identical_2d(self, generator):
+        """GPU generate must produce identical results to CPU generate for 2D arrays."""
+        rng = np.random.default_rng(123)
+        shape = (50, 60)
+        alpha_coeff = rng.uniform(generator.alpha_min, generator.alpha_max, size=shape)
+        alpha_power = rng.uniform(generator.power_min, generator.power_max, size=shape)
+
+        cpu_result = generator._generate_cpu(alpha_coeff, alpha_power)
+        gpu_result = generator._generate_gpu(cp.asarray(alpha_coeff), cp.asarray(alpha_power))
+
+        for key in cpu_result:
+            np.testing.assert_allclose(
+                _to_np(gpu_result[key]),
+                cpu_result[key],
+                rtol=1e-12,
+                err_msg=f"key={key} mismatch (2D)",
+            )
+
+    def test_generate_identical_3d(self, generator):
+        """GPU generate must produce identical results to CPU generate for 3D arrays."""
+        rng = np.random.default_rng(456)
+        shape = (20, 25, 15)
+        alpha_coeff = rng.uniform(generator.alpha_min, generator.alpha_max, size=shape)
+        alpha_power = rng.uniform(generator.power_min, generator.power_max, size=shape)
+
+        cpu_result = generator._generate_cpu(alpha_coeff, alpha_power)
+        gpu_result = generator._generate_gpu(cp.asarray(alpha_coeff), cp.asarray(alpha_power))
+
+        for key in cpu_result:
+            np.testing.assert_allclose(
+                _to_np(gpu_result[key]),
+                cpu_result[key],
+                rtol=1e-12,
+                err_msg=f"key={key} mismatch (3D)",
+            )
+
+    def test_generate_dispatches_correctly(self, generator):
+        """generate() dispatches to CPU for numpy, GPU for CuPy."""
+        rng = np.random.default_rng(789)
+        shape = (10, 10)
+        alpha_coeff = rng.uniform(generator.alpha_min, generator.alpha_max, size=shape)
+        alpha_power = rng.uniform(generator.power_min, generator.power_max, size=shape)
+
+        cpu_result = generator.generate(alpha_coeff, alpha_power)
+        gpu_result = generator.generate(cp.asarray(alpha_coeff), cp.asarray(alpha_power))
+
+        for key in cpu_result:
+            assert isinstance(cpu_result[key], np.ndarray), f"CPU result {key} should be numpy"
+            np.testing.assert_allclose(
+                _to_np(gpu_result[key]),
+                cpu_result[key],
+                rtol=1e-12,
+                err_msg=f"key={key} dispatch mismatch",
+            )
diff --git a/tests/test_sensor.py b/tests/test_sensor.py
index 283d360..42ac132 100644
--- a/tests/test_sensor.py
+++ b/tests/test_sensor.py
@@ -102,5 +102,72 @@ def test_coords_and_mask_raises():
 
 
 def test_no_input_raises():
-    with pytest.raises(ValueError, match="Either mask or coords"):
+    with pytest.raises(ValueError, match="Either mask"):
         Sensor()
+
+
+# --- Sparse-grid mode tests ---
+
+
+def test_sparse_grid_2d():
+    sensor = Sensor(mod_x=4, mod_y=4)
+    assert sensor.is_sparse_grid
+    assert sensor.mod_x == 4
+    assert sensor.mod_y == 4
+    assert sensor.mod_z == 0
+    assert not sensor.is_3d
+    assert sensor.n_sensors == 0
+    assert sensor.outcoords.shape == (0, 2)
+    assert sensor.grid_shape is None
+
+
+def test_sparse_grid_3d():
+    sensor = Sensor(mod_x=4, mod_y=4, mod_z=2)
+    assert sensor.is_sparse_grid
+    assert sensor.mod_z == 2
+    assert sensor.is_3d
+    assert sensor.outcoords.shape == (0, 3)
+
+
+def test_sparse_grid_validate_2d_ok():
+    sensor = Sensor(mod_x=4, mod_y=4)
+    sensor.validate((100, 200))  # 2D grid — should not raise
+
+
+def test_sparse_grid_validate_3d_missing_mod_z_raises():
+    sensor = Sensor(mod_x=4, mod_y=4)  # mod_z defaults to 0
+    with pytest.raises(ValueError, match="mod_z"):
+        sensor.validate((100, 200, 50))  # 3D grid
+
+
+def test_sparse_grid_validate_3d_ok():
+    sensor = Sensor(mod_x=4, mod_y=4, mod_z=4)
+    sensor.validate((100, 200, 50))  # should not raise
+
+
+def test_sparse_grid_missing_mod_y_raises():
+    with pytest.raises(ValueError, match="Both mod_x and mod_y"):
+        Sensor(mod_x=4)
+
+
+def test_sparse_grid_missing_mod_x_raises():
+    with pytest.raises(ValueError, match="Both mod_x and mod_y"):
+        Sensor(mod_y=4)
+
+
+def test_sparse_grid_with_mask_raises():
+    mask = np.array([[1, 0]])
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        Sensor(mask=mask, mod_x=4, mod_y=4)
+
+
+def test_sparse_grid_with_coords_raises():
+    coords = np.array([[0, 0]])
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        Sensor(coords=coords, grid_shape=(1, 2), mod_x=4, mod_y=4)
+
+
+def test_sparse_grid_str():
+    sensor = Sensor(mod_x=4, mod_y=4)
+    assert "sparse-grid" in str(sensor)
+    assert "mod_x=4" in str(sensor)
diff --git a/tests/test_source.py b/tests/test_source.py
index 39b4f74..6195378 100644
--- a/tests/test_source.py
+++ b/tests/test_source.py
@@ -111,7 +111,7 @@ def test_coords_and_mask_raises():
 def test_no_mask_or_coords_raises():
     p0 = np.array([[1.0]])
     with pytest.raises(
-        ValueError,  # noqa: PT011
+        ValueError,
     ):
         Source(p0)
 
@@ -172,7 +172,7 @@ def test_additive_only_with_coords():
 def test_both_p0_and_p0_additive_none_raises():
     """Source with both p0 and p0_additive None raises ValueError."""
     p_mask = np.array([[False, True]])
-    with pytest.raises(ValueError, match="At least one of p0 or p0_additive must be provided"):
+    with pytest.raises(ValueError, match="At least one of p0, p0_additive"):
         Source(None, p_mask)
 
 
@@ -244,3 +244,211 @@ def test_p0_additive_incoords_add_length_mismatch_raises():
             p0_additive=p0_add,
             coords_additive=incoords_add,
         )
+
+
+# --- Velocity source (u0/v0/w0) tests ---
+
+
+class TestVelocitySource:
+    """Tests for velocity source components (u0, v0, w0)."""
+
+    _grid_shape = (4, 4)
+    _nt = 5
+
+    def _p0(self, n: int = 1) -> np.ndarray:
+        return np.ones((n, self._nt), dtype=np.float64)
+
+    def _base_source(self, n: int = 1) -> Source:
+        """Minimal hard pressure source at (0,0)..(n-1,0)."""
+        coords = np.array([[i, 0] for i in range(n)])
+        return Source(self._p0(n), coords=coords, grid_shape=self._grid_shape)
+
+    # ---- construction with coords ----
+
+    def test_u0_with_coords(self):
+        """u0 with coords_u stores incoords_u and u0."""
+        coords_u = np.array([[1, 2]])
+        u0 = self._p0()
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=u0,
+            coords_u=coords_u,
+        )
+        assert src.n_sources_u == 1
+        np.testing.assert_array_equal(src.incoords_u, coords_u)
+        np.testing.assert_array_equal(src.u0, u0)
+
+    def test_u0_with_mask(self):
+        """u0 with mask_u is equivalent to coords extracted from mask."""
+        mask_u = np.zeros(self._grid_shape, dtype=bool)
+        mask_u[1, 2] = True
+        u0 = self._p0()
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=u0,
+            mask_u=mask_u,
+        )
+        expected_coords = np.argwhere(mask_u)
+        assert src.n_sources_u == 1
+        np.testing.assert_array_equal(src.incoords_u, expected_coords)
+
+    # ---- all three components ----
+
+    def test_all_three_velocity_components(self):
+        """All three u/v/w components can be specified simultaneously."""
+        coords_u = np.array([[0, 1]])
+        coords_v = np.array([[1, 0], [1, 1]])
+        coords_w = np.array([[2, 2], [3, 3], [0, 3]])
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=np.ones((1, self._nt)),
+            coords_u=coords_u,
+            v0=np.ones((2, self._nt)),
+            coords_v=coords_v,
+            w0=np.ones((3, self._nt)),
+            coords_w=coords_w,
+        )
+        assert src.n_sources_u == 1
+        assert src.n_sources_v == 2
+        assert src.n_sources_w == 3
+
+    # ---- partial (only u provided) ----
+
+    def test_only_u_provided(self):
+        """Only u component; v and w remain None with count 0."""
+        coords_u = np.array([[0, 1]])
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=self._p0(),
+            coords_u=coords_u,
+        )
+        assert src.n_sources_u == 1
+        assert src.n_sources_v == 0
+        assert src.n_sources_w == 0
+        assert src.v0 is None
+        assert src.w0 is None
+        assert src.incoords_v is None
+        assert src.incoords_w is None
+
+    # ---- error cases ----
+
+    def test_u0_without_coords_or_mask_raises(self):
+        """Providing u0 without coords_u or mask_u must raise ValueError."""
+        with pytest.raises(ValueError, match="neither coords_u nor mask_u"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                u0=self._p0(),
+            )
+
+    def test_coords_u_and_mask_u_raises(self):
+        """Providing both coords_u and mask_u must raise ValueError."""
+        mask_u = np.zeros(self._grid_shape, dtype=bool)
+        mask_u[0, 1] = True
+        coords_u = np.array([[0, 1]])
+        with pytest.raises(ValueError, match="mutually exclusive"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                u0=self._p0(),
+                coords_u=coords_u,
+                mask_u=mask_u,
+            )
+
+    def test_coords_u_without_u0_raises(self):
+        """Providing coords_u but no u0 must raise ValueError."""
+        with pytest.raises(ValueError, match="without u0"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                coords_u=np.array([[0, 1]]),
+            )
+
+    def test_u0_row_mismatch_raises(self):
+        """u0 row count not matching coords_u raises ValueError."""
+        coords_u = np.array([[0, 1], [1, 0]])  # 2 points
+        u0_wrong = self._p0(1)  # 1 row
+        with pytest.raises(ValueError, match="u0 has 1 rows"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                u0=u0_wrong,
+                coords_u=coords_u,
+            )
+
+    # ---- __str__ includes velocity info ----
+
+    def test_str_includes_velocity_component(self):
+        """__str__ reports velocity components when present."""
+        coords_u = np.array([[0, 1]])
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=self._p0(),
+            coords_u=coords_u,
+        )
+        assert "u0" in str(src)
+
+    # ---- combined with hard pressure source ----
+
+    def test_combined_pressure_and_velocity(self):
+        """Hard pressure source and velocity source can coexist."""
+        coords_p = np.array([[0, 0], [0, 1]])
+        coords_u = np.array([[1, 0]])
+        src = Source(
+            self._p0(2),
+            coords=coords_p,
+            grid_shape=self._grid_shape,
+            u0=self._p0(1),
+            coords_u=coords_u,
+        )
+        assert src.n_sources == 2
+        assert src.n_sources_u == 1
+
+    # ---- velocity-only (no hard pressure source) ----
+
+    def test_velocity_only_no_pressure_source(self):
+        """Source with only u0 and no p0 has n_sources==0 and empty p0."""
+        coords_u = np.array([[0, 1], [1, 0]])
+        src = Source(
+            grid_shape=self._grid_shape,
+            u0=self._p0(2),
+            coords_u=coords_u,
+        )
+        assert src.n_sources == 0
+        assert src.n_sources_u == 2
+        assert src.p0.shape == (0, self._nt)
+        assert src.p0_additive is None
+        assert src.incoords.shape == (0, 2)
+
+    def test_velocity_only_missing_grid_shape_raises(self):
+        """Velocity-only source without grid_shape raises ValueError."""
+        with pytest.raises(ValueError, match="grid_shape is required"):
+            Source(u0=self._p0(), coords_u=np.array([[0, 1]]))
+
+    def test_velocity_only_validate_passes(self):
+        """validate() accepts a velocity-only source."""
+        src = Source(
+            grid_shape=self._grid_shape,
+            u0=self._p0(),
+            coords_u=np.array([[0, 1]]),
+        )
+        src.validate(self._grid_shape)
+
+    def test_all_none_raises(self):
+        """No p0, no p0_additive, no velocity → ValueError."""
+        with pytest.raises(ValueError):
+            Source(grid_shape=self._grid_shape)
diff --git a/uv.lock b/uv.lock
index b9a0356..fca99ac 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,10 +735,9 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.3"
+version = "1.2.6.dev8"
 source = { editable = "." }
 dependencies = [
-    { name = "joblib" },
     { name = "matplotlib" },
     { name = "numba" },
     { name = "numexpr" },
@@ -780,7 +779,6 @@ dev = [
 requires-dist = [
     { name = "cupy-cuda12x", marker = "extra == 'examples'", specifier = ">=13.6.0" },
     { name = "ipykernel", marker = "extra == 'examples'", specifier = ">=6.28.0" },
-    { name = "joblib", specifier = ">=1.5.3" },
     { name = "jupyter", marker = "extra == 'examples'", specifier = ">=1.0.0" },
     { name = "line-profiler", marker = "extra == 'dev'", specifier = ">=5.0.1" },
     { name = "mach-beamform", marker = "extra == 'examples'", specifier = ">=0.0.4" },
@@ -1034,15 +1032,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
 ]
 
-[[package]]
-name = "joblib"
-version = "1.5.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
-]
-
 [[package]]
 name = "json5"
 version = "0.12.1"