From a9fbfc75e378e6161b12f038953cb4ac8d3f8542 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:32:45 -0500
Subject: [PATCH 01/31] Add support for velocity source components (u0, v0, w0)
 in Source class and update InputFileWriter tests

---
 .../simple_plane_wave_velocity_source.py      | 197 ++++++++++++++
 fullwave/solver/binary_manager.py             |   2 +-
 fullwave/solver/input_file_writer.py          |  17 ++
 fullwave/solver/pml_builder.py                |  44 +++-
 fullwave/source.py                            | 159 +++++++++++-
 tests/solver/test_input_file_writer.py        | 240 ++++++++++++++++++
 tests/test_source.py                          | 210 ++++++++++++++-
 7 files changed, 854 insertions(+), 15 deletions(-)
 create mode 100644 examples/simple_plane_wave/simple_plane_wave_velocity_source.py

diff --git a/examples/simple_plane_wave/simple_plane_wave_velocity_source.py b/examples/simple_plane_wave/simple_plane_wave_velocity_source.py
new file mode 100644
index 0000000..028960a
--- /dev/null
+++ b/examples/simple_plane_wave/simple_plane_wave_velocity_source.py
@@ -0,0 +1,197 @@
+"""Simple plane wave transmit example using a velocity (u0) source.
+
+A velocity source drives the particle-velocity equation directly, as opposed to
+the hard pressure source (p0) which drives the pressure equation.  For a plane
+wave propagating in the depth (x) direction the relevant component is u0.
+
+The acoustic impedance relation  p = rho · c · u  links the two formulations, so
+the velocity amplitude is scaled as
+
+    u_amp = p_amp / (ρ₀ · c₀)
+
+Everything else (grid, medium, sensor, solver) is identical to
+simple_plane_wave.py so the two examples can be run side-by-side for comparison.
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+
+import fullwave
+from fullwave.utils import plot_utils, signal_process
+from fullwave.utils.coordinates import map_to_coords
+
+
+def main() -> None:
+    """Run simple plane wave transmit example with a velocity source."""
+    logging.getLogger("__main__").setLevel(logging.INFO)
+
+    #
+    # --- working directory ---
+    #
+    work_dir = Path("./outputs/") / "simple_plane_wave_velocity_source"
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    #
+    # --- computational grid ---
+    #
+    domain_size = (3e-2, 2e-2)  # (depth, lateral) in metres
+    f0 = 3e6  # centre frequency [Hz]
+    c0 = 1540.0  # background sound speed [m/s]
+    rho0 = 1000.0  # background density [kg/m³]
+    duration = domain_size[0] / c0 * 2  # two-way travel time across depth
+    grid = fullwave.Grid(
+        domain_size=domain_size,
+        f0=f0,
+        duration=duration,
+        c0=c0,
+    )
+
+    #
+    # --- acoustic medium ---
+    #
+    sound_speed_map = c0 * np.ones((grid.nx, grid.ny))  # m/s
+    density_map = rho0 * np.ones((grid.nx, grid.ny))  # kg/m³
+    alpha_coeff_map = 0.5 * np.ones((grid.nx, grid.ny))  # dB/(MHz^y cm)
+    alpha_power_map = 1.0 * np.ones((grid.nx, grid.ny))  # power-law exponent
+    beta_map = 0.0 * np.ones((grid.nx, grid.ny))  # nonlinearity coefficient
+
+    # embed a scatterer with different acoustic properties
+    obj_x_start = grid.nx // 3
+    obj_x_end = 2 * grid.nx // 3
+    obj_y_start = grid.ny // 3
+    obj_y_end = 2 * grid.ny // 3
+
+    sound_speed_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 1600
+    density_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 1100
+    alpha_coeff_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 0.75
+    alpha_power_map[obj_x_start:obj_x_end, obj_y_start:obj_y_end] = 1.1
+
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=sound_speed_map,
+        density=density_map,
+        alpha_coeff=alpha_coeff_map,
+        alpha_power=alpha_power_map,
+        beta=beta_map,
+    )
+    medium.plot(export_path=work_dir / "medium.png")
+
+    #
+    # --- velocity source ---
+    #
+    # Use the u-component (depth / x direction) of the particle velocity as the
+    # source term.  The source is a small `source_width_px_x` x `source_width_px_y`
+    # patch at the centre of the grid (see `u_mask` below), not a full-width layer.
+    #
+    # Velocity amplitude is derived from the target pressure amplitude via the
+    # plane-wave impedance relation:  u_amp = p_amp / (ρ₀ · c₀)
+    #
+    p_amp = 1e5  # target pressure amplitude [Pa]
+    u_amp = p_amp / (rho0 * c0)  # corresponding velocity amplitude [m/s]
+
+    ncycles = 2
+    drop_off = 2
+    # element_thickness_px = 3
+
+    # Build the coordinate array for the velocity source layer
+    # small velocity source at the center of the domain
+
+    source_width_px_x = 2
+    source_width_px_y = 2
+    u_mask = np.zeros((grid.nx, grid.ny), dtype=bool)
+    u_mask[
+        grid.nx // 2 - source_width_px_x // 2 : grid.nx // 2 + source_width_px_x // 2,
+        grid.ny // 2 - source_width_px_y // 2 : grid.ny // 2 + source_width_px_y // 2,
+    ] = True
+
+    coords_u = map_to_coords(u_mask)  # shape [n_sources_u, 2]
+
+    # Build the u0 signal matrix [n_sources_u, nt]
+    u0 = np.zeros((coords_u.shape[0], grid.nt))
+
+    u0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+        nt=grid.nt,
+        f0=f0,
+        duration=duration,
+        ncycles=ncycles,
+        drop_off=drop_off,
+        p0=u_amp,  # amplitude in m/s
+    )
+    u0[:, :] = u0_vec
+
+    # for i_layer in range(element_thickness_px):
+    #     u0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+    #         nt=grid.nt,
+    #         f0=f0,
+    #         duration=duration,
+    #         ncycles=ncycles,
+    #         drop_off=drop_off,
+    #         p0=u_amp,  # amplitude in m/s
+    #         i_layer=i_layer,
+    #         dt_for_layer_delay=grid.dt,
+    #         cfl_for_layer_delay=grid.cfl,
+    #     )
+    #     n_y = coords_u.shape[0] // element_thickness_px
+    #     u0[n_y * i_layer : n_y * (i_layer + 1), :] = u0_vec
+
+    source = fullwave.Source(
+        grid_shape=grid.shape,
+        u0=u0,
+        coords_u=coords_u,
+    )
+
+    #
+    # --- sensor ---
+    #
+    sensor_mask = np.ones((grid.nx, grid.ny), dtype=bool)
+    sensor = fullwave.Sensor(mask=sensor_mask, sampling_modulus_time=7)
+
+    #
+    # --- solver ---
+    #
+    fw_solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        run_on_memory=False,
+        use_exponential_attenuation=True,
+        save_gpu_memory=True,
+        # `path_fullwave_simulation_bin` is deliberately not set: the previous value
+        # was an absolute path in the author's home directory and cannot exist on
+        # other machines.  NOTE(review): assumes Solver falls back to its own binary.
+    )
+    sensor_output = fw_solver.run()
+
+    #
+    # --- visualisation ---
+    #
+    propagation_map = signal_process.reshape_whole_sensor_to_nt_nx_ny(
+        sensor_output,
+        grid,
+    )
+    p_max_plot = np.abs(propagation_map).max().item() / 4
+    time_step = propagation_map.shape[0] // 3
+    plot_utils.plot_array(
+        propagation_map[time_step, :, :],
+        aspect=propagation_map.shape[2] / propagation_map.shape[1],
+        export_path=work_dir / "wave_propagation_snapshot.png",
+        vmax=p_max_plot,
+        vmin=-p_max_plot,
+    )
+    plot_utils.plot_wave_propagation_with_map(
+        propagation_map=propagation_map,
+        c_map=medium.sound_speed,
+        rho_map=medium.density,
+        export_name=work_dir / "wave_propagation_animation.mp4",
+        vmax=p_max_plot,
+        vmin=-p_max_plot,
+        figsize=(4, 6),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fullwave/solver/binary_manager.py b/fullwave/solver/binary_manager.py
index 3c8b755..1b8a789 100644
--- a/fullwave/solver/binary_manager.py
+++ b/fullwave/solver/binary_manager.py
@@ -28,7 +28,7 @@
 
 # Pinned release tag for the solver binaries.
 # Update this only when new binaries are uploaded to a GitHub release.
-BINARY_RELEASE_TAG = "fullwave_bin_v1.1"
+BINARY_RELEASE_TAG = "fullwave_bin_v1.2"
 
 
 def _download_url(filename: str, tag: str) -> str:
diff --git a/fullwave/solver/input_file_writer.py b/fullwave/solver/input_file_writer.py
index a67cba3..fc67888 100644
--- a/fullwave/solver/input_file_writer.py
+++ b/fullwave/solver/input_file_writer.py
@@ -169,6 +169,19 @@ def run(
             )
         if incoords_add is not None:
             self._queue_coords_write(simulation_dir / "icc_add.dat", incoords_add)
+        for _vel_suffix, _vel_attr in (("u", "u0"), ("v", "v0"), ("w", "w0")):
+            _signal_vel = getattr(self.source, _vel_attr, None)
+            _incoords_vel = getattr(self.source, f"incoords_{_vel_suffix}", None)
+            if _signal_vel is not None:
+                self._queue_ic_write(
+                    simulation_dir / f"icmat_{_vel_suffix}.dat",
+                    np.transpose(_signal_vel),
+                )
+            if _incoords_vel is not None:
+                self._queue_coords_write(
+                    simulation_dir / f"icc_{_vel_suffix}.dat",
+                    _incoords_vel,
+                )
         self._queue_coords_write(simulation_dir / "icc.dat", self.source.incoords)
         self._copy_simulation_bin_file(simulation_dir)
 
@@ -1047,6 +1060,10 @@ def _save_coords_params(self, simulation_dir: Path) -> None:
         n_sources_add = getattr(self.source, "n_sources_add", 0)
         if n_sources_add > 0:
             var_list.append(("ncoords_add", n_sources_add))
+        for _vel_suffix in ("u", "v", "w"):
+            _n_vel = getattr(self.source, f"n_sources_{_vel_suffix}", 0)
+            if _n_vel > 0:
+                var_list.append((f"ncoords_{_vel_suffix}", _n_vel))
         if self.is_3d:
             var_list.extend(
                 [
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index bf8efc7..9e3899c 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -322,12 +322,33 @@ def __init__(  # noqa: PLR0915
             if getattr(self.source_org, "incoords_add", None) is not None
             else None
         )
+        incoords_u_ext = (
+            self.source_org.incoords_u + self.num_boundary_points
+            if getattr(self.source_org, "incoords_u", None) is not None
+            else None
+        )
+        incoords_v_ext = (
+            self.source_org.incoords_v + self.num_boundary_points
+            if getattr(self.source_org, "incoords_v", None) is not None
+            else None
+        )
+        incoords_w_ext = (
+            self.source_org.incoords_w + self.num_boundary_points
+            if getattr(self.source_org, "incoords_w", None) is not None
+            else None
+        )
         self.extended_source = fullwave.Source(
             p0=self.source_org.p0,
             coords=self.source_org.incoords + self.num_boundary_points,
             grid_shape=extended_grid_shape,
             p0_additive=self.source_org.p0_additive,
             coords_additive=incoords_add_ext,
+            u0=getattr(self.source_org, "u0", None),
+            coords_u=incoords_u_ext,
+            v0=getattr(self.source_org, "v0", None),
+            coords_v=incoords_v_ext,
+            w0=getattr(self.source_org, "w0", None),
+            coords_w=incoords_w_ext,
         )
         logger.debug("building extended source for pml...done")
 
@@ -1434,7 +1455,7 @@ def plot(
 class PMLBuilderExponentialAttenuation(PMLBuilder):
     """A class to set up PML for exponential attenuation media."""
 
-    def __init__(
+    def __init__(  # noqa: PLR0915
         self,
         grid: fullwave.Grid,
         medium: fullwave.Medium,
@@ -1594,12 +1615,33 @@ def __init__(
             if getattr(self.source_org, "incoords_add", None) is not None
             else None
         )
+        incoords_u_ext = (
+            self.source_org.incoords_u + self.num_boundary_points
+            if getattr(self.source_org, "incoords_u", None) is not None
+            else None
+        )
+        incoords_v_ext = (
+            self.source_org.incoords_v + self.num_boundary_points
+            if getattr(self.source_org, "incoords_v", None) is not None
+            else None
+        )
+        incoords_w_ext = (
+            self.source_org.incoords_w + self.num_boundary_points
+            if getattr(self.source_org, "incoords_w", None) is not None
+            else None
+        )
         self.extended_source = fullwave.Source(
             p0=self.source_org.p0,
             coords=self.source_org.incoords + self.num_boundary_points,
             grid_shape=extended_grid_shape,
             p0_additive=self.source_org.p0_additive,
             coords_additive=incoords_add_ext,
+            u0=getattr(self.source_org, "u0", None),
+            coords_u=incoords_u_ext,
+            v0=getattr(self.source_org, "v0", None),
+            coords_v=incoords_v_ext,
+            w0=getattr(self.source_org, "w0", None),
+            coords_w=incoords_w_ext,
         )
         extended_sensor_grid_shape = tuple(
             s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
diff --git a/fullwave/source.py b/fullwave/source.py
index bbe4dba..0856850 100644
--- a/fullwave/source.py
+++ b/fullwave/source.py
@@ -23,6 +23,12 @@ class Source:
     grid_shape: tuple[int, ...]
     p0_additive: NDArray[np.float64] | None = None
     incoords_add: NDArray[np.int64] | None = None
+    u0: NDArray[np.float64] | None = None
+    incoords_u: NDArray[np.int64] | None = None
+    v0: NDArray[np.float64] | None = None
+    incoords_v: NDArray[np.int64] | None = None
+    w0: NDArray[np.float64] | None = None
+    incoords_w: NDArray[np.int64] | None = None
 
     def __init__(  # noqa: C901 PLR0912 PLR0915
         self,
@@ -37,6 +43,16 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
         coords_additive: NDArray[np.int64] | None = None,
         # ---
         grid_shape: tuple[int, ...] | None = None,
+        # --- velocity source components ---
+        u0: NDArray[np.float64] | None = None,
+        coords_u: NDArray[np.int64] | None = None,
+        mask_u: NDArray[np.bool] | None = None,
+        v0: NDArray[np.float64] | None = None,
+        coords_v: NDArray[np.int64] | None = None,
+        mask_v: NDArray[np.bool] | None = None,
+        w0: NDArray[np.float64] | None = None,
+        coords_w: NDArray[np.int64] | None = None,
+        mask_w: NDArray[np.bool] | None = None,
     ) -> None:
         """Source class for Fullwave.
 
@@ -44,8 +60,9 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
         ----------
         p0 : NDArray[np.float64] | None
             Time-varying pressure at each source position; shape [n_sources, nt].
-            If None, p0_additive must be provided (additive-only / soft initial condition):
-            icmat is written as zeros and only the additive term drives the source.
+            If None, either p0_additive or a velocity component (u0/v0/w0) must be provided.
+            When omitted with a velocity-only source, the pressure node list is empty
+            and icmat is written as a zero-row matrix.
         mask : NDArray[np.bool] | None
             binary matrix specifying the positions of the time varying pressure source distribution
             shape: [nx, ny] for 2D, [nx, ny, nz] for 3D.
@@ -54,7 +71,7 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
             Coordinate array of source positions (hard source); shape [n_sources, ndim].
             Must be provided together with grid_shape.
         grid_shape : tuple[int, ...] | None
-            Shape of the computational grid. Required when using coords input.
+            Shape of the computational grid. Required when using coords or velocity-only input.
         p0_additive : NDArray[np.float64] | None
             Optional additive (soft) source term; shape [n_sources_add, nt].
             When provided, positions come from coords_additive or mask_additive,
@@ -66,15 +83,36 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
             Coordinates for the additive source; shape [n_sources_add, ndim].
             If None and p0_additive is provided, defaults to primary incoords.
             Mutually exclusive with mask_additive.
+        u0 : NDArray[np.float64] | None
+            Time-varying velocity in the depth (x) direction; shape [n_sources_u, nt].
+        coords_u : NDArray[np.int64] | None
+            Coordinates for the u-velocity source; shape [n_sources_u, ndim].
+            Mutually exclusive with mask_u.
+        mask_u : NDArray[np.bool] | None
+            Boolean mask for u-velocity source positions. Mutually exclusive with coords_u.
+        v0 : NDArray[np.float64] | None
+            Time-varying velocity in the lateral (y) direction; shape [n_sources_v, nt].
+        coords_v : NDArray[np.int64] | None
+            Coordinates for the v-velocity source; shape [n_sources_v, ndim].
+            Mutually exclusive with mask_v.
+        mask_v : NDArray[np.bool] | None
+            Boolean mask for v-velocity source positions. Mutually exclusive with coords_v.
+        w0 : NDArray[np.float64] | None
+            Time-varying velocity in the elevational (z) direction; shape [n_sources_w, nt].
+        coords_w : NDArray[np.int64] | None
+            Coordinates for the w-velocity source; shape [n_sources_w, ndim].
+            Mutually exclusive with mask_w.
+        mask_w : NDArray[np.bool] | None
+            Boolean mask for w-velocity source positions. Mutually exclusive with coords_w.
 
         Raises
         ------
         ValueError
-            If grid_shape is missing when using coords or additive-only.
+            If grid_shape is missing when using coords, additive-only, or velocity-only.
             If both mask and coords, or both coords_additive and mask_additive, are provided.
-            If primary source positions cannot be resolved (need coords+grid_shape,
-            or mask, or additive-only args).
-            If both p0 and p0_additive are None.
+            If primary source positions cannot be resolved (need coords+grid_shape, mask,
+            additive-only args, or at least one velocity component with grid_shape).
+            If all of p0, p0_additive, u0, v0, and w0 are None.
             If p0 / p0_additive row counts do not match their coordinate arrays.
 
         """
@@ -110,17 +148,28 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
                 mask_add = np.atleast_2d(mask_additive)
                 self.grid_shape = mask_add.shape
                 self.incoords = map_to_coords(mask_add)
+        elif u0 is not None or v0 is not None or w0 is not None:
+            # Velocity-only: no hard pressure nodes; pressure arrays will be empty
+            if grid_shape is None:
+                error_msg = "grid_shape is required for velocity-only source"
+                raise ValueError(error_msg)
+            self.grid_shape = tuple(grid_shape)
+            self.incoords = np.empty((0, len(self.grid_shape)), dtype=np.int64)
         else:
             error_msg = (
                 "Provide (coords + grid_shape), or mask, or"
-                " (p0_additive + (coords_additive or mask_additive))"
+                " (p0_additive + (coords_additive or mask_additive)), or"
+                " a velocity component (u0/v0/w0) with grid_shape"
             )
             raise ValueError(
                 error_msg,
             )
 
-        if p0 is None and p0_additive is None:
-            error_msg = "At least one of p0 or p0_additive must be provided"
+        if p0 is None and p0_additive is None and u0 is None and v0 is None and w0 is None:
+            error_msg = (
+                "At least one of p0, p0_additive, or a velocity component"
+                " (u0/v0/w0) must be provided"
+            )
             raise ValueError(error_msg)
 
         # --- Resolve additive coords (for later use) ---
@@ -148,7 +197,7 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
                 )
                 raise ValueError(error_msg)
             self.p0_additive = np.atleast_2d(p0_additive) if p0_additive is not None else None
-        else:
+        elif p0_additive is not None:
             # Additive-only: icmat written as zeros
             p0_add = np.atleast_2d(p0_additive)
             n_add = _coords_add.shape[0] if _coords_add is not None else self.incoords.shape[0]
@@ -166,6 +215,12 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
                     dtype=np.float64,
                 )
             self.p0_additive = p0_add
+        else:
+            # Velocity-only: no pressure nodes; derive nt from the first velocity signal
+            _first_vel = next(s for s in (u0, v0, w0) if s is not None)
+            nt_vel = np.atleast_2d(_first_vel).shape[1]
+            self.p0 = np.zeros((0, nt_vel), dtype=np.float64)
+            self.p0_additive = None
 
         # --- Set incoords_add for writer/binary ---
         if self.p0_additive is not None:
@@ -173,6 +228,11 @@ def __init__(  # noqa: C901 PLR0912 PLR0915
         else:
             self.incoords_add = None
 
+        # --- Resolve velocity source components ---
+        self.incoords_u, self.u0 = self._resolve_velocity_component("u", u0, coords_u, mask_u)
+        self.incoords_v, self.v0 = self._resolve_velocity_component("v", v0, coords_v, mask_v)
+        self.incoords_w, self.w0 = self._resolve_velocity_component("w", w0, coords_w, mask_w)
+
         self.is_3d = len(self.grid_shape) == 3
         super().__init__()
         self.__post_init__()
@@ -202,11 +262,64 @@ def __post_init__(self) -> None:
                 )
                 raise ValueError(error_msg)
 
+    @staticmethod
+    def _resolve_velocity_component(
+        name: str,
+        signal: NDArray | None,
+        coords: NDArray | None,
+        mask: NDArray | None,
+    ) -> tuple[NDArray | None, NDArray | None]:
+        """Turn one velocity component's inputs into (incoords, signal_arr).
+
+        Returns
+        -------
+        tuple[NDArray | None, NDArray | None]
+            (None, None) when no signal is given; otherwise the coordinate array
+            and the signal promoted to at least 2-D.
+
+        Raises
+        ------
+        ValueError
+            If positions (coords/mask) are given without a signal.
+            If a signal comes with both coords and mask, or with neither.
+            If the signal's row count differs from the number of positions.
+
+        """
+        if signal is None:
+            if coords is None and mask is None:
+                return None, None
+            error_msg = f"coords_{name}/mask_{name} provided without {name}0 signal"
+            raise ValueError(error_msg)
+        if coords is None:
+            if mask is None:
+                error_msg = f"{name}0 signal provided but neither coords_{name} nor mask_{name} given"
+                raise ValueError(error_msg)
+            positions = map_to_coords(np.atleast_2d(mask))
+        elif mask is None:
+            positions = np.atleast_2d(coords).astype(np.int64, copy=False)
+        else:
+            error_msg = f"coords_{name} and mask_{name} are mutually exclusive"
+            raise ValueError(error_msg)
+        values = np.atleast_2d(signal)
+        if len(values) != len(positions):
+            error_msg = (
+                f"{name}0 has {len(values)} rows but "
+                f"coords_{name}/mask_{name} has {len(positions)} (must match)"
+            )
+            raise ValueError(error_msg)
+        return positions, values
+
     def validate(self, grid_shape: NDArray[np.int64] | tuple) -> None:
         """Check if the source coordinates are consistent with the grid shape."""
         grid_shape = tuple(grid_shape) if isinstance(grid_shape, np.ndarray) else grid_shape
         assert self.grid_shape == grid_shape, f"{self.grid_shape} != {grid_shape}"
-        assert self.n_sources > 0 or self.n_sources_add > 0, "No active source found."
+        assert (
+            self.n_sources > 0
+            or self.n_sources_add > 0
+            or self.n_sources_u > 0
+            or self.n_sources_v > 0
+            or self.n_sources_w > 0
+        ), "No active source found."
         logger.debug("Source validated against grid shape.")
 
     @property
@@ -246,6 +359,21 @@ def n_sources_add(self) -> int:
             return 0
         return self.incoords_add.shape[0]
 
+    @property
+    def n_sources_u(self) -> int:
+        """Count the velocity-u (depth) source positions; 0 when the component is unset."""
+        return 0 if self.incoords_u is None else self.incoords_u.shape[0]
+
+    @property
+    def n_sources_v(self) -> int:
+        """Count the velocity-v (lateral) source positions; 0 when the component is unset."""
+        return 0 if self.incoords_v is None else self.incoords_v.shape[0]
+
+    @property
+    def n_sources_w(self) -> int:
+        """Count the velocity-w (elevational) source positions; 0 when the component is unset."""
+        return 0 if self.incoords_w is None else self.incoords_w.shape[0]
+
     def plot(
         self,
         export_path: Path | str | None = Path("./temp/temp.png"),
@@ -312,6 +440,13 @@ def __str__(self) -> str:
             lines.append(f"  p0_additive shape: {self.p0_additive.shape}")
         if self.incoords_add is not None:
             lines.append(f"  incoords_add: {self.incoords_add.shape[0]} points")
+        for comp, sig, n in (
+            ("u", self.u0, self.n_sources_u),
+            ("v", self.v0, self.n_sources_v),
+            ("w", self.w0, self.n_sources_w),
+        ):
+            if sig is not None:
+                lines.append(f"  {comp}0 shape: {sig.shape} ({n} points)")
         return "\n".join(lines) + "\n"
 
     def __repr__(self) -> str:
diff --git a/tests/solver/test_input_file_writer.py b/tests/solver/test_input_file_writer.py
index 5da42ad..80f8901 100644
--- a/tests/solver/test_input_file_writer.py
+++ b/tests/solver/test_input_file_writer.py
@@ -242,6 +242,246 @@ def test_run_with_incoords_add_writes_icc_add_and_ncoords_add(
     assert ncoords_add_val == 2
 
 
+def _source_with_velocity(**vel_attrs):
+    """Build a stand-in source namespace whose default attributes can be overridden."""
+    defaults = {
+        "icmat": np.array([[1, 2], [3, 4]], dtype=np.float64),
+        "incoords": np.array([[1, 2], [3, 4]], dtype=np.int64),
+        "n_sources": 2,
+        "p0_additive": None,
+        "incoords_add": None,
+        "n_sources_add": 0,
+        "u0": None,
+        "incoords_u": None,
+        "n_sources_u": 0,
+        "v0": None,
+        "incoords_v": None,
+        "n_sources_v": 0,
+        "w0": None,
+        "incoords_w": None,
+        "n_sources_w": 0,
+    }
+    # Caller-supplied values replace the defaults; unknown keys become extra attributes.
+    defaults.update(vel_attrs)
+    return SimpleNamespace(**defaults)
+
+
+def _run_writer(work_dir, bin_file, source, monkeypatch, sim_dir_name="sim_vel"):
+    """Run InputFileWriter for `source` with checks stubbed out and return the run's Path."""
+    grid, medium, _, sensor = create_dummy_objects()  # the helper's own source is unused
+    monkeypatch.setattr(check_functions, "check_path_exists", lambda x: None)  # noqa: ARG005
+    monkeypatch.setattr(check_functions, "check_instance", lambda inst, cls: None)  # noqa: ARG005
+    writer = InputFileWriter(
+        work_dir,
+        grid,
+        medium,
+        source,
+        sensor,
+        path_fullwave_simulation_bin=bin_file,
+        validate_input=False,
+    )
+    return Path(writer.run(sim_dir_name, is_static_map=False, recalculate_pml=True))
+
+
+class TestVelocitySourceWriter:
+    """Tests that InputFileWriter emits correct files for velocity source components."""
+
+    def test_u0_writes_icmat_u_and_icc_u(self, work_and_bin, monkeypatch):
+        """A u-velocity source makes the writer emit icmat_u.dat and icc_u.dat."""
+        work_dir, bin_file = work_and_bin
+        signal = np.array([[0.5, 1.0]], dtype=np.float64)
+        points = np.array([[2, 3]], dtype=np.int64)
+        source = _source_with_velocity(u0=signal, incoords_u=points, n_sources_u=1)
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        for filename in ("icmat_u.dat", "icc_u.dat"):
+            produced = sim_path / filename
+            assert produced.exists(), f"{filename} not created"
+
+    def test_u0_content(self, work_and_bin, monkeypatch):
+        """icmat_u.dat contains u0 data; icc_u.dat contains incoords_u."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[0.5, 1.0]], dtype=np.float64)  # one row: transposing keeps the flat order
+        incoords_u = np.array([[2, 3]], dtype=np.int64)
+        source = _source_with_velocity(u0=u0, incoords_u=incoords_u, n_sources_u=1)
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        # icmat_u.dat is written from np.transpose(u0); it is read back here as float32
+        icmat_u_data = np.fromfile(sim_path / "icmat_u.dat", dtype=np.float32)
+        np.testing.assert_array_almost_equal(icmat_u_data, u0.astype(np.float32).ravel())
+
+        # icc_u.dat is read back as int32 and must equal the flattened coordinates
+        icc_u_data = np.fromfile(sim_path / "icc_u.dat", dtype=np.int32)
+        np.testing.assert_array_equal(icc_u_data, incoords_u.ravel())
+
+    def test_ncoords_u_dat_written_with_correct_count(self, work_and_bin, monkeypatch):
+        """ncoords_u.dat holds exactly one int32 equal to the number of u sources."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[0.5, 1.0], [0.2, 0.3]], dtype=np.float64)
+        incoords_u = np.array([[0, 1], [1, 0]], dtype=np.int64)
+        source = _source_with_velocity(u0=u0, incoords_u=incoords_u, n_sources_u=2)
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        ncoords_u_path = sim_path / "ncoords_u.dat"
+        assert ncoords_u_path.exists(), "ncoords_u.dat not created"
+        val = np.fromfile(ncoords_u_path, dtype=np.int32)
+        assert val.tolist() == [2]  # int(val) on a 1-D array is deprecated since NumPy 1.25
+
+    def test_all_three_velocity_components_write_files(self, work_and_bin, monkeypatch):
+        """Supplying u, v and w together yields icmat/icc/ncoords files for each."""
+        work_dir, bin_file = work_and_bin
+        components = {
+            "u": ([[1.0, 2.0]], [[0, 1]]),
+            "v": ([[3.0, 4.0]], [[1, 0]]),
+            "w": ([[5.0, 6.0]], [[0, 0]]),
+        }
+        vel_attrs = {}
+        for axis, (signal, point) in components.items():
+            vel_attrs[f"{axis}0"] = np.array(signal, dtype=np.float64)
+            vel_attrs[f"incoords_{axis}"] = np.array(point, dtype=np.int64)
+            vel_attrs[f"n_sources_{axis}"] = 1
+        source = _source_with_velocity(**vel_attrs)
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        expected = [
+            f"{stem}_{axis}.dat"
+            for axis in components
+            for stem in ("icmat", "icc", "ncoords")
+        ]
+        for filename in expected:
+            assert (sim_path / filename).exists(), f"{filename} missing"
+
+    def test_no_velocity_source_no_velocity_files(self, work_and_bin, monkeypatch):
+        """A source without velocity components leaves no u/v/w files behind."""
+        work_dir, bin_file = work_and_bin
+        _grid, _medium, source, _sensor = create_dummy_objects()
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        for axis in ("u", "v", "w"):
+            for stem in ("icmat", "icc", "ncoords"):
+                unexpected = sim_path / f"{stem}_{axis}.dat"
+                assert not unexpected.exists()
+
+    def test_only_v_component_writes_only_v_files(self, work_and_bin, monkeypatch):
+        """With only v0 supplied, the v files exist and no u or w files are produced."""
+        work_dir, bin_file = work_and_bin
+        source = _source_with_velocity(
+            v0=np.array([[7.0, 8.0]], dtype=np.float64),
+            incoords_v=np.array([[3, 3]], dtype=np.int64),
+            n_sources_v=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        stems = ("icmat", "icc", "ncoords")
+
+        # The supplied component must be written in full ...
+        for stem in stems:
+            assert (sim_path / f"{stem}_v.dat").exists()
+
+        # ... and the components that were not supplied must leave nothing behind.
+        for axis in ("u", "w"):
+            for stem in stems:
+                assert not (sim_path / f"{stem}_{axis}.dat").exists()
+
+    def test_velocity_only_no_pressure_files_are_empty(self, work_and_bin, monkeypatch):
+        """Velocity-only source (n_sources=0) writes ncoords=0 and the u-component files."""
+        work_dir, bin_file = work_and_bin
+        u0 = np.array([[1.0, 2.0]], dtype=np.float64)
+        source = _source_with_velocity(
+            # override pressure fields to match a velocity-only source
+            icmat=np.zeros((0, 2), dtype=np.float64),
+            incoords=np.empty((0, 2), dtype=np.int64),
+            n_sources=0,
+            u0=u0,
+            incoords_u=np.array([[0, 1]], dtype=np.int64),
+            n_sources_u=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        # ncoords.dat must hold the single value 0 (int() on a 1-D array is deprecated)
+        ncoords_val = np.fromfile(sim_path / "ncoords.dat", dtype=np.int32)
+        assert ncoords_val.tolist() == [0]
+
+        # all u-component files must exist, and ncoords_u must report exactly one source
+        for name in ("icmat_u.dat", "icc_u.dat", "ncoords_u.dat"):
+            assert (sim_path / name).exists(), f"{name} missing"
+        ncoords_u_val = np.fromfile(sim_path / "ncoords_u.dat", dtype=np.int32)
+        assert ncoords_u_val.tolist() == [1]
+
+    def test_all_five_components_written_and_content_correct(
+        self,
+        work_and_bin,
+        monkeypatch,
+    ):
+        """p, additive, u, v, and w all set → all files written with correct binary content."""
+        work_dir, bin_file = work_and_bin
+
+        p0 = np.array([[1.0, 2.0]], dtype=np.float64)
+        p0_additive = np.array([[0.1, 0.2]], dtype=np.float64)
+        u0 = np.array([[3.0, 4.0]], dtype=np.float64)
+        v0 = np.array([[5.0, 6.0]], dtype=np.float64)
+        w0 = np.array([[7.0, 8.0]], dtype=np.float64)
+
+        incoords = np.array([[0, 0]], dtype=np.int64)
+        incoords_add = np.array([[1, 0]], dtype=np.int64)
+        incoords_u = np.array([[0, 1]], dtype=np.int64)
+        incoords_v = np.array([[1, 1]], dtype=np.int64)
+        incoords_w = np.array([[2, 0]], dtype=np.int64)
+
+        source = SimpleNamespace(
+            icmat=p0,
+            incoords=incoords,
+            n_sources=1,
+            p0_additive=p0_additive,
+            incoords_add=incoords_add,
+            n_sources_add=1,
+            u0=u0,
+            incoords_u=incoords_u,
+            n_sources_u=1,
+            v0=v0,
+            incoords_v=incoords_v,
+            n_sources_v=1,
+            w0=w0,
+            incoords_w=incoords_w,
+            n_sources_w=1,
+        )
+
+        sim_path = _run_writer(work_dir, bin_file, source, monkeypatch)
+
+        # --- pressure (hard) ---
+        icmat_data = np.fromfile(sim_path / "icmat.dat", dtype=np.float32)
+        np.testing.assert_array_almost_equal(icmat_data, p0.astype(np.float32).ravel())
+        icc_data = np.fromfile(sim_path / "icc.dat", dtype=np.int32)
+        np.testing.assert_array_equal(icc_data, incoords.ravel())
+
+        # --- additive (soft) ---
+        icmat_add_data = np.fromfile(sim_path / "icmat_add.dat", dtype=np.float32)
+        np.testing.assert_array_almost_equal(icmat_add_data, p0_additive.astype(np.float32).ravel())
+        icc_add_data = np.fromfile(sim_path / "icc_add.dat", dtype=np.int32)
+        np.testing.assert_array_equal(icc_add_data, incoords_add.ravel())
+        ncoords_add_val = np.fromfile(sim_path / "ncoords_add.dat", dtype=np.int32)
+        assert ncoords_add_val.tolist() == [1]
+
+        # --- velocity u, v, w ---
+        for suffix, sig, icc in (
+            ("u", u0, incoords_u),
+            ("v", v0, incoords_v),
+            ("w", w0, incoords_w),
+        ):
+            icmat_vel = np.fromfile(sim_path / f"icmat_{suffix}.dat", dtype=np.float32)
+            np.testing.assert_array_almost_equal(icmat_vel, sig.astype(np.float32).ravel())
+            icc_vel = np.fromfile(sim_path / f"icc_{suffix}.dat", dtype=np.int32)
+            np.testing.assert_array_equal(icc_vel, icc.ravel())
+            ncoords_vel = np.fromfile(sim_path / f"ncoords_{suffix}.dat", dtype=np.int32)
+            assert ncoords_vel.tolist() == [1]
+
+
 def _dc_map_original(c_map: np.ndarray) -> np.ndarray:
     """Original _set_dc_map logic before in-place optimization."""
     return matlab_round(c_map) - matlab_round(c_map.min()) + 1
diff --git a/tests/test_source.py b/tests/test_source.py
index 39b4f74..4ae79dd 100644
--- a/tests/test_source.py
+++ b/tests/test_source.py
@@ -172,7 +172,7 @@ def test_additive_only_with_coords():
 def test_both_p0_and_p0_additive_none_raises():
     """Source with both p0 and p0_additive None raises ValueError."""
     p_mask = np.array([[False, True]])
-    with pytest.raises(ValueError, match="At least one of p0 or p0_additive must be provided"):
+    with pytest.raises(ValueError, match="At least one of p0, p0_additive"):
         Source(None, p_mask)
 
 
@@ -244,3 +244,211 @@ def test_p0_additive_incoords_add_length_mismatch_raises():
             p0_additive=p0_add,
             coords_additive=incoords_add,
         )
+
+
+# --- Velocity source (u0/v0/w0) tests ---
+
+
+class TestVelocitySource:
+    """Tests for velocity source components (u0, v0, w0)."""
+
+    _grid_shape = (4, 4)
+    _nt = 5
+
+    def _p0(self, n: int = 1) -> np.ndarray:
+        return np.ones((n, self._nt), dtype=np.float64)
+
+    def _base_source(self, n: int = 1) -> Source:
+        """Minimal hard pressure source at (0,0)..(n-1,0)."""
+        coords = np.array([[i, 0] for i in range(n)])
+        return Source(self._p0(n), coords=coords, grid_shape=self._grid_shape)
+
+    # ---- construction with coords ----
+
+    def test_u0_with_coords(self):
+        """u0 with coords_u stores incoords_u and u0."""
+        coords_u = np.array([[1, 2]])
+        u0 = self._p0()
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=u0,
+            coords_u=coords_u,
+        )
+        assert src.n_sources_u == 1
+        np.testing.assert_array_equal(src.incoords_u, coords_u)
+        np.testing.assert_array_equal(src.u0, u0)
+
+    def test_u0_with_mask(self):
+        """u0 with mask_u is equivalent to coords extracted from mask."""
+        mask_u = np.zeros(self._grid_shape, dtype=bool)
+        mask_u[1, 2] = True
+        u0 = self._p0()
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=u0,
+            mask_u=mask_u,
+        )
+        expected_coords = np.argwhere(mask_u)
+        assert src.n_sources_u == 1
+        np.testing.assert_array_equal(src.incoords_u, expected_coords)
+
+    # ---- all three components ----
+
+    def test_all_three_velocity_components(self):
+        """All three u/v/w components can be specified simultaneously."""
+        coords_u = np.array([[0, 1]])
+        coords_v = np.array([[1, 0], [1, 1]])
+        coords_w = np.array([[2, 2], [3, 3], [0, 3]])
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=np.ones((1, self._nt)),
+            coords_u=coords_u,
+            v0=np.ones((2, self._nt)),
+            coords_v=coords_v,
+            w0=np.ones((3, self._nt)),
+            coords_w=coords_w,
+        )
+        assert src.n_sources_u == 1
+        assert src.n_sources_v == 2
+        assert src.n_sources_w == 3
+
+    # ---- partial (only u provided) ----
+
+    def test_only_u_provided(self):
+        """Only u component; v and w remain None with count 0."""
+        coords_u = np.array([[0, 1]])
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=self._p0(),
+            coords_u=coords_u,
+        )
+        assert src.n_sources_u == 1
+        assert src.n_sources_v == 0
+        assert src.n_sources_w == 0
+        assert src.v0 is None
+        assert src.w0 is None
+        assert src.incoords_v is None
+        assert src.incoords_w is None
+
+    # ---- error cases ----
+
+    def test_u0_without_coords_or_mask_raises(self):
+        """Providing u0 without coords_u or mask_u must raise ValueError."""
+        with pytest.raises(ValueError, match="neither coords_u nor mask_u"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                u0=self._p0(),
+            )
+
+    def test_coords_u_and_mask_u_raises(self):
+        """Providing both coords_u and mask_u must raise ValueError."""
+        mask_u = np.zeros(self._grid_shape, dtype=bool)
+        mask_u[0, 1] = True
+        coords_u = np.array([[0, 1]])
+        with pytest.raises(ValueError, match="mutually exclusive"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                u0=self._p0(),
+                coords_u=coords_u,
+                mask_u=mask_u,
+            )
+
+    def test_coords_u_without_u0_raises(self):
+        """Providing coords_u but no u0 must raise ValueError."""
+        with pytest.raises(ValueError, match="without u0"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                coords_u=np.array([[0, 1]]),
+            )
+
+    def test_u0_row_mismatch_raises(self):
+        """u0 row count not matching coords_u raises ValueError."""
+        coords_u = np.array([[0, 1], [1, 0]])  # 2 points
+        u0_wrong = self._p0(1)  # 1 row
+        with pytest.raises(ValueError, match="u0 has 1 rows"):
+            Source(
+                self._p0(),
+                coords=np.array([[0, 0]]),
+                grid_shape=self._grid_shape,
+                u0=u0_wrong,
+                coords_u=coords_u,
+            )
+
+    # ---- __str__ includes velocity info ----
+
+    def test_str_includes_velocity_component(self):
+        """__str__ reports velocity components when present."""
+        coords_u = np.array([[0, 1]])
+        src = Source(
+            self._p0(),
+            coords=np.array([[0, 0]]),
+            grid_shape=self._grid_shape,
+            u0=self._p0(),
+            coords_u=coords_u,
+        )
+        assert "u0" in str(src)
+
+    # ---- combined with hard pressure source ----
+
+    def test_combined_pressure_and_velocity(self):
+        """Hard pressure source and velocity source can coexist."""
+        coords_p = np.array([[0, 0], [0, 1]])
+        coords_u = np.array([[1, 0]])
+        src = Source(
+            self._p0(2),
+            coords=coords_p,
+            grid_shape=self._grid_shape,
+            u0=self._p0(1),
+            coords_u=coords_u,
+        )
+        assert src.n_sources == 2
+        assert src.n_sources_u == 1
+
+    # ---- velocity-only (no hard pressure source) ----
+
+    def test_velocity_only_no_pressure_source(self):
+        """Source with only u0 and no p0 has n_sources==0 and empty p0."""
+        coords_u = np.array([[0, 1], [1, 0]])
+        src = Source(
+            grid_shape=self._grid_shape,
+            u0=self._p0(2),
+            coords_u=coords_u,
+        )
+        assert src.n_sources == 0
+        assert src.n_sources_u == 2
+        assert src.p0.shape == (0, self._nt)
+        assert src.p0_additive is None
+        assert src.incoords.shape == (0, 2)
+
+    def test_velocity_only_missing_grid_shape_raises(self):
+        """Velocity-only source without grid_shape raises ValueError."""
+        with pytest.raises(ValueError, match="grid_shape is required"):
+            Source(u0=self._p0(), coords_u=np.array([[0, 1]]))
+
+    def test_velocity_only_validate_passes(self):
+        """validate() accepts a velocity-only source."""
+        src = Source(
+            grid_shape=self._grid_shape,
+            u0=self._p0(),
+            coords_u=np.array([[0, 1]]),
+        )
+        src.validate(self._grid_shape)
+
+    def test_all_none_raises(self):
+        """No p0, no p0_additive, no velocity → ValueError."""
+        with pytest.raises(ValueError):  # noqa: PT011
+            Source(grid_shape=self._grid_shape)

From 1e9e62210f3a964a7cc16d8c09ee74b1a2b639bd Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:49:27 -0500
Subject: [PATCH 02/31] =?UTF-8?q?Bump=20version:=201.2.3=20=E2=86=92=201.2?=
 =?UTF-8?q?.4-dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 91f59c6..d7c83a6 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.3"
+current_version = "1.2.4-dev0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index f90a1a1..b29e9c0 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.3"
+    __version__ = "1.2.4-dev0"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 2351fce..160c1e4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.3" # Update via bump-my-version, not manually
+version = "1.2.4-dev0" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index b9a0356..c71dd51 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.3"
+version = "1.2.4.dev0"
 source = { editable = "." }
 dependencies = [
     { name = "joblib" },

From 670a83336bb6a8ea134a12261f9246ccb5494049 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:51:47 -0500
Subject: [PATCH 03/31] =?UTF-8?q?Bump=20version:=201.2.4-dev0=20=E2=86=92?=
 =?UTF-8?q?=201.2.5-dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index d7c83a6..201d120 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.4-dev0"
+current_version = "1.2.5-dev0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index b29e9c0..ca47acc 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.4-dev0"
+    __version__ = "1.2.5-dev0"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 160c1e4..4822135 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.4-dev0" # Update via bump-my-version, not manually
+version = "1.2.5-dev0" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index c71dd51..e29b441 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.4.dev0"
+version = "1.2.5.dev0"
 source = { editable = "." }
 dependencies = [
     { name = "joblib" },

From 5385ac8007b8aafe16a5a608f6e096db637bff21 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 12:54:10 -0500
Subject: [PATCH 04/31] =?UTF-8?q?Bump=20version:=201.2.5-dev0=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 201d120..17bef39 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.5-dev0"
+current_version = "1.2.6-dev0"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index ca47acc..4b5ad30 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.5-dev0"
+    __version__ = "1.2.6-dev0"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 4822135..32e0dd5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.5-dev0" # Update via bump-my-version, not manually
+version = "1.2.6-dev0" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index e29b441..85fa27b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.5.dev0"
+version = "1.2.6.dev0"
 source = { editable = "." }
 dependencies = [
     { name = "joblib" },

From 6666626113deea9094564a116344ca5fbc75c923 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 15:39:09 -0500
Subject: [PATCH 05/31] implemented sparse grid output coordinates

---
 .../simple_plane_wave_sparse_grid.py          | 149 ++++++++++++++++
 .../simple_plane_wave_sparse_grid_3d.py       | 168 ++++++++++++++++++
 fullwave/sensor.py                            |  76 +++++++-
 fullwave/solver/binary_manager.py             |   2 +-
 fullwave/solver/input_file_writer.py          |  29 +++
 fullwave/solver/pml_builder.py                |  50 ++++--
 fullwave/solver/solver.py                     |  44 +++--
 tests/solver/test_input_file_writer.py        |   4 +
 tests/test_sensor.py                          |  69 ++++++-
 9 files changed, 550 insertions(+), 41 deletions(-)
 create mode 100644 examples/simple_plane_wave/simple_plane_wave_sparse_grid.py
 create mode 100644 examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py

diff --git a/examples/simple_plane_wave/simple_plane_wave_sparse_grid.py b/examples/simple_plane_wave/simple_plane_wave_sparse_grid.py
new file mode 100644
index 0000000..e4e973c
--- /dev/null
+++ b/examples/simple_plane_wave/simple_plane_wave_sparse_grid.py
@@ -0,0 +1,149 @@
+"""Simple plane wave example using sparse-grid sensor output.
+
+Instead of recording every grid point, the sensor is configured with
+mod_x=4, mod_y=4 so the binary records only every 4th point in each
+spatial dimension.  This reduces output data size by a factor of 16
+compared to a full-domain sensor.
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+
+import fullwave
+from fullwave.utils import plot_utils
+from fullwave.utils.coordinates import map_to_coords
+
+
+def main() -> None:
+    """Run simple plane wave example with sparse-grid sensor."""
+    logging.getLogger("__main__").setLevel(logging.INFO)
+
+    work_dir = Path("./outputs/") / "simple_plane_wave_sparse_grid"
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    #
+    # --- grid ---
+    #
+    domain_size = (3e-2, 2e-2)  # meters
+    f0 = 3e6
+    c0 = 1540
+    duration = domain_size[0] / c0 * 2
+    grid = fullwave.Grid(
+        domain_size=domain_size,
+        f0=f0,
+        duration=duration,
+        c0=c0,
+    )
+
+    #
+    # --- medium ---
+    #
+    sound_speed_map = 1540 * np.ones((grid.nx, grid.ny))
+    density_map = 1000 * np.ones((grid.nx, grid.ny))
+    alpha_coeff_map = 0.5 * np.ones((grid.nx, grid.ny))
+    alpha_power_map = 1.0 * np.ones((grid.nx, grid.ny))
+    beta_map = np.zeros((grid.nx, grid.ny))
+
+    obj_x = slice(grid.nx // 3, 2 * grid.nx // 3)
+    obj_y = slice(grid.ny // 3, 2 * grid.ny // 3)
+    sound_speed_map[obj_x, obj_y] = 1600
+    density_map[obj_x, obj_y] = 1100
+    alpha_coeff_map[obj_x, obj_y] = 0.75
+    alpha_power_map[obj_x, obj_y] = 1.1
+
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=sound_speed_map,
+        density=density_map,
+        alpha_coeff=alpha_coeff_map,
+        alpha_power=alpha_power_map,
+        beta=beta_map,
+    )
+
+    #
+    # --- source: plane wave from the top ---
+    #
+    ncycles = 2
+    drop_off = 2
+    element_thickness_px = 3
+
+    p_mask = np.zeros((grid.nx, grid.ny), dtype=bool)
+    p_mask[:element_thickness_px, :] = True
+    p_coordinates = map_to_coords(p_mask)
+
+    p0 = np.zeros((p_mask.sum(), grid.nt))
+    for i in range(element_thickness_px):
+        p0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+            nt=grid.nt,
+            f0=f0,
+            duration=duration,
+            ncycles=ncycles,
+            drop_off=drop_off,
+            p0=1e5,
+            i_layer=i,
+            dt_for_layer_delay=grid.dt,
+            cfl_for_layer_delay=grid.cfl,
+        )
+        n_y = p_coordinates.shape[0] // element_thickness_px
+        p0[n_y * i : n_y * (i + 1), :] = p0_vec[: grid.nt]
+
+    source = fullwave.Source(p0=p0, coords=p_coordinates, grid_shape=grid.shape)
+
+    #
+    # --- sparse-grid sensor ---
+    #
+    # Record every 4th point in x (depth) and every 4th point in y (lateral).
+    # The binary generates the sensor positions automatically; no mask or
+    # coordinate array is needed.
+    mod_x = 4
+    mod_y = 4
+    sensor = fullwave.Sensor(mod_x=mod_x, mod_y=mod_y, sampling_modulus_time=7)
+
+    #
+    # --- solver ---
+    #
+    # Sparse-grid output is supported by the exponential-attenuation binary.
+    fw_solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        use_exponential_attenuation=True,
+    )
+    fw_solver.summary()
+
+    # sensor_output shape: [n_sparse_sensors, nt_recorded]
+    # n_sparse_sensors is inferred by the solver from the binary output length.
+    sensor_output = fw_solver.run()
+
+    #
+    # --- reshape and visualize ---
+    #
+    # Compute the subsampled spatial dimensions to reconstruct the 2D wavefield.
+    nx_sparse = int(np.ceil(grid.nx / mod_x))
+    ny_sparse = int(np.ceil(grid.ny / mod_y))
+    nt_recorded = sensor_output.shape[1]
+
+    # Reshape to [nt_recorded, nx_sparse, ny_sparse]
+    propagation_map = sensor_output.T.reshape(nt_recorded, nx_sparse, ny_sparse)
+
+    p_max = np.abs(propagation_map).max() / 4
+    time_step = nt_recorded // 3
+    plot_utils.plot_array(
+        propagation_map[time_step],
+        aspect=ny_sparse / nx_sparse,
+        export_path=work_dir / "wave_propagation_sparse_snapshot.png",
+        vmax=p_max,
+        vmin=-p_max,
+    )
+
+    print(f"Full grid size:    ({grid.nx}, {grid.ny})")
+    print(f"Sparse grid size:  ({nx_sparse}, {ny_sparse})  [mod_x={mod_x}, mod_y={mod_y}]")
+    print(f"Output shape:      {sensor_output.shape}  [n_sparse_sensors, nt_recorded]")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py b/examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py
new file mode 100644
index 0000000..9bfaf29
--- /dev/null
+++ b/examples/simple_plane_wave/simple_plane_wave_sparse_grid_3d.py
@@ -0,0 +1,168 @@
+"""Simple 3D plane wave example using sparse-grid sensor output.
+
+Instead of recording every grid point, the sensor is configured with
+mod_x=2, mod_y=2, mod_z=2 so the binary records only every 2nd point in each
+spatial dimension.  This reduces output data size by a factor of 8
+compared to a full-domain sensor.
+"""
+
+import logging
+from pathlib import Path
+
+import numpy as np
+
+import fullwave
+from fullwave.utils import plot_utils
+from fullwave.utils.coordinates import map_to_coords
+
+
+def main() -> None:  # noqa: PLR0915
+    """Run 3D simple plane wave example with sparse-grid sensor."""
+    logging.getLogger("__main__").setLevel(logging.INFO)
+
+    work_dir = Path("./outputs/") / "simple_plane_wave_sparse_grid_3d"
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    #
+    # --- grid ---
+    #
+    f0 = 1e6
+    c0 = 1540
+    wavelength = c0 / f0
+    domain_size = (10 * wavelength, 10 * wavelength, 10 * wavelength)  # meters
+    duration = domain_size[0] / c0 * 2
+    grid = fullwave.Grid(domain_size, f0, duration, c0=c0)
+    grid.print_info()
+
+    #
+    # --- medium ---
+    #
+    sound_speed_map = 1540 * np.ones((grid.nx, grid.ny, grid.nz))
+    density_map = 1000 * np.ones((grid.nx, grid.ny, grid.nz))
+    alpha_coeff_map = 0.5 * np.ones((grid.nx, grid.ny, grid.nz))
+    alpha_power_map = 1.0 * np.ones((grid.nx, grid.ny, grid.nz))
+    beta_map = np.zeros((grid.nx, grid.ny, grid.nz))
+
+    obj_x = slice(grid.nx // 3, 2 * grid.nx // 3)
+    obj_y = slice(grid.ny // 4, 3 * grid.ny // 4)
+    obj_z = slice(grid.nz // 3, 9 * grid.nz // 10)
+    sound_speed_map[obj_x, obj_y, obj_z] = 1600
+    density_map[obj_x, obj_y, obj_z] = 1100
+    alpha_coeff_map[obj_x, obj_y, obj_z] = 0.75
+    alpha_power_map[obj_x, obj_y, obj_z] = 1.1
+
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=sound_speed_map,
+        density=density_map,
+        alpha_coeff=alpha_coeff_map,
+        alpha_power=alpha_power_map,
+        beta=beta_map,
+    )
+    medium.print_info()
+
+    #
+    # --- source: plane wave from the top ---
+    #
+    ncycles = 2
+    drop_off = 2
+    element_thickness_px = 3
+
+    p_mask = np.zeros((grid.nx, grid.ny, grid.nz), dtype=bool)
+    p_mask[:element_thickness_px, :] = True
+    p_coordinates = map_to_coords(p_mask)
+
+    p0 = np.zeros((p_mask.sum(), grid.nt))
+    for i in range(element_thickness_px):
+        p0_vec = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+            nt=grid.nt,
+            f0=f0,
+            duration=duration,
+            ncycles=ncycles,
+            drop_off=drop_off,
+            p0=1e5,
+            i_layer=i,
+            dt_for_layer_delay=grid.dt,
+            cfl_for_layer_delay=grid.cfl,
+        )
+        n_yz = p_coordinates.shape[0] // element_thickness_px
+        p0[n_yz * i : n_yz * (i + 1), :] = p0_vec[: grid.nt]
+
+    source = fullwave.Source(
+        p0=p0,
+        coords=p_coordinates,
+        grid_shape=(grid.nx, grid.ny, grid.nz),
+    )
+
+    #
+    # --- sparse-grid sensor ---
+    #
+    # Record every 2nd point in x (depth), y (lateral), and z (elevational).
+    # The binary generates the sensor positions automatically; no mask or
+    # coordinate array is needed.  mod_z is required for 3D simulations.
+    mod_x = 2
+    mod_y = 2
+    mod_z = 2
+    sensor = fullwave.Sensor(mod_x=mod_x, mod_y=mod_y, mod_z=mod_z, sampling_modulus_time=4)
+
+    #
+    # --- solver ---
+    #
+    # Sparse-grid output is supported by the exponential-attenuation binary.
+    fw_solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        use_exponential_attenuation=True,
+    )
+    fw_solver.summary()
+
+    # sensor_output shape: [n_sparse_sensors, nt_recorded]
+    # n_sparse_sensors is inferred by the solver from the binary output length.
+    sensor_output = fw_solver.run()
+
+    #
+    # --- reshape and visualize ---
+    #
+    # Compute the subsampled spatial dimensions to reconstruct the 3D wavefield.
+    nx_sparse = int(np.ceil(grid.nx / mod_x))
+    ny_sparse = int(np.ceil(grid.ny / mod_y))
+    nz_sparse = int(np.ceil(grid.nz / mod_z))
+    nt_recorded = sensor_output.shape[1]
+
+    # Reshape to [nt_recorded, nx_sparse, ny_sparse, nz_sparse]
+    propagation_map = sensor_output.T.reshape(nt_recorded, nx_sparse, ny_sparse, nz_sparse)
+
+    time_step = nt_recorded // 3
+    p_max = np.abs(propagation_map).max() / 4
+
+    # x-y slice at the middle z index
+    plot_utils.plot_array(
+        propagation_map[time_step, :, :, nz_sparse // 2],
+        aspect=ny_sparse / nx_sparse,
+        export_path=work_dir / "wave_propagation_sparse_snapshot_x-y.png",
+        vmax=p_max,
+        vmin=-p_max,
+    )
+
+    # x-z slice at the middle y index
+    plot_utils.plot_array(
+        propagation_map[time_step, :, ny_sparse // 2, :],
+        aspect=nz_sparse / nx_sparse,
+        export_path=work_dir / "wave_propagation_sparse_snapshot_x-z.png",
+        vmax=p_max,
+        vmin=-p_max,
+    )
+
+    print(f"Full grid size:    ({grid.nx}, {grid.ny}, {grid.nz})")
+    print(
+        f"Sparse grid size:  ({nx_sparse}, {ny_sparse}, {nz_sparse})"
+        f"  [mod_x={mod_x}, mod_y={mod_y}, mod_z={mod_z}]",
+    )
+    print(f"Output shape:      {sensor_output.shape}  [n_sparse_sensors, nt_recorded]")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fullwave/sensor.py b/fullwave/sensor.py
index 0a9c1ad..7b0edf6 100644
--- a/fullwave/sensor.py
+++ b/fullwave/sensor.py
@@ -21,13 +21,16 @@ class Sensor:
     outcoords: NDArray[np.int64]
     sampling_modulus_time: int = 1
 
-    def __init__(
+    def __init__(  # noqa: PLR0915
         self,
         mask: NDArray[np.bool] | None = None,
         sampling_modulus_time: int = 1,
         *,
         coords: NDArray[np.int64] | None = None,
         grid_shape: tuple[int, ...] | None = None,
+        mod_x: int | None = None,
+        mod_y: int | None = None,
+        mod_z: int = 0,
     ) -> None:
         """Sensor class for Fullwave.
 
@@ -36,7 +39,7 @@ def __init__(
         mask : NDArray[np.bool] | None
             Binary matrix where the pressure is recorded at each time-step
             shape: [nx, ny] for 2D, [nx, ny, nz] for 3D.
-            Mutually exclusive with coords/grid_shape.
+            Mutually exclusive with coords/grid_shape and mod_x/mod_y.
         sampling_modulus_time: int
             Sampling modulus in time. Default is 1 (record at every time step).
             Changing this value to n will record the pressure every n time steps.
@@ -46,39 +49,95 @@ def __init__(
             Must be provided together with grid_shape.
         grid_shape : tuple[int, ...] | None
             Shape of the computational grid. Required when using coords input.
+        mod_x : int | None
+            Spatial decimation stride in x (depth) for sparse-grid sensor output.
+            When provided together with mod_y, activates sparse-grid mode.
+            In this mode the solver binary generates sensor positions automatically
+            as [::mod_x, ::mod_y] (and [::mod_z] for 3D) across the domain, so no
+            explicit coords are stored.  Mutually exclusive with mask/coords.
+        mod_y : int | None
+            Spatial decimation stride in y (lateral).  Must be provided with mod_x.
+        mod_z : int
+            Spatial decimation stride in z (elevational).  Only relevant in 3D simulations.
+            Providing mod_z > 0 together with mod_x and mod_y creates a 3D sparse sensor.
+            Default 0 (2D sensor); validate() raises if mod_z is 0 on a 3D grid.
 
         Raises
         ------
         ValueError
             If grid_shape is not provided when using coords input.
             If both mask and coords are provided (mutually exclusive).
-            If neither mask nor coords (with grid_shape) is provided.
+            If only one of mod_x / mod_y is provided, or they are mixed with mask or coords.
+            If mod_x or mod_y is not a positive integer, or mod_z is negative.
+            If none of the three input modes is supplied.
 
         """
-        if coords is not None:
+        if mod_x is not None or mod_y is not None:
+            # --- sparse-grid mode ---
+            if mod_x is None or mod_y is None:
+                msg = "Both mod_x and mod_y must be provided for sparse-grid sensor mode"
+                raise ValueError(msg)
+            if mask is not None or coords is not None:
+                msg = "mod_x/mod_y are mutually exclusive with mask and coords"
+                raise ValueError(msg)
+            if mod_x <= 0 or mod_y <= 0:
+                msg = "mod_x and mod_y must be positive integers"
+                raise ValueError(msg)
+            if mod_z < 0:
+                msg = "mod_z must be a non-negative integer"
+                raise ValueError(msg)
+            self.is_sparse_grid = True
+            self.mod_x = mod_x
+            self.mod_y = mod_y
+            self.mod_z = mod_z
+            self.is_3d = mod_z > 0
+            ndim = 3 if self.is_3d else 2
+            # Empty placeholder — the binary computes positions from mod values.
+            self.outcoords = np.empty((0, ndim), dtype=np.int64)
+            self.grid_shape = None
+        elif coords is not None:
             if grid_shape is None:
                 msg = "grid_shape is required when using coords input"
                 raise ValueError(msg)
             if mask is not None:
                 msg = "mask and coords are mutually exclusive"
                 raise ValueError(msg)
+            self.is_sparse_grid = False
+            self.mod_x = 0
+            self.mod_y = 0
+            self.mod_z = 0
             self.outcoords = np.atleast_2d(coords).astype(np.int64, copy=False)
             self.grid_shape = tuple(grid_shape)
+            self.is_3d = len(self.grid_shape) == 3
         elif mask is not None:
+            self.is_sparse_grid = False
+            self.mod_x = 0
+            self.mod_y = 0
+            self.mod_z = 0
             mask = np.atleast_2d(mask)
             self.grid_shape = mask.shape
             self.outcoords = map_to_coords(mask)
+            self.is_3d = len(self.grid_shape) == 3
         else:
-            msg = "Either mask or coords (with grid_shape) must be provided"
+            msg = "Either mask, coords (with grid_shape), or mod_x with mod_y must be provided"
             raise ValueError(msg)
 
         self.sampling_modulus_time = sampling_modulus_time
-        self.is_3d = len(self.grid_shape) == 3
         super().__init__()
         logger.debug("Sensor instance created.")
 
     def validate(self, grid_shape: NDArray[np.int64] | tuple) -> None:
         """Check if the sensor coordinates are consistent with the grid shape."""
+        if self.is_sparse_grid:
+            grid_shape = tuple(grid_shape) if isinstance(grid_shape, np.ndarray) else grid_shape
+            if len(grid_shape) == 3 and self.mod_z == 0:
+                msg = (
+                    "Sparse-grid sensor used in a 3D simulation but mod_z was not provided. "
+                    "Pass mod_z > 0 to Sensor (e.g. Sensor(mod_x=4, mod_y=4, mod_z=4))."
+                )
+                raise ValueError(msg)
+            logger.debug("Sparse-grid sensor validated.")
+            return
         grid_shape = tuple(grid_shape) if isinstance(grid_shape, np.ndarray) else grid_shape
         assert self.grid_shape == grid_shape, f"{self.grid_shape} != {grid_shape}"
         assert self.n_sensors > 0, "No active sensor found."
@@ -173,6 +232,11 @@ def __str__(self) -> str:
             Formatted string containing source information.
 
         """
+        if self.is_sparse_grid:
+            mod_str = f"mod_x={self.mod_x}, mod_y={self.mod_y}"
+            if self.is_3d:
+                mod_str += f", mod_z={self.mod_z}"
+            return f"Sensor (sparse-grid): \n  Strides: {mod_str}\n  Is 3D: {self.is_3d}\n"
         return (
             f"Sensor: \n"
             f"  Number of sensors: {self.n_sensors}\n"
diff --git a/fullwave/solver/binary_manager.py b/fullwave/solver/binary_manager.py
index 1b8a789..f3c433b 100644
--- a/fullwave/solver/binary_manager.py
+++ b/fullwave/solver/binary_manager.py
@@ -28,7 +28,7 @@
 
 # Pinned release tag for the solver binaries.
 # Update this only when new binaries are uploaded to a GitHub release.
-BINARY_RELEASE_TAG = "fullwave_bin_v1.2"
+BINARY_RELEASE_TAG = "fullwave_bin_v1.3"
 
 
 def _download_url(filename: str, tag: str) -> str:
diff --git a/fullwave/solver/input_file_writer.py b/fullwave/solver/input_file_writer.py
index fc67888..ccbdb14 100644
--- a/fullwave/solver/input_file_writer.py
+++ b/fullwave/solver/input_file_writer.py
@@ -36,6 +36,7 @@ def __init__(
         use_exponential_attenuation: bool = False,
         use_isotropic_relaxation: bool = False,
         release_after_write: bool = False,
+        pml_thickness: int = 0,
     ) -> None:
         """Initialize the InputGeneratorBase instance.
 
@@ -74,6 +75,10 @@ def __init__(
             Whether to release the variable from memory after writing to file.
             This can help reduce memory usage when generating input files for large simulations.
             default is False.
+        pml_thickness : int, optional
+            PML boundary thickness in grid points (n_pml_layer + n_transition_layer).
+            Required by the solver binary when using sparse grid (mod_x/mod_y != 0) to determine
+            the interior domain boundaries. Only written to disk in sparse-grid sensor mode.
 
         """
         logger.debug("Initializing InputFileWriter instance.")
@@ -99,6 +104,7 @@ def __init__(
         self.is_3d = self.grid.is_3d
         self.use_exponential_attenuation = use_exponential_attenuation
         self.release_after_write = release_after_write
+        self.pml_thickness = pml_thickness
 
         self._dim = int(
             np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
@@ -828,6 +834,7 @@ def _save_variables_into_dat_file(
         self._save_step_params(simulation_dir)
         self._save_coords_params(simulation_dir)
         self._save_d_params(simulation_dir, dim)
+        self._save_sparse_grid_params(simulation_dir)
 
         if self.use_isotropic_relaxation:
             rename_dict = {
@@ -894,6 +901,7 @@ def _save_variables_into_dat_file_exponential_attenuation(
         self._save_step_params(simulation_dir)
         self._save_coords_params(simulation_dir)
         self._save_d_params(simulation_dir, dim)
+        self._save_sparse_grid_params(simulation_dir)
 
     def _build_symbolic_links_for_dat_files(self, src_dir: Path, dst_dir: Path) -> None:
         var_name_list = [
@@ -916,6 +924,9 @@ def _build_symbolic_links_for_dat_files(self, src_dir: Path, dst_dir: Path) -> N
             "ncoordszero",
             "nTic",
             "modT",
+            "modX",
+            "modY",
+            "pml_thickness",
             "d",
             "dmap",
             "ndmap",
@@ -948,6 +959,8 @@ def _build_symbolic_links_for_dat_files(self, src_dir: Path, dst_dir: Path) -> N
                     "bpmly2",
                 ],
             )
+        if self.is_3d:
+            var_name_list.append("modZ")
         if self.is_3d and not self.use_isotropic_relaxation:
             var_name_list.extend(
                 [
@@ -1074,6 +1087,22 @@ def _save_coords_params(self, simulation_dir: Path) -> None:
             save_path = simulation_dir / f"{var_name}.dat"
             self._queue_v_abs_write(np.int32, save_path, var)
 
+    def _save_sparse_grid_params(self, simulation_dir: Path) -> None:
+        """Write sparse-grid parameters when the sensor is in sparse-grid mode.
+
+        modX / modY / modZ and pml_thickness are only written when the sensor
+        was constructed with mod_x/mod_y (i.e. sensor.is_sparse_grid is True).
+        In standard coordinate/mask mode these files are not produced, preserving
+        backward compatibility with older binaries that do not expect them.
+        """
+        if not self.sensor.is_sparse_grid:
+            return
+        self._queue_v_abs_write(np.int32, simulation_dir / "modX.dat", self.sensor.mod_x)
+        self._queue_v_abs_write(np.int32, simulation_dir / "modY.dat", self.sensor.mod_y)
+        if self.is_3d:
+            self._queue_v_abs_write(np.int32, simulation_dir / "modZ.dat", self.sensor.mod_z)
+        self._queue_v_abs_write(np.int32, simulation_dir / "pml_thickness.dat", self.pml_thickness)
+
     def _save_d_params(
         self,
         simulation_dir: Path,
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index 9e3899c..016b428 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -353,14 +353,24 @@ def __init__(  # noqa: PLR0915
         logger.debug("building extended source for pml...done")
 
         logger.debug("building extended sensor for pml...")
-        extended_sensor_grid_shape = tuple(
-            s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
-        )
-        self.extended_sensor = fullwave.Sensor(
-            coords=self.sensor_org.outcoords + self.num_boundary_points,
-            grid_shape=extended_sensor_grid_shape,
-            sampling_modulus_time=self.sensor_org.sampling_modulus_time,
-        )
+        if self.sensor_org.is_sparse_grid:
+            # Sparse-grid sensor: no explicit coordinates to shift.
+            # Pass mod values through; the binary generates positions at run time.
+            self.extended_sensor = fullwave.Sensor(
+                mod_x=self.sensor_org.mod_x,
+                mod_y=self.sensor_org.mod_y,
+                mod_z=self.sensor_org.mod_z,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
+        else:
+            extended_sensor_grid_shape = tuple(
+                s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
+            )
+            self.extended_sensor = fullwave.Sensor(
+                coords=self.sensor_org.outcoords + self.num_boundary_points,
+                grid_shape=extended_sensor_grid_shape,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
         logger.debug("building extended sensor for pml...done")
         if self.is_3d:
             self.pml_mask_x, self.pml_mask_y, self.pml_mask_z = self._localize_pml_region()
@@ -1643,14 +1653,22 @@ def __init__(  # noqa: PLR0915
             w0=getattr(self.source_org, "w0", None),
             coords_w=incoords_w_ext,
         )
-        extended_sensor_grid_shape = tuple(
-            s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
-        )
-        self.extended_sensor = fullwave.Sensor(
-            coords=self.sensor_org.outcoords + self.num_boundary_points,
-            grid_shape=extended_sensor_grid_shape,
-            sampling_modulus_time=self.sensor_org.sampling_modulus_time,
-        )
+        if self.sensor_org.is_sparse_grid:
+            self.extended_sensor = fullwave.Sensor(
+                mod_x=self.sensor_org.mod_x,
+                mod_y=self.sensor_org.mod_y,
+                mod_z=self.sensor_org.mod_z,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
+        else:
+            extended_sensor_grid_shape = tuple(
+                s + 2 * self.num_boundary_points for s in self.sensor_org.grid_shape
+            )
+            self.extended_sensor = fullwave.Sensor(
+                coords=self.sensor_org.outcoords + self.num_boundary_points,
+                grid_shape=extended_sensor_grid_shape,
+                sampling_modulus_time=self.sensor_org.sampling_modulus_time,
+            )
         logger.debug("Extended source and sensor for PML built successfully.")
 
         logger.debug("Localizing PML region...")
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 0f8b283..3573850 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -642,6 +642,8 @@ def _check_input(
     def _reshape_sensor_data(
         raw_sensor_output: NDArray[np.float64],
         sensor: fullwave.Sensor,
+        *,
+        n_t: int | None = None,
     ) -> NDArray[np.float64]:
         """Reshape the raw sensor output data.
 
@@ -651,12 +653,21 @@ def _reshape_sensor_data(
             The raw sensor output data from the simulation. [nt*ncoordsout, 1]
         sensor: fullwave.Sensor
             The sensor object used in the simulation.
+        n_t: int | None
+            Number of time steps in the extended grid.  Required for sparse-grid
+            sensors because n_sensors is not known at Python time.
 
         Returns
         -------
         NDArray[np.float64]: The reshaped sensor output data. [ncoordsout, nt]
 
         """
+        if sensor.is_sparse_grid:
+            if n_t is None:
+                msg = "n_t is required to reshape sparse-grid sensor output"
+                raise ValueError(msg)
+            n_t_recorded = -(-n_t // sensor.sampling_modulus_time)  # ceiling division
+            return raw_sensor_output.reshape(n_t_recorded, -1).T
         return raw_sensor_output.reshape(-1, sensor.n_sensors).T
 
     def run(
@@ -783,30 +794,27 @@ def run(
             )
             logger.warning(warning_msg)
 
-        sensor_mask: NDArray[np.bool_]
         if record_whole_domain:
-            if self.is_3d:
-                sensor_mask = np.zeros(
-                    (
-                        self.pml_builder.extended_grid.nx,
-                        self.pml_builder.extended_grid.ny,
-                        self.pml_builder.extended_grid.nz,
-                    ),
-                    dtype=bool,
-                )
-            else:
-                sensor_mask = np.zeros(
-                    (self.pml_builder.extended_grid.nx, self.pml_builder.extended_grid.ny),
-                    dtype=bool,
-                )
-            sensor_mask[:, :] = True
+            # Stride 1 on every axis records the whole domain.  Sensor expects an
+            # integer mod_z (0 means 2D); passing None fails its `mod_z < 0` check.
+            mod_x = mod_y = 1
+            mod_z = 1 if self.is_3d else 0
             sensor = fullwave.Sensor(
-                mask=sensor_mask,
+                mod_x=mod_x,
+                mod_y=mod_y,
+                mod_z=mod_z,
                 sampling_modulus_time=sampling_modulus_time_whole_domain,
             )
         else:
             sensor = self.pml_builder.extended_sensor
 
+        # pml_thickness = PML + transition layers on each side, excluding ghost cells.
+        # Used by the binary to locate the interior domain when building a sparse sensor grid.
+        if record_whole_domain:
+            pml_thickness = 0
+        else:
+            pml_thickness = self.pml_builder.num_boundary_points - self.pml_builder.m_spatial_order
+
         start_input_file_writer_time = time.time()
         input_file_writer = InputFileWriter(
             work_dir=self.work_dir,
@@ -818,6 +826,7 @@ def run(
             use_exponential_attenuation=self.use_exponential_attenuation,
             use_isotropic_relaxation=self.use_isotropic_relaxation,
             release_after_write=release_after_write,
+            pml_thickness=pml_thickness,
         )
         simulation_dir = input_file_writer.run(
             simulation_dir_name,
@@ -850,6 +859,7 @@ def run(
             result = self._reshape_sensor_data(
                 sim_result,
                 sensor=sensor,
+                n_t=self.pml_builder.extended_grid.nt,
             )
             end_loading_time = time.time()
             message = (
diff --git a/tests/solver/test_input_file_writer.py b/tests/solver/test_input_file_writer.py
index 80f8901..abca35e 100644
--- a/tests/solver/test_input_file_writer.py
+++ b/tests/solver/test_input_file_writer.py
@@ -44,6 +44,10 @@ def create_dummy_objects():
         outcoords=np.array([[1, 2], [3, 4]], dtype=np.int64),
         sampling_modulus_time=0.5,
         n_sensors=2,
+        mod_x=0,
+        mod_y=0,
+        mod_z=0,
+        is_sparse_grid=False,
     )
     return grid, medium, source, sensor
 
diff --git a/tests/test_sensor.py b/tests/test_sensor.py
index 283d360..42ac132 100644
--- a/tests/test_sensor.py
+++ b/tests/test_sensor.py
@@ -102,5 +102,72 @@ def test_coords_and_mask_raises():
 
 
 def test_no_input_raises():
-    with pytest.raises(ValueError, match="Either mask or coords"):
+    with pytest.raises(ValueError, match="Either mask"):
         Sensor()
+
+
+# --- Sparse-grid mode tests ---
+
+
+def test_sparse_grid_2d():
+    sensor = Sensor(mod_x=4, mod_y=4)
+    assert sensor.is_sparse_grid
+    assert sensor.mod_x == 4
+    assert sensor.mod_y == 4
+    assert sensor.mod_z == 0
+    assert not sensor.is_3d
+    assert sensor.n_sensors == 0
+    assert sensor.outcoords.shape == (0, 2)
+    assert sensor.grid_shape is None
+
+
+def test_sparse_grid_3d():
+    sensor = Sensor(mod_x=4, mod_y=4, mod_z=2)
+    assert sensor.is_sparse_grid
+    assert sensor.mod_z == 2
+    assert sensor.is_3d
+    assert sensor.outcoords.shape == (0, 3)
+
+
+def test_sparse_grid_validate_2d_ok():
+    sensor = Sensor(mod_x=4, mod_y=4)
+    sensor.validate((100, 200))  # 2D grid — should not raise
+
+
+def test_sparse_grid_validate_3d_missing_mod_z_raises():
+    sensor = Sensor(mod_x=4, mod_y=4)  # mod_z defaults to 0
+    with pytest.raises(ValueError, match="mod_z"):
+        sensor.validate((100, 200, 50))  # 3D grid
+
+
+def test_sparse_grid_validate_3d_ok():
+    sensor = Sensor(mod_x=4, mod_y=4, mod_z=4)
+    sensor.validate((100, 200, 50))  # should not raise
+
+
+def test_sparse_grid_missing_mod_y_raises():
+    with pytest.raises(ValueError, match="Both mod_x and mod_y"):
+        Sensor(mod_x=4)
+
+
+def test_sparse_grid_missing_mod_x_raises():
+    with pytest.raises(ValueError, match="Both mod_x and mod_y"):
+        Sensor(mod_y=4)
+
+
+def test_sparse_grid_with_mask_raises():
+    mask = np.array([[1, 0]])
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        Sensor(mask=mask, mod_x=4, mod_y=4)
+
+
+def test_sparse_grid_with_coords_raises():
+    coords = np.array([[0, 0]])
+    with pytest.raises(ValueError, match="mutually exclusive"):
+        Sensor(coords=coords, grid_shape=(1, 2), mod_x=4, mod_y=4)
+
+
+def test_sparse_grid_str():
+    sensor = Sensor(mod_x=4, mod_y=4)
+    assert "sparse-grid" in str(sensor)
+    assert "mod_x=4" in str(sensor)

From 2a602d3d953d6b70b4156ce7f056e417c239e46d Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 15:43:41 -0500
Subject: [PATCH 06/31] =?UTF-8?q?Bump=20version:=201.2.6-dev0=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 17bef39..78bde87 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev0"
+current_version = "1.2.6-dev1"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index 4b5ad30..9b6406e 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev0"
+    __version__ = "1.2.6-dev1"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 32e0dd5..53ddbd4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev0" # Update via bump-my-version, not manually
+version = "1.2.6-dev1" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index 85fa27b..d4a22e0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev0"
+version = "1.2.6.dev1"
 source = { editable = "." }
 dependencies = [
     { name = "joblib" },

From 336bd999dcc50583b7e73dd8b0ae3a775bbd80bc Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 18:54:48 -0500
Subject: [PATCH 07/31] Add built-in signal filtering examples and
 functionality

---
 examples/signal_filter/__init__.py            |   1 +
 .../signal_filter/plane_wave_with_filter.py   | 280 ++++++++++++++++++
 .../signal_filter/signal_filter_example.py    | 130 ++++++++
 fullwave/solver/solver.py                     | 105 ++++++-
 fullwave/utils/__init__.py                    |   3 +-
 fullwave/utils/signal_filter.py               | 161 ++++++++++
 tests/solver/test_signal_filter.py            | 261 ++++++++++++++++
 7 files changed, 939 insertions(+), 2 deletions(-)
 create mode 100644 examples/signal_filter/__init__.py
 create mode 100644 examples/signal_filter/plane_wave_with_filter.py
 create mode 100644 examples/signal_filter/signal_filter_example.py
 create mode 100644 fullwave/utils/signal_filter.py
 create mode 100644 tests/solver/test_signal_filter.py

diff --git a/examples/signal_filter/__init__.py b/examples/signal_filter/__init__.py
new file mode 100644
index 0000000..8fa8327
--- /dev/null
+++ b/examples/signal_filter/__init__.py
@@ -0,0 +1 @@
+"""Signal filter examples for fullwave25."""
diff --git a/examples/signal_filter/plane_wave_with_filter.py b/examples/signal_filter/plane_wave_with_filter.py
new file mode 100644
index 0000000..fd6c6c4
--- /dev/null
+++ b/examples/signal_filter/plane_wave_with_filter.py
@@ -0,0 +1,280 @@
+"""Demonstrate the built-in high-pass filter inside solver.run().
+
+Based on examples/linear_transducer/plane_wave_compounding.py.
+
+This script runs a single 0-degree plane wave transmission through a medium
+with echoic targets, then shows the effect of passing
+``highpass_cutoff_mhz=0.5`` to ``solver.run()``:
+
+  * Left panel   - raw sensor traces (PML low-frequency drift visible)
+  * Right panel  - high-pass filtered traces (drift removed)
+  * Bottom panel - amplitude spectra of a single element before/after
+
+Run with:
+    uv run python examples/signal_filter/plane_wave_with_filter.py
+"""
+
+import logging
+import shutil
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+import fullwave
+from fullwave.utils.signal_filter import apply_filter
+
+logging.getLogger("__main__").setLevel(logging.INFO)
+
+
+# ---------------------------------------------------------------------------
+# Medium helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_echoic_targets(
+    scatterer: np.ndarray,
+    grid: fullwave.Grid,
+    target_radius_m: float,
+    target_spacing_m: float,
+    n_targets_axial: int,
+    n_targets_lateral: int,
+) -> np.ndarray:
+    """Place a grid of hypo/hyper/anechoic circles in the scatterer map."""
+    total_axial = (n_targets_axial - 1) * target_spacing_m
+    total_lateral = (n_targets_lateral - 1) * target_spacing_m
+    x0 = (grid.shape[0] * grid.dx - total_axial) / 2
+    y0 = (grid.shape[1] * grid.dx - total_lateral) / 2
+
+    axial_pos = np.linspace(x0, x0 + total_axial, n_targets_axial)
+    lateral_pos = np.linspace(y0, y0 + total_lateral, n_targets_lateral)
+
+    # columns: anechoic, hypo-echoic, hyper-echoic
+    ratio = np.array(
+        [
+            [0.0, 0.5, 3.0],
+            [0.0, 0.5, 3.0],
+            [0.0, 0.5, 3.0],
+        ],
+    )
+
+    for i_ax, xp in enumerate(axial_pos):
+        for i_lat, yp in enumerate(lateral_pos):
+            xi = int(xp / grid.dx)
+            yi = int(yp / grid.dx)
+            rr, cc = np.ogrid[
+                -xi : scatterer.shape[0] - xi,
+                -yi : scatterer.shape[1] - yi,
+            ]
+            mask = rr**2 + cc**2 <= (target_radius_m / grid.dx) ** 2
+            scatterer -= 1.0
+            scatterer[mask] *= ratio[i_ax, i_lat]
+            scatterer += 1.0
+
+    return scatterer
+
+
+def _make_input_signal(
+    grid: fullwave.Grid,
+    transducer: fullwave.Transducer,
+    element_layer_px: int,
+    p_max: float = 1e5,
+) -> np.ndarray:
+    """Build a 0-degree plane wave input signal (no steering delay)."""
+    input_signal = np.zeros((transducer.n_sources, grid.nt))
+    n_y = input_signal.shape[0] // element_layer_px  # sources per layer; loop-invariant
+    for i in range(input_signal.shape[0]):
+        i_layer = i // n_y
+        input_signal[i] = fullwave.utils.pulse.gaussian_modulated_sinusoidal_signal(
+            nt=grid.nt,
+            f0=grid.f0,
+            duration=grid.duration,
+            ncycles=2,
+            drop_off=2,
+            p0=p_max,
+            i_layer=i_layer,
+            dt_for_layer_delay=grid.dt,
+            cfl_for_layer_delay=grid.cfl,
+            delay_sec=0.0,
+        )
+    return input_signal
+
+
+# ---------------------------------------------------------------------------
+# Simulation sub-steps
+# ---------------------------------------------------------------------------
+
+
+def _build_medium(
+    grid: fullwave.Grid,
+    c0: float,
+) -> fullwave.Medium:
+    """Build the acoustic medium with echoic scattering targets."""
+    rng = np.random.default_rng(42)
+    scatterer, _ = fullwave.utils.generate_scatterer(
+        grid=grid,
+        ratio_scatterer_to_total_grid=0.38,
+        scatter_value_std=0.02 / 2,
+        rng=rng,
+    )
+    scatterer = _make_echoic_targets(
+        scatterer,
+        grid,
+        target_radius_m=5e-3,
+        target_spacing_m=15e-3,
+        n_targets_axial=3,
+        n_targets_lateral=3,
+    )
+    return fullwave.Medium(
+        grid,
+        sound_speed=np.ones(grid.shape) * c0,
+        density=np.ones(grid.shape) * 1000 * scatterer,
+        alpha_coeff=np.ones(grid.shape) * 0.5,
+        alpha_power=np.ones(grid.shape) * 1.1,
+        beta=np.zeros(grid.shape),
+        air_map=np.zeros(grid.shape),
+    )
+
+
+def _build_transducer(
+    grid: fullwave.Grid,
+    domain_size: tuple[float, float],
+) -> tuple[fullwave.Transducer, int]:
+    """Build the 128-element linear transducer and return it with element_layer_px."""
+    element_layer_px = 4
+    transducer_width_m = 38e-3
+    transducer_geometry = fullwave.TransducerGeometry(
+        grid,
+        number_elements=128,
+        element_width_m=0.298e-3 - 0.048e-3,
+        element_spacing_m=0.048e-3,
+        element_layer_px=element_layer_px,
+        position_m=(0, (domain_size[1] - transducer_width_m) / 2),
+        radius=float("inf"),
+    )
+    transducer = fullwave.Transducer(
+        transducer_geometry=transducer_geometry,
+        grid=grid,
+        sampling_modulus_time=7,
+    )
+    transducer.set_signal(_make_input_signal(grid, transducer, element_layer_px))
+    return transducer, element_layer_px
+
+
+def _run_simulation(
+    work_dir: Path,
+    grid: fullwave.Grid,
+    medium: fullwave.Medium,
+    transducer: fullwave.Transducer,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Run the solver and return (raw_output, hp_filtered_output)."""
+    solver = fullwave.Solver(
+        work_dir=work_dir,
+        grid=grid,
+        medium=medium,
+        transducer=transducer,
+    )
+    raw_output = solver.run(simulation_dir_name="txrx_raw", is_static_map=True)
+    raw_output = transducer.post_process_sensor_output(raw_output, average_surface_signals=True)
+    shutil.rmtree(work_dir / "txrx_raw")
+
+    # Equivalent to passing highpass_cutoff_mhz=0.5 directly to solver.run()
+    dt_rec = grid.dt * transducer.sampling_modulus_time
+    filtered_output = apply_filter(raw_output, dt=dt_rec, f_low_hz=0.5e6, use_gpu=False)
+    return raw_output, filtered_output
+
+
+def _plot_results(
+    work_dir: Path,
+    grid: fullwave.Grid,
+    transducer: fullwave.Transducer,
+    raw_output: np.ndarray,
+    filtered_output: np.ndarray,
+    f0: float,
+) -> None:
+    """Plot time traces and amplitude spectra for three representative elements."""
+    dt_rec = grid.dt * transducer.sampling_modulus_time
+    n_t_rec = raw_output.shape[1]
+    t_us = np.arange(n_t_rec) * dt_rec * 1e6
+    freqs_mhz = np.fft.rfftfreq(n_t_rec, d=dt_rec) / 1e6
+
+    def _db(sig: np.ndarray) -> np.ndarray:
+        amp = np.abs(np.fft.rfft(sig))
+        return 20 * np.log10(np.maximum(amp / (amp.max() + 1e-12), 1e-5))
+
+    n_elem = raw_output.shape[0]
+    elem_indices = [n_elem // 4, n_elem // 2, 3 * n_elem // 4]
+
+    fig, axes = plt.subplots(3, 2, figsize=(13, 10))
+    fig.suptitle("Plane wave — sensor output before / after HP filter (0.5 MHz)", fontsize=12)
+
+    for row, idx in enumerate(elem_indices):
+        ax_t, ax_f = axes[row, 0], axes[row, 1]
+        raw_trace, filt_trace = raw_output[idx], filtered_output[idx]
+
+        ax_t.plot(t_us, raw_trace, color="tab:gray", lw=0.7, alpha=0.7, label="Raw")
+        ax_t.plot(t_us, filt_trace, color="tab:orange", lw=0.9, label="HP 0.5 MHz")
+        ax_t.set_ylabel("Pressure")
+        ax_t.set_title(f"element {idx}")
+        ax_t.legend(fontsize=8)
+        ax_t.set_xlim(t_us[0], t_us[-1])
+        if row == 2:
+            ax_t.set_xlabel("Time (µs)")
+
+        ax_f.plot(freqs_mhz, _db(raw_trace), color="tab:gray", lw=0.7, alpha=0.7, label="Raw")
+        ax_f.plot(freqs_mhz, _db(filt_trace), color="tab:orange", lw=0.9, label="HP 0.5 MHz")
+        ax_f.axvline(f0 / 1e6, color="steelblue", lw=0.8, ls="--", label=f"f0={f0 / 1e6:.0f} MHz")
+        ax_f.set_xlim(0, f0 / 1e6 * 3)
+        ax_f.set_ylim(-80, 5)
+        ax_f.set_ylabel("Amplitude (dB)")
+        ax_f.legend(fontsize=8)
+        if row == 2:
+            ax_f.set_xlabel("Frequency (MHz)")
+
+    axes[0, 0].set_title(f"Time traces — {axes[0, 0].get_title()}", fontsize=10)
+    axes[0, 1].set_title(f"Amplitude spectra — {axes[0, 1].get_title()}", fontsize=10)
+    plt.tight_layout()
+    out_fig = work_dir / "sensor_before_after_filter.png"
+    plt.savefig(out_fig, dpi=150)
+    print(f"Saved figure to {out_fig}")
+
+
+def _print_summary(raw_output: np.ndarray, filtered_output: np.ndarray) -> None:
+    """Print DC drift statistics before and after filtering."""
+    drift_raw = raw_output.mean(axis=1)
+    drift_filt = filtered_output.mean(axis=1)
+    print(
+        f"\nDC drift (mean across elements):"
+        f"\n  Raw      - mean={drift_raw.mean():.4f}, std={drift_raw.std():.4f}"
+        f"\n  Filtered - mean={drift_filt.mean():.6f}, std={drift_filt.std():.6f}",
+    )
+    print(
+        "\nTip: pass highpass_cutoff_mhz=0.5 directly to solver.run() to apply"
+        " the same filter automatically before the result is returned.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> None:
+    """Run plane wave simulation and compare raw vs high-pass filtered sensor output."""
+    work_dir = Path("./outputs/plane_wave_with_filter")
+    work_dir.mkdir(parents=True, exist_ok=True)
+
+    domain_size = (4.5e-2, 4.5e-2)  # [axial, lateral] m
+    f0 = 2e6
+    c0 = 1540
+
+    grid = fullwave.Grid(domain_size, f0, domain_size[0] / c0 * 2.3, c0=c0, ppw=12, cfl=0.4)
+    medium = _build_medium(grid, c0)
+    transducer, _ = _build_transducer(grid, domain_size)
+    raw_output, filtered_output = _run_simulation(work_dir, grid, medium, transducer)
+    _plot_results(work_dir, grid, transducer, raw_output, filtered_output, f0)
+    _print_summary(raw_output, filtered_output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/signal_filter/signal_filter_example.py b/examples/signal_filter/signal_filter_example.py
new file mode 100644
index 0000000..33bf8a5
--- /dev/null
+++ b/examples/signal_filter/signal_filter_example.py
@@ -0,0 +1,130 @@
+"""Demonstrate the built-in signal filter (no GPU / no simulation required).
+
+This script:
+  1. Builds a synthetic sensor trace that mimics typical PML drift artefacts:
+     a slow DC ramp + a 3 MHz ultrasound pulse buried in broadband noise.
+  2. Applies a high-pass filter (0.5 MHz) to remove the drift.
+  3. Applies a band-pass filter (1-5 MHz) to isolate the ultrasound band.
+  4. Plots the time traces and their amplitude spectra side-by-side.
+
+Run with:
+    uv run python examples/signal_filter/signal_filter_example.py
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from fullwave.utils.signal_filter import apply_filter
+
+
+def main() -> None:
+    """Build a synthetic sensor trace and demonstrate high-pass and band-pass filtering."""
+    # ---------------------------------------------------------------------------
+    # Simulation-like parameters
+    # ---------------------------------------------------------------------------
+    f0 = 3e6  # center frequency, Hz
+    dt = 1 / (f0 * 20)  # ~20 samples per period
+    n_t = 2048
+    t = np.arange(n_t) * dt  # time axis, seconds
+
+    # ---------------------------------------------------------------------------
+    # Synthetic signal: DC drift + 3 MHz pulse + broadband noise
+    # ---------------------------------------------------------------------------
+    rng = np.random.default_rng(42)
+
+    # Slow PML drift: linear ramp (typical artifact)
+    drift = np.linspace(0, 2.0, n_t)
+
+    # Short Gaussian-windowed 3 MHz pulse arriving at t = 3 µs
+    t_pulse = 3e-6
+    sigma = 0.5e-6
+    envelope = np.exp(-0.5 * ((t - t_pulse) / sigma) ** 2)
+    pulse = envelope * np.sin(2 * np.pi * f0 * t)
+
+    # White noise floor at -30 dB relative to pulse
+    noise = rng.standard_normal(n_t) * 0.03
+
+    raw = drift + pulse + noise  # shape [n_t]
+
+    # Wrap as [n_sensors, n_t] (one channel)
+    data = raw[np.newaxis, :]
+
+    # ---------------------------------------------------------------------------
+    # Apply filters
+    # ---------------------------------------------------------------------------
+    hp_filtered = apply_filter(data, dt, f_low_hz=0.5e6, use_gpu=False)
+    bp_filtered = apply_filter(data, dt, f_low_hz=1e6, f_high_hz=5e6, use_gpu=False)
+
+    # ---------------------------------------------------------------------------
+    # Amplitude spectrum helper
+    # ---------------------------------------------------------------------------
+    freqs_mhz = np.fft.rfftfreq(n_t, d=dt) / 1e6
+
+    def spectrum_db(sig: np.ndarray) -> np.ndarray:
+        """Return amplitude spectrum in dB (normalised to peak, floored at -100 dB)."""
+        amp = np.abs(np.fft.rfft(sig))
+        amp_norm = amp / (amp.max() + 1e-12)
+        return 20 * np.log10(np.maximum(amp_norm, 1e-5))  # floor at -100 dB
+
+    # ---------------------------------------------------------------------------
+    # Plot — 2 rows (time / spectrum) x 3 columns (raw / high-pass / band-pass)
+    # Each filtered column overlays the raw signal so before/after is clear.
+    # ---------------------------------------------------------------------------
+    t_us = t * 1e6  # µs for display
+
+    fig, axes = plt.subplots(2, 3, figsize=(14, 7))
+    fig.suptitle("Built-in signal filter — example", fontsize=13)
+
+    filters = [
+        ("Raw (drift + pulse + noise)", data[0], None, "tab:blue"),
+        ("High-pass 0.5 MHz", hp_filtered[0], data[0], "tab:orange"),
+        ("Band-pass 1-5 MHz", bp_filtered[0], data[0], "tab:green"),
+    ]
+
+    for col, (label, sig, before, color) in enumerate(filters):
+        ax_t = axes[0, col]
+        ax_f = axes[1, col]
+
+        # --- time trace ---
+        if before is not None:
+            ax_t.plot(t_us, before, color="tab:gray", lw=0.6, alpha=0.5, label="Before")
+        ax_t.plot(t_us, sig, color=color, lw=0.9, label="After" if before is not None else label)
+        ax_t.set_title(label, fontsize=10)
+        ax_t.set_xlabel("Time (µs)")
+        ax_t.set_ylabel("Amplitude")
+        ax_t.set_xlim(t_us[0], t_us[-1])
+        if before is not None:
+            ax_t.legend(fontsize=8)
+
+        # --- amplitude spectrum ---
+        if before is not None:
+            ax_f.plot(
+                freqs_mhz,
+                spectrum_db(before),
+                color="tab:gray",
+                lw=0.6,
+                alpha=0.5,
+                label="Before",
+            )
+        ax_f.plot(
+            freqs_mhz,
+            spectrum_db(sig),
+            color=color,
+            lw=0.9,
+            label="After" if before is not None else label,
+        )
+        ax_f.set_xlabel("Frequency (MHz)")
+        ax_f.set_ylabel("Amplitude (dB)")
+        ax_f.set_xlim(0, 10)
+        ax_f.set_ylim(-80, 5)
+        ax_f.axvline(f0 / 1e6, color="gray", lw=0.8, ls="--", label=f"f₀ = {f0 / 1e6:.0f} MHz")
+        ax_f.legend(fontsize=8)
+
+    plt.tight_layout()
+    out_path = "signal_filter_example.png"
+    plt.savefig(out_path, dpi=150)
+    print(f"Saved figure to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 3573850..0c3111c 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -15,6 +15,7 @@
     MemoryTempfile,
     check_functions,
 )
+from fullwave.utils.signal_filter import apply_filter
 
 from .binary_manager import ensure_binary
 from .cuda_utils import get_cuda_architecture, retrieve_cuda_version
@@ -638,6 +639,80 @@ def _check_input(
         error_msg = f"{path_fullwave_simulation_bin} does not exist"
         assert path_fullwave_simulation_bin.exists(), error_msg
 
+    @staticmethod
+    def _validate_filter_params(
+        highpass_cutoff_mhz: float | None,
+        bandpass_cutoff_mhz: tuple[float, float] | None,
+        *,
+        load_results: bool,
+    ) -> None:
+        """Validate high-pass / band-pass filter arguments passed to run().
+
+        Raises
+        ------
+        ValueError
+            If both filter options are set simultaneously, or if a filter is
+            requested without ``load_results=True``.
+
+        """
+        if highpass_cutoff_mhz is not None and bandpass_cutoff_mhz is not None:
+            error_msg = (
+                "highpass_cutoff_mhz and bandpass_cutoff_mhz cannot both be specified. "
+                "Use highpass_cutoff_mhz for a simple high-pass filter or "
+                "bandpass_cutoff_mhz for a band-pass filter."
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+        if (highpass_cutoff_mhz is not None or bandpass_cutoff_mhz is not None) and (
+            not load_results
+        ):
+            error_msg = (
+                "Filtering requires load_results=True. "
+                "Set load_results=True or disable the filter options."
+            )
+            logger.error(error_msg)
+            raise ValueError(error_msg)
+
+    @staticmethod
+    def _apply_output_filter(
+        result: NDArray[np.float64],
+        dt: float,
+        highpass_cutoff_mhz: float | None,
+        bandpass_cutoff_mhz: tuple[float, float] | None,
+    ) -> NDArray[np.float64]:
+        """Apply the optional frequency filter to the reshaped sensor output.
+
+        Parameters
+        ----------
+        result : NDArray[np.float64]
+            Sensor data shaped ``[n_sensors, n_t]``.
+        dt : float
+            Grid time step in seconds.
+        highpass_cutoff_mhz : float | None
+            High-pass edge in MHz, or ``None``.
+        bandpass_cutoff_mhz : tuple[float, float] | None
+            ``(f_low_mhz, f_high_mhz)`` band-pass edges, or ``None``.
+
+        Returns
+        -------
+        NDArray[np.float64]
+            Filtered (or unchanged) sensor data.
+
+        """
+        if highpass_cutoff_mhz is not None:
+            logger.info("Applying high-pass filter at %.4g MHz...", highpass_cutoff_mhz)
+            return apply_filter(result, dt, f_low_hz=highpass_cutoff_mhz * 1e6)
+        if bandpass_cutoff_mhz is not None:
+            f_low_hz = bandpass_cutoff_mhz[0] * 1e6
+            f_high_hz = bandpass_cutoff_mhz[1] * 1e6
+            logger.info(
+                "Applying band-pass filter %.4g-%.4g MHz...",
+                bandpass_cutoff_mhz[0],
+                bandpass_cutoff_mhz[1],
+            )
+            return apply_filter(result, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz)
+        return result
+
     @staticmethod
     def _reshape_sensor_data(
         raw_sensor_output: NDArray[np.float64],
@@ -681,6 +756,8 @@ def run(
         load_results: bool = True,
         generate_input_only: bool = False,
         release_after_write: bool = False,
+        highpass_cutoff_mhz: float | None = None,
+        bandpass_cutoff_mhz: tuple[float, float] | None = None,
     ) -> NDArray[np.float64] | Path:
         r"""Run the fullwave simulation and return the result as a NumPy array.
 
@@ -740,6 +817,18 @@ def run(
             If True, the memory used by the input files will be released after writing them to disk.
             This is useful when run_on_memory is True to free up memory space for the simulation
             or when the input files are large. Default is False.
+        highpass_cutoff_mhz : float | None
+            Apply a high-pass filter to the sensor recordings after the simulation.
+            Removes low-frequency PML drift by attenuating frequencies below this value (in MHz).
+            Uses a cosine (Hann) taper to avoid Gibbs ringing.
+            Cannot be combined with ``bandpass_cutoff_mhz``.
+            Requires ``load_results=True``.  Default is ``None`` (no filtering).
+        bandpass_cutoff_mhz : tuple[float, float] | None
+            Apply a band-pass filter ``(f_low_mhz, f_high_mhz)`` to the sensor recordings
+            after the simulation.  Retains only frequencies inside the specified band.
+            Uses cosine (Hann) tapers on both edges.
+            Cannot be combined with ``highpass_cutoff_mhz``.
+            Requires ``load_results=True``.  Default is ``None`` (no filtering).
 
         Returns
         -------
@@ -757,6 +846,8 @@ def run(
             Static map simulations require input files to be stored on a disk.
             run_on_memory, on the other hand, removes the input files
             after the simulation is complete.
+            Also raised if both ``highpass_cutoff_mhz`` and ``bandpass_cutoff_mhz`` are given,
+            or if either filter option is set but ``load_results=False``.
 
         """
         # self._save_data_for_beamforming()
@@ -778,6 +869,12 @@ def run(
             logger.error(error_msg)
             raise ValueError(error_msg)
 
+        self._validate_filter_params(
+            highpass_cutoff_mhz,
+            bandpass_cutoff_mhz,
+            load_results=load_results,
+        )
+
         start_time = time.time()
         extended_medium = self.pml_builder.run(use_pml=self.use_pml)
         end_pml_builder_time = time.time()
@@ -867,7 +964,13 @@ def run(
                 f"{end_loading_time - start_loading_time:.2e} seconds."
             )
             logger.info(message)
-            return result
+
+            return self._apply_output_filter(
+                result,
+                self.grid.dt,
+                highpass_cutoff_mhz,
+                bandpass_cutoff_mhz,
+            )
         # if load_results is False, return the raw result
         # which is a list of file names
         return sim_result
diff --git a/fullwave/utils/__init__.py b/fullwave/utils/__init__.py
index 1af42f7..6175687 100644
--- a/fullwave/utils/__init__.py
+++ b/fullwave/utils/__init__.py
@@ -1,6 +1,6 @@
 """misc utils for fullwave package."""
 
-from . import pulse, relaxation_parameters, signal_process
+from . import pulse, relaxation_parameters, signal_filter, signal_process
 from .memory_tempfile import MemoryTempfile
 from .scatterer import (
     generate_resolution_based_scatterer,
@@ -14,5 +14,6 @@
     "generate_scatterer",
     "pulse",
     "relaxation_parameters",
+    "signal_filter",
     "signal_process",
 ]
diff --git a/fullwave/utils/signal_filter.py b/fullwave/utils/signal_filter.py
new file mode 100644
index 0000000..d238c18
--- /dev/null
+++ b/fullwave/utils/signal_filter.py
@@ -0,0 +1,161 @@
+"""FFT-based frequency-domain filtering for sensor data.
+
+GPU backend: CuPy when available; falls back silently to NumPy.
+No new hard dependencies — CuPy is already listed under the ``examples`` optional extra.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+
+logger = logging.getLogger("__main__." + __name__)
+
+# Module-level cache to avoid repeated import overhead
+_CUPY_AVAILABLE: bool | None = None
+
+
+def _check_cupy() -> bool:
+    """Return True if CuPy is importable; result is cached after the first call."""
+    global _CUPY_AVAILABLE  # noqa: PLW0603
+    if _CUPY_AVAILABLE is None:
+        try:
+            import cupy  # noqa: F401
+
+            _CUPY_AVAILABLE = True
+        except ImportError:
+            _CUPY_AVAILABLE = False
+    return _CUPY_AVAILABLE
+
+
+def _build_frequency_mask(
+    n_fft: int,
+    dt: float,
+    f_low_hz: float | None = None,
+    f_high_hz: float | None = None,
+    taper_ratio: float = 0.1,
+) -> NDArray[np.float64]:
+    """Build a frequency-domain gain mask with cosine (Hann) tapers.
+
+    Parameters
+    ----------
+    n_fft : int
+        FFT length, i.e. the number of time samples ``n_t`` (no zero-padding is applied).
+    dt : float
+        Simulation time step in seconds.
+    f_low_hz : float | None
+        High-pass cut-off frequency in Hz.  Frequencies below this value are
+        attenuated.  The mask transitions smoothly from 0 to 1 in a window of
+        width ``f_low_hz * taper_ratio`` centred at ``f_low_hz``.
+    f_high_hz : float | None
+        Low-pass cut-off frequency in Hz.  Frequencies above this value are
+        attenuated.  The mask transitions smoothly from 1 to 0 in a window of
+        width ``f_high_hz * taper_ratio`` centred at ``f_high_hz``.
+    taper_ratio : float
+        Fractional width of each cosine taper relative to its centre frequency.
+        Default is 0.1 (10 %).  ``0`` gives a hard edge; the cut-off bin itself passes.
+
+    Returns
+    -------
+    NDArray[np.float64]
+        Frequency-domain gain mask of shape ``[n_fft // 2 + 1]``.
+
+    """
+    freqs = np.fft.rfftfreq(n_fft, d=dt)
+    mask = np.ones(len(freqs), dtype=np.float64)
+
+    if f_low_hz is not None:
+        half_width = f_low_hz * taper_ratio / 2.0
+        f_start = f_low_hz - half_width
+        f_end = f_low_hz + half_width
+        width = f_end - f_start  # 0 when f_low_hz == 0 or taper_ratio == 0
+
+        in_taper = (freqs >= f_start) & (freqs < f_end)  # half-open: empty when width == 0
+        below_taper = freqs < f_start
+
+        mask[below_taper] = 0.0
+        mask[in_taper] = 0.5 * (1.0 - np.cos(np.pi * (freqs[in_taper] - f_start) / width))
+
+    if f_high_hz is not None:
+        half_width = f_high_hz * taper_ratio / 2.0
+        f_start = f_high_hz - half_width
+        f_end = f_high_hz + half_width
+        width = f_end - f_start  # 0 when f_high_hz == 0 or taper_ratio == 0
+
+        in_taper = (freqs > f_start) & (freqs <= f_end)  # half-open: empty when width == 0
+        above_taper = freqs > f_end
+
+        lp_taper = np.ones(len(freqs), dtype=np.float64)
+        lp_taper[in_taper] = 0.5 * (1.0 + np.cos(np.pi * (freqs[in_taper] - f_start) / width))
+        lp_taper[above_taper] = 0.0
+        mask *= lp_taper
+
+    return mask
+
+
+def apply_filter(
+    data: NDArray[np.float64],
+    dt: float,
+    f_low_hz: float | None = None,
+    f_high_hz: float | None = None,
+    taper_ratio: float = 0.1,
+    *,
+    use_gpu: bool = True,
+) -> NDArray[np.float64]:
+    """Apply a frequency-domain filter to sensor data along its last (time) axis.
+
+    The filter is built as a cosine-tapered gain mask (see :func:`_build_frequency_mask`).
+    When CuPy is available and ``use_gpu=True``, the FFT operations run on the GPU
+    for maximum throughput; otherwise NumPy is used transparently.
+
+    Parameters
+    ----------
+    data : NDArray[np.float64]
+        Sensor time traces with time on the last axis, e.g. ``[n_sensors, n_t]`` or ``[n_t]``.
+    dt : float
+        Simulation time step in seconds.
+    f_low_hz : float | None
+        High-pass edge frequency in Hz.  Pass ``None`` to skip high-passing.
+    f_high_hz : float | None
+        Low-pass edge frequency in Hz.  Pass ``None`` to skip low-passing.
+    taper_ratio : float
+        Fractional taper width relative to each cut-off frequency.  Default 0.1.
+    use_gpu : bool
+        If ``True`` (default), attempt to use CuPy for GPU-accelerated FFTs.
+        Falls back to NumPy silently if CuPy is unavailable.
+
+    Returns
+    -------
+    NDArray[np.float64]
+        Filtered data, same shape as ``data``.
+
+    """
+    n_t = data.shape[-1]
+    mask = _build_frequency_mask(
+        n_t,
+        dt,
+        f_low_hz=f_low_hz,
+        f_high_hz=f_high_hz,
+        taper_ratio=taper_ratio,
+    )
+
+    if use_gpu and _check_cupy():
+        import cupy as cp
+
+        logger.debug("apply_filter: using CuPy GPU backend")
+        data_gpu = cp.asarray(data, dtype=cp.float64)
+        mask_gpu = cp.asarray(mask, dtype=cp.float64)
+        spec = cp.fft.rfft(data_gpu, axis=-1)
+        spec *= mask_gpu  # 1-D mask broadcasts over every leading axis
+        filtered = cp.fft.irfft(spec, n=n_t, axis=-1)
+        return cp.asnumpy(filtered)
+
+    logger.debug("apply_filter: using NumPy CPU backend")
+    spec = np.fft.rfft(data, axis=-1)
+    spec *= mask  # 1-D mask broadcasts over every leading axis
+    return np.fft.irfft(spec, n=n_t, axis=-1)
diff --git a/tests/solver/test_signal_filter.py b/tests/solver/test_signal_filter.py
new file mode 100644
index 0000000..8738a66
--- /dev/null
+++ b/tests/solver/test_signal_filter.py
@@ -0,0 +1,261 @@
+"""Unit tests for fullwave.utils.signal_filter — no GPU required."""
+
+from unittest.mock import MagicMock
+
+import numpy as np
+import pytest
+
+import fullwave
+from fullwave.solver.solver import Solver
+from fullwave.utils.signal_filter import _build_frequency_mask, apply_filter
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_signal(dt: float, n_t: int, frequencies_hz: list[float]) -> np.ndarray:
+    """Sum of pure sinusoids, shape [1, n_t]."""
+    t = np.arange(n_t) * dt
+    sig = sum(np.sin(2 * np.pi * f * t) for f in frequencies_hz)
+    return np.asarray(sig, dtype=np.float64)[np.newaxis, :]
+
+
+# ---------------------------------------------------------------------------
+# _build_frequency_mask
+# ---------------------------------------------------------------------------
+
+
+def test_mask_shape():
+    n_fft = 512
+    dt = 1e-8
+    mask = _build_frequency_mask(n_fft, dt)
+    assert mask.shape == (n_fft // 2 + 1,)
+
+
+def test_mask_all_ones_when_no_cutoff():
+    mask = _build_frequency_mask(256, 1e-8)
+    np.testing.assert_array_equal(mask, np.ones(256 // 2 + 1))
+
+
+def test_mask_highpass_dc_zero():
+    """DC bin (index 0) must be zero for any high-pass cutoff > 0."""
+    mask = _build_frequency_mask(256, 1e-8, f_low_hz=1e6)
+    assert mask[0] == 0.0
+
+
+def test_mask_highpass_passband_one():
+    """Bins well above the cutoff should be ≈ 1."""
+    dt = 1e-8
+    n_fft = 1024
+    f_low_hz = 1e6
+    mask = _build_frequency_mask(n_fft, dt, f_low_hz=f_low_hz)
+    freqs = np.fft.rfftfreq(n_fft, d=dt)
+    passband = freqs > f_low_hz * 1.1
+    np.testing.assert_allclose(mask[passband], 1.0, atol=1e-10)
+
+
+def test_mask_bandpass_range():
+    """Bins well inside the band ≈ 1; bins well outside ≈ 0."""
+    dt = 1e-8
+    n_fft = 1024
+    f_low_hz = 1e6
+    f_high_hz = 5e6
+    mask = _build_frequency_mask(n_fft, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz)
+    freqs = np.fft.rfftfreq(n_fft, d=dt)
+
+    passband = (freqs > f_low_hz * 1.2) & (freqs < f_high_hz * 0.8)
+    stopband_low = freqs < f_low_hz * 0.8
+    stopband_high = freqs > f_high_hz * 1.2
+
+    np.testing.assert_allclose(mask[passband], 1.0, atol=1e-10)
+    np.testing.assert_allclose(mask[stopband_low], 0.0, atol=1e-10)
+    np.testing.assert_allclose(mask[stopband_high], 0.0, atol=1e-10)
+
+
+# ---------------------------------------------------------------------------
+# apply_filter — functional tests (CPU only via use_gpu=False)
+# ---------------------------------------------------------------------------
+
+
+def test_no_filter_passthrough():
+    """With no cutoffs the output must equal the input (up to float rounding)."""
+    dt = 1e-8
+    n_t = 512
+    rng = np.random.default_rng(0)
+    data = rng.standard_normal((8, n_t))
+    result = apply_filter(data, dt, use_gpu=False)
+    np.testing.assert_allclose(result, data, atol=1e-10)
+
+
+def test_highpass_removes_dc():
+    """DC + sine: after high-pass the DC component should vanish, sine should survive."""
+    dt = 1e-8
+    n_t = 2048
+    f_signal_hz = 3e6  # 3 MHz — well above the 0.5 MHz cutoff
+    f_low_hz = 0.5e6
+
+    t = np.arange(n_t) * dt
+    dc = 5.0 * np.ones(n_t)
+    sine = np.sin(2 * np.pi * f_signal_hz * t)
+    data = (dc + sine)[np.newaxis, :]
+
+    result = apply_filter(data, dt, f_low_hz=f_low_hz, use_gpu=False)
+
+    # DC (mean) should be nearly zero
+    assert abs(result[0].mean()) < 0.05
+
+    # Sine amplitude should be close to 1.0 — check via RMS
+    rms = np.sqrt(np.mean(result[0] ** 2))
+    assert 0.6 < rms < 1.1, f"Expected RMS ~0.7, got {rms}"
+
+
+def test_bandpass_filter_gain_matches_mask():
+    """Gold-standard performance test: measured per-frequency gain must match the mask.
+
+    The mask returned by ``_build_frequency_mask`` is the specification.  We feed
+    pure sinusoids at two frequencies (one well inside the band, one well outside),
+    run the filter, then recover the actual gain at each frequency via FFT.  The
+    measured gain must agree with the mask value at that bin to within 1 %.
+    """
+    dt = 1e-8
+    n_t = 8192  # large n for clean spectral resolution
+    f_inband_hz = 3e6  # 3 MHz — flat passband, mask ≈ 1
+    f_outband_hz = 0.2e6  # 0.2 MHz — well into stopband, mask ≈ 0
+
+    f_low_hz = 1e6
+    f_high_hz = 5e6
+
+    t = np.arange(n_t) * dt
+    data = (np.sin(2 * np.pi * f_inband_hz * t) + np.sin(2 * np.pi * f_outband_hz * t))[
+        np.newaxis,
+        :,
+    ]
+
+    result = apply_filter(data, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz, use_gpu=False)
+
+    # --- gold standard: mask values at the exact test frequencies ---
+    freqs = np.fft.rfftfreq(n_t, d=dt)
+    mask = _build_frequency_mask(n_t, dt, f_low_hz=f_low_hz, f_high_hz=f_high_hz)
+
+    idx_inband = int(np.argmin(np.abs(freqs - f_inband_hz)))
+    idx_outband = int(np.argmin(np.abs(freqs - f_outband_hz)))
+
+    expected_gain_inband = mask[idx_inband]  # should be ≈ 1.0
+    expected_gain_outband = mask[idx_outband]  # should be ≈ 0.0
+
+    # --- measured gains via amplitude spectrum ---
+    input_spec = np.abs(np.fft.rfft(data[0]))
+    output_spec = np.abs(np.fft.rfft(result[0]))
+
+    measured_gain_inband = output_spec[idx_inband] / input_spec[idx_inband]
+    measured_gain_outband = output_spec[idx_outband] / input_spec[idx_outband]
+
+    tol = 0.01  # 1 % absolute tolerance on gain
+
+    assert abs(measured_gain_inband - expected_gain_inband) < tol, (
+        f"In-band gain {measured_gain_inband:.4f} deviates from mask {expected_gain_inband:.4f}"
+    )
+    assert abs(measured_gain_outband - expected_gain_outband) < tol, (
+        f"Out-of-band gain {measured_gain_outband:.4f} deviates from mask "
+        f"{expected_gain_outband:.4f}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Validation via solver.run() — no GPU, no binary needed
+# ---------------------------------------------------------------------------
+
+
+def test_highpass_and_bandpass_raises():
+    """solver.run() must raise ValueError when both filter params are set."""
+    grid = MagicMock(spec=fullwave.Grid)
+    grid.is_3d = False
+    grid.ppw = 4
+    grid.nx = 100
+    grid.ny = 100
+    grid.nz = 1
+    grid.dt = 1e-8
+
+    medium = MagicMock(spec=fullwave.Medium)
+    medium.n_relaxation_mechanisms = 2
+    medium.use_isotropic_relaxation = True
+
+    source = MagicMock(spec=fullwave.Source)
+    sensor = MagicMock(spec=fullwave.Sensor)
+
+    fake_bin = MagicMock()
+    fake_bin.exists.return_value = True
+
+    # Bypass __init__ entirely, just test the validation in run()
+    solver = object.__new__(Solver)
+    solver.run_on_memory = False
+    solver.work_dir = MagicMock()
+    solver.grid = grid
+    solver.medium = medium
+    solver.is_3d = False
+    solver.use_gpu = False
+    solver.use_exponential_attenuation = False
+    solver.use_isotropic_relaxation = True
+    solver.n_relax_mechanisms = 2
+    solver.source = source
+    solver.sensor = sensor
+    solver.transducer = None
+    solver.path_fullwave_simulation_bin = fake_bin
+    solver.cuda_device_id = None
+    solver.use_pml = True
+    solver.save_gpu_memory = False
+
+    pml_builder = MagicMock()
+    solver.pml_builder = pml_builder
+    solver.fullwave_launcher = MagicMock()
+
+    with pytest.raises(ValueError, match="cannot both be specified"):
+        solver.run(
+            highpass_cutoff_mhz=0.5,
+            bandpass_cutoff_mhz=(1.0, 5.0),
+        )
+
+
+def test_filter_requires_load_results():
+    """Passing a filter option with load_results=False must raise ValueError."""
+    grid = MagicMock(spec=fullwave.Grid)
+    grid.is_3d = False
+    grid.ppw = 4
+    grid.nx = 100
+    grid.ny = 100
+    grid.nz = 1
+    grid.dt = 1e-8
+
+    medium = MagicMock(spec=fullwave.Medium)
+    medium.n_relaxation_mechanisms = 2
+    medium.use_isotropic_relaxation = True
+
+    source = MagicMock(spec=fullwave.Source)
+    sensor = MagicMock(spec=fullwave.Sensor)
+
+    solver = object.__new__(Solver)
+    solver.run_on_memory = False
+    solver.work_dir = MagicMock()
+    solver.grid = grid
+    solver.medium = medium
+    solver.is_3d = False
+    solver.use_gpu = False
+    solver.use_exponential_attenuation = False
+    solver.use_isotropic_relaxation = True
+    solver.n_relax_mechanisms = 2
+    solver.source = source
+    solver.sensor = sensor
+    solver.transducer = None
+    solver.path_fullwave_simulation_bin = MagicMock()
+    solver.cuda_device_id = None
+    solver.use_pml = True
+    solver.save_gpu_memory = False
+    solver.pml_builder = MagicMock()
+    solver.fullwave_launcher = MagicMock()
+
+    with pytest.raises(ValueError, match="load_results=True"):
+        solver.run(
+            highpass_cutoff_mhz=0.5,
+            load_results=False,
+        )

From a7d9f76f10e4b272267f8d5ccf30ac6878615f55 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Tue, 24 Feb 2026 18:59:38 -0500
Subject: [PATCH 08/31] =?UTF-8?q?Bump=20version:=201.2.6-dev1=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 78bde87..126318e 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev1"
+current_version = "1.2.6-dev2"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index 9b6406e..59833f2 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev1"
+    __version__ = "1.2.6-dev2"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 53ddbd4..a273ac6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev1" # Update via bump-my-version, not manually
+version = "1.2.6-dev2" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index d4a22e0..0a4998f 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev1"
+version = "1.2.6.dev2"
 source = { editable = "." }
 dependencies = [
     { name = "joblib" },

From ab6e4359b7e18c797711c49d2d335de15e09c239 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Wed, 25 Feb 2026 10:27:50 -0500
Subject: [PATCH 09/31] update ruff.toml and refactor docstring formatting
 across multiple modules to adhere to Numpy style

---
 .pre-commit-config.yaml                       |   2 +-
 .vscode/settings.json                         |   3 +-
 fullwave/beamformer/beamformer.py             |   3 +-
 fullwave/medium.py                            |  12 +-
 fullwave/sensor.py                            |   5 +-
 fullwave/solver/cuda_utils.py                 |  15 ++-
 fullwave/solver/pml_builder.py                |   8 +-
 fullwave/solver/solver.py                     |   2 +-
 fullwave/solver/utils.py                      |  24 ++--
 fullwave/source.py                            |   2 +-
 fullwave/transducer.py                        |  23 ++--
 fullwave/utils/check_functions.py             |   9 +-
 fullwave/utils/coordinates.py                 |  12 +-
 fullwave/utils/memory_tempfile.py             |  14 +-
 fullwave/utils/plot_utils.py                  |   6 +-
 fullwave/utils/signal_filter.py               |   4 +-
 ruff.toml                                     | 125 +++++++++++-------
 .../presets/test_domain_background.py         |  22 +--
 tests/solver/test_launcher.py                 |  16 +--
 tests/test_source.py                          |   4 +-
 20 files changed, 182 insertions(+), 129 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 24d8fff..ad103ff 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
         args: ["--maxkb=10000"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.11.0
+    rev: v0.15.2
     hooks:
       # Run the linter.
       - id: ruff
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 918a25f..de11a99 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -10,5 +10,6 @@
     }
   },
   "python.analysis.typeCheckingMode": "standard",
-  "python.testing.pytestEnabled": true
+  "python.testing.pytestEnabled": true,
+  "ruff.format.preview": false
 }
diff --git a/fullwave/beamformer/beamformer.py b/fullwave/beamformer/beamformer.py
index f068799..4e327d1 100644
--- a/fullwave/beamformer/beamformer.py
+++ b/fullwave/beamformer/beamformer.py
@@ -14,7 +14,8 @@ class Beamformer:
     For faster implementations, consider using libraries such as
     mach beamformer: https://github.com/Forest-Neurotech/mach
 
-    References:
+    References
+    ----------
     - Fullwave2 BMME890 implementation
     - https://github.com/gfpinton/fullwave_bmme890/blob/master/fullwave2_launcher_imaging_planewave.m
 
diff --git a/fullwave/medium.py b/fullwave/medium.py
index 7d37948..d7feaa2 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -231,7 +231,8 @@ def check_relaxation_param_dict(
     ) -> None:
         """Check if the relaxation parameter updates have valid keys and matching shapes.
 
-        Raises:
+        Raises
+        ------
             ValueError: If the keys do not match the desired keys or
             if the shapes of the values do not match the domain shape.
 
@@ -1147,11 +1148,13 @@ def build(self) -> MediumRelaxationMaps:
         it uses the relaxation parameters look up table
         to generate the relaxation parameters.
 
-        Returns:
+        Returns
+        -------
             MediumRelaxationMaps: An instance of MediumRelaxationMaps
             built from the retrieved relaxation parameters.
 
-        Raises:
+        Raises
+        ------
             ValueError: If an unknown attenuation_builder is specified.
 
         """
@@ -1216,7 +1219,8 @@ def _db_mhz_cm_to_a_exp(
     def build_exponential(self) -> MediumExponentialAttenuation:
         """Build MediumExponentialAttenuation from alpha and power maps.
 
-        Returns:
+        Returns
+        -------
             MediumExponentialAttenuation: An instance of MediumExponentialAttenuation
             built from the alpha and power maps.
 
diff --git a/fullwave/sensor.py b/fullwave/sensor.py
index 7b0edf6..8de59ee 100644
--- a/fullwave/sensor.py
+++ b/fullwave/sensor.py
@@ -21,7 +21,7 @@ class Sensor:
     outcoords: NDArray[np.int64]
     sampling_modulus_time: int = 1
 
-    def __init__(  # noqa: PLR0915
+    def __init__(
         self,
         mask: NDArray[np.bool] | None = None,
         sampling_modulus_time: int = 1,
@@ -189,7 +189,8 @@ def plot(
     ) -> None:
         """Plot the transducer mask, optionally exporting and displaying the figure.
 
-        Raises:
+        Raises
+        ------
             ValueError: If the sensor is 3D because plotting is not supported.
 
         """
diff --git a/fullwave/solver/cuda_utils.py b/fullwave/solver/cuda_utils.py
index e06209f..497b8ac 100644
--- a/fullwave/solver/cuda_utils.py
+++ b/fullwave/solver/cuda_utils.py
@@ -113,7 +113,8 @@ def cuda_api_call(func: Callable) -> Callable:
     Decorator for CUDA API calls
     Raises RuntimeError if the CUDA call does not return CUDA_SUCCESS.
 
-    Returns:
+    Returns
+    -------
         Callable: The wrapped function that checks CUDA API call results.
 
     """
@@ -138,7 +139,8 @@ def cuda_api_call_warn(func: Callable) -> Callable:
 
     Prints a warning message if the CUDA call does not return CUDA_SUCCESS.
 
-    Returns:
+    Returns
+    -------
         Callable: The wrapped function that checks CUDA API call results.
 
     """
@@ -230,7 +232,8 @@ def get_cuda_device_specs() -> list[dict[str, Any]]:
         'cuda_cores': int
     }
 
-    Returns:
+    Returns
+    -------
         A list of dictionaries containing specifications for each CUDA device.
 
     """
@@ -325,7 +328,8 @@ def get_cuda_architecture() -> list[dict[str, Any]]:
         'architecture': str,
     }
 
-    Returns:
+    Returns
+    -------
         A list of dictionaries containing architecture information for each CUDA device.
 
     """
@@ -363,7 +367,8 @@ def get_cuda_architecture() -> list[dict[str, Any]]:
 def retrieve_cuda_version() -> float:
     """Retrieve the CUDA driver version.
 
-    Returns:
+    Returns
+    -------
         str: CUDA version in the format "major.minor" or "unknown" if retrieval fails.
 
     """
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index 016b428..5fbd8eb 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -115,7 +115,7 @@ class PMLBuilder:
     pml_mask_x: NDArray[np.float64] = field(init=False)
     pml_mask_y: NDArray[np.float64] = field(init=False)
 
-    def __init__(  # noqa: PLR0915
+    def __init__(
         self,
         grid: fullwave.Grid,
         medium: fullwave.Medium,
@@ -898,7 +898,7 @@ def _worker(
 
         return extended_medium
 
-    def _apply_pml_3d(  # noqa: PLR0915
+    def _apply_pml_3d(
         self,
         extended_medium: fullwave.MediumRelaxationMaps,
         theoritical_reflection_coefficient: float,
@@ -1184,7 +1184,7 @@ def _worker(
 
         return extended_medium
 
-    def _apply_transition_and_pml(  # noqa: C901, PLR0912, PLR0915
+    def _apply_transition_and_pml(  # noqa: PLR0912
         self,
         input_array: NDArray[np.float64],
         value_target: float,
@@ -1465,7 +1465,7 @@ def plot(
 class PMLBuilderExponentialAttenuation(PMLBuilder):
     """A class to set up PML for exponential attenuation media."""
 
-    def __init__(  # noqa: PLR0915
+    def __init__(
         self,
         grid: fullwave.Grid,
         medium: fullwave.Medium,
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 0c3111c..faadb57 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -309,7 +309,7 @@ class Solver:
     generates the required input files, and runs the simulation executable.
     """
 
-    def __init__(  # noqa: PLR0912, PLR0915, C901
+    def __init__(  # noqa: PLR0912
         self,
         work_dir: Path,
         grid: fullwave.Grid,
diff --git a/fullwave/solver/utils.py b/fullwave/solver/utils.py
index 9986bd6..adbe878 100644
--- a/fullwave/solver/utils.py
+++ b/fullwave/solver/utils.py
@@ -17,10 +17,12 @@ def load_dat_data(dat_file_path: Path, dtype: DTypeLike = np.float32) -> NDArray
         dat_file_path (Path): Path to the .dat file.
         dtype: Data type to use when reading the file.
 
-    Raises:
+    Raises
+    ------
         ValueError: if dat_file_path does not exist.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
     """
@@ -58,7 +60,8 @@ def load_dat_and_reshape(
         n_sensors: Number of sensors
         dtype: Data type to use when reading the file.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
     """
@@ -88,7 +91,8 @@ def initialize_relaxation_param_dict(
 ) -> dict[str, NDArray[np.float64]]:
     """Initialize a dictionary with relaxation parameters.
 
-    Returns:
+    Returns
+    -------
         dict[str, NDArray[np.float64]]: Dictionary of relaxation parameters.
 
     """
@@ -125,10 +129,12 @@ def load_data_with_time_step(
         dtype: Data type to use when reading the file.
         time_step: Time step index to load.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
-    Raises:
+    Raises
+    ------
         ValueError: if file_path does not exist.
 
     """
@@ -159,10 +165,12 @@ def load_data_with_sensor_index(
         sensor_index: Sensor index to load.
         dtype: Data type to use when reading the file.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.float64]: Array of data read from the file.
 
-    Raises:
+    Raises
+    ------
         ValueError: if file_path does not exist.
 
     """
diff --git a/fullwave/source.py b/fullwave/source.py
index 0856850..74ca2e8 100644
--- a/fullwave/source.py
+++ b/fullwave/source.py
@@ -30,7 +30,7 @@ class Source:
     w0: NDArray[np.float64] | None = None
     incoords_w: NDArray[np.int64] | None = None
 
-    def __init__(  # noqa: C901 PLR0912 PLR0915
+    def __init__(  # noqa: PLR0912
         self,
         p0: NDArray[np.float64] | None = None,
         mask: NDArray[np.bool] | None = None,
diff --git a/fullwave/transducer.py b/fullwave/transducer.py
index c0dc858..8678c08 100644
--- a/fullwave/transducer.py
+++ b/fullwave/transducer.py
@@ -22,7 +22,8 @@
 def _make_pos_int(val: float | tuple[float] | tuple[int]) -> NDArray[np.int64]:
     """Force value to be a positive integer.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: Array with positive integers.
 
     """
@@ -202,7 +203,7 @@ def __init__(
         ) = self._create_element_coords()
         logger.debug("TransducerGeometry instance created.")
 
-    def _init_dimensions(  # noqa: C901, PLR0912
+    def _init_dimensions(  # noqa: PLR0912
         self,
         grid: Grid,
         element_width_px: int | None,
@@ -313,7 +314,7 @@ def _init_positions(self, position_px: int, position_m: float) -> tuple[int, flo
             assert len(position_m) == 3, "position_m must have 3 elements for 3D transducer"
         return position_px, position_m
 
-    def _create_element_coords(  # noqa: PLR0912, PLR0915
+    def _create_element_coords(  # noqa: PLR0912
         self,
     ) -> tuple[NDArray[np.int64], NDArray[np.int64], NDArray[np.int64], NDArray[np.int64]]:
         """Build flat coordinate arrays for source and sensor pixels.
@@ -813,15 +814,15 @@ def set_signal(self, value: NDArray[np.float64]) -> None:
     def source_coords(self) -> NDArray[np.int64]:
         """Coordinates of active source pixels; shape [N_src, ndim]."""
         active_ids = np.where(self.active_source_elements)[0] + 1
-        mask = np.isin(self.transducer_geometry._source_ids, active_ids)  # noqa: SLF001
-        return self.transducer_geometry._source_coords[mask]  # noqa: SLF001
+        mask = np.isin(self.transducer_geometry._source_ids, active_ids)
+        return self.transducer_geometry._source_coords[mask]
 
     @property
     def sensor_coords(self) -> NDArray[np.int64]:
         """Coordinates of active sensor pixels; shape [N_snsr, ndim]."""
         active_ids = np.where(self.active_sensor_elements)[0] + 1
-        mask = np.isin(self.transducer_geometry._sensor_ids, active_ids)  # noqa: SLF001
-        return self.transducer_geometry._sensor_coords[mask]  # noqa: SLF001
+        mask = np.isin(self.transducer_geometry._sensor_ids, active_ids)
+        return self.transducer_geometry._sensor_coords[mask]
 
     @property
     def element_id_to_element_surface(self) -> dict[int, NDArray[np.int64]]:
@@ -986,8 +987,8 @@ def dict_source_index_to_location(self) -> dict[int, NDArray[np.int64]]:
     @property
     def element_id_to_element_center(self) -> dict[int, NDArray[np.int64]]:
         """Return the dictionary mapping element IDs to their center coordinates."""
-        coords = self.transducer_geometry._source_coords  # noqa: SLF001
-        ids = self.transducer_geometry._source_ids  # noqa: SLF001
+        coords = self.transducer_geometry._source_coords
+        ids = self.transducer_geometry._source_ids
         n = self.transducer_geometry.number_elements
         ndim = coords.shape[1] if len(coords) else (3 if self.is_3d else 2)
         out: dict[int, NDArray[np.int64]] = {}
@@ -1049,7 +1050,7 @@ def plot_source_mask(
 
         it plots whole transducer geometry including the inactive/active source and sensor elements.
         """
-        import matplotlib.pyplot as plt
+        import matplotlib.pyplot as plt  # noqa: PLC0415
 
         fig, ax = plt.subplots(1, 1, figsize=(10, 10))
         plot_mask = np.zeros(self.transducer_geometry.stored_grid_size)
@@ -1089,7 +1090,7 @@ def plot_sensor_mask(
 
         it plots whole transducer geometry including the inactive/active source and sensor elements.
         """
-        import matplotlib.pyplot as plt
+        import matplotlib.pyplot as plt  # noqa: PLC0415
 
         fig, ax = plt.subplots(1, 1, figsize=(10, 10))
         plot_mask = np.zeros(self.transducer_geometry.stored_grid_size)
diff --git a/fullwave/utils/check_functions.py b/fullwave/utils/check_functions.py
index b15d77f..0cc6f58 100644
--- a/fullwave/utils/check_functions.py
+++ b/fullwave/utils/check_functions.py
@@ -7,7 +7,8 @@
 def check_instance(target_var: Any, instances: list[Any] | Any) -> None:  # noqa: ANN401
     """Check whether target_var is an instance of the specified type.
 
-    Raises:
+    Raises
+    ------
         TypeError: If target_var is not an instance of instance.
 
     """
@@ -21,7 +22,8 @@ def check_instance(target_var: Any, instances: list[Any] | Any) -> None:  # noqa
 def check_path_exists(target_path: Path) -> None:
     """Check whether the provided target_path exists.
 
-    Raises:
+    Raises
+    ------
         ValueError: If target_path does not exist.
 
     """
@@ -37,7 +39,8 @@ def check_compatible_value(
 ) -> None:
     """Check whether the provided value is in the list of compatible values.
 
-    Raises:
+    Raises
+    ------
         ValueError: If value is not in compatible_values.
 
     """
diff --git a/fullwave/utils/coordinates.py b/fullwave/utils/coordinates.py
index 1a7798a..cf78427 100644
--- a/fullwave/utils/coordinates.py
+++ b/fullwave/utils/coordinates.py
@@ -16,7 +16,8 @@ def make_circle_idx(
         cen: The center of the circle.
         rad: The radius of the circle.
 
-    Returns:
+    Returns
+    -------
         mask: The mask of the circle.
 
     """
@@ -32,7 +33,8 @@ def map_to_coords(
 ) -> NDArray[np.int64]:
     """Map the mask map to coordinates.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: An array of coordinates corresponding to non-zero elements in the mask.
 
     """
@@ -103,7 +105,8 @@ def map_to_coords_with_sort(map_data: NDArray[np.int64]) -> NDArray[np.int64]:
     Args:
         map_data: The mask map.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: An array of coordinates corresponding to non-zero elements in the mask.
 
     """
@@ -124,7 +127,8 @@ def map_to_coordinates(
         is_3d: Whether the grid is 3D or not.
         sort: Whether to sort the coordinates by the first dimension.
 
-    Returns:
+    Returns
+    -------
         NDArray[np.int64]: An array of coordinates corresponding to non-zero elements in the mask.
 
     """
diff --git a/fullwave/utils/memory_tempfile.py b/fullwave/utils/memory_tempfile.py
index 237362d..47b011c 100644
--- a/fullwave/utils/memory_tempfile.py
+++ b/fullwave/utils/memory_tempfile.py
@@ -31,7 +31,7 @@ class MemoryTempfile:
     and create temporary files and directories.
     """
 
-    def __init__(  # noqa: C901, PLR0912
+    def __init__(  # noqa: PLR0912
         self,
         *,
         preferred_paths: list | None = None,
@@ -182,7 +182,7 @@ def mkdtemp(
         self,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> str:
         """Create a temporary directory.
 
@@ -207,7 +207,7 @@ def mkstemp(
         self,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
         *,
         text: bool = False,
     ) -> tuple[int, str]:
@@ -241,7 +241,7 @@ def TemporaryDirectory(  # noqa: N802
         self,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> tempfile.TemporaryDirectory[str]:
         """Create a temporary directory and return a TemporaryDirectory object.
 
@@ -275,7 +275,7 @@ def SpooledTemporaryFile(  # noqa: N802
         newline: str | None = None,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> tempfile.SpooledTemporaryFile:
         """Create a spooled temporary file.
 
@@ -329,7 +329,7 @@ def NamedTemporaryFile(  # noqa: N802
         newline: str | None = None,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
         *,
         delete: bool = True,
     ) -> tempfile.NamedTemporaryFile:
@@ -379,7 +379,7 @@ def TemporaryFile(  # noqa: N802
         newline: str | None = None,
         suffix: str | None = None,
         prefix: str | None = None,
-        dir: str | None = None,  # noqa: A002
+        dir: str | None = None,
     ) -> tempfile.TemporaryFile:
         """Create and return a temporary file.
 
diff --git a/fullwave/utils/plot_utils.py b/fullwave/utils/plot_utils.py
index 6f63f1f..c353229 100644
--- a/fullwave/utils/plot_utils.py
+++ b/fullwave/utils/plot_utils.py
@@ -13,7 +13,7 @@
 from tqdm import tqdm
 
 
-def plot_array(  # noqa: C901, D417, PLR0912
+def plot_array(  # noqa: PLR0912
     x: NDArray[np.float64 | np.int64 | np.bool],
     aspect: float | None = None,
     vmin: float | None = None,
@@ -553,7 +553,7 @@ def plot_wave_propagation_animation(
     animation_data.save(export_name, writer="ffmpeg", dpi=dpi)
 
 
-def plot_wave_propagation_with_map(  # noqa: PLR0915, C901
+def plot_wave_propagation_with_map(
     propagation_map: NDArray[np.float64],
     c_map: NDArray[np.float64],
     rho_map: NDArray[np.float64],
@@ -868,7 +868,7 @@ def plot_wave_propagation_with_map(  # noqa: PLR0915, C901
     plt.close("all")
 
 
-def plot_wave_propagation_snapshot(  # noqa: PLR0915
+def plot_wave_propagation_snapshot(
     propagation_map: NDArray[np.float64],
     c_map: NDArray[np.float64],
     rho_map: NDArray[np.float64],
diff --git a/fullwave/utils/signal_filter.py b/fullwave/utils/signal_filter.py
index d238c18..8264286 100644
--- a/fullwave/utils/signal_filter.py
+++ b/fullwave/utils/signal_filter.py
@@ -25,7 +25,7 @@ def _check_cupy() -> bool:
     global _CUPY_AVAILABLE  # noqa: PLW0603
     if _CUPY_AVAILABLE is None:
         try:
-            import cupy  # noqa: F401
+            import cupy  # noqa: F401, PLC0415
 
             _CUPY_AVAILABLE = True
         except ImportError:
@@ -145,7 +145,7 @@ def apply_filter(
     )
 
     if use_gpu and _check_cupy():
-        import cupy as cp
+        import cupy as cp  # noqa: PLC0415
 
         logger.debug("apply_filter: using CuPy GPU backend")
         data_gpu = cp.asarray(data, dtype=cp.float64)
diff --git a/ruff.toml b/ruff.toml
index 955b704..c65d0b6 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -1,4 +1,8 @@
-# Exclude a variety of commonly ignored directories.
+# -------- Core options --------
+line-length = 100
+indent-width = 4
+target-version = "py312"
+
 exclude = [
     ".bzr",
     ".direnv",
@@ -28,87 +32,108 @@ exclude = [
     "venv",
 ]
 
-# Same as Black.
-line-length = 100
-indent-width = 4
-
-# Assume Python 3.13
-target-version = "py312"
-
 [lint]
-# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`)  codes by default.
-# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
-# McCabe complexity (`C901`) by default.
-select = ["ALL"]
-# select = ["E4", "E7", "E9", "F"]
+select = [
+    "E",      # pycodestyle errors
+    "F",      # Pyflakes
+    "W",      # pycodestyle warnings
+    "I",      # isort
+    "N",      # pep8-naming
+    "UP",     # pyupgrade
+    "B",      # bugbear (bug-prone patterns)
+    "SIM",    # simplify
+    "S",      # bandit (security)
+    "C4",     # comprehensions
+    "ARG",    # unused function args
+    "ANN",    # type annotations
+    "D",      # docstrings
+    "PL",     # pylint-like
+    "TC",     # type-checking import placement
+    "RET",    # return-value patterns
+    "RUF100", # unused-noqa
+    "FBT",    # boolean-trap
+    "NPY",    # numpy-specific
+]
 
-# ignore PLR0913 Too many arguments in function definition
-# ignore S101 Use of assert detected
-# ignore PLR2004 Use of Magic number detected
-# ignore S404 `subprocess` module is possibly insecure ()
-# ignore CPY001 Missing copyright notice at top of file
 ignore = [
-    "PLR0913",
-    "PLR0914",
-    "ERA001",
-    "S101",
-    "PLR2004",
-    "S404",
-    "CPY001",
-    "PLR1702",
-    "T201",
+    "PLR0913", # too many arguments
+    "PLR0914", # too many locals
+    "PLR1702", # too many nested blocks
+    "PLR2004", # magic numbers
+    "PLR0915", # too many statements
+    "S101",    # assert allowed in non-prod / tests
+    "S404",    # subprocess warning (you already know what you're doing)
+    "CPY001",  # copyright header
+    "ERA001",  # commented-out code
+    "PLR0904", # too many public methods
 ]
 
-# Allow fix for all enabled rules (when `--fix`) is provided.
 fixable = ["ALL"]
 unfixable = []
 
-# Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
 [lint.per-file-ignores]
-
-# disable D103 (missing docstring in public function) and ANN201 (missing type annotation)
 "tests/**" = [
+    "D100",
     "D101",
     "D103",
     "D104",
     "D107",
+    "ANN001",
+    "ANN002",
+    "ANN003",
     "ANN201",
     "ANN202",
     "ANN204",
+    "ARG001",
+    "RET504",
+    "S101",
+    "PLC0415", # `import` should be at the top-level of a file
+]
+
+"examples/**" = [
+    "D100",
+    "D101",
+    "D102",
+    "D103",
+    "D104",
+    "D107",
     "ANN001",
     "ANN002",
     "ANN003",
-    "ARG001",
-    "RET504",
+    "ANN201",
+    "ANN202",
+    "ANN204",
+]
+
+# Example: allow more imperative / exploratory style in notebooks.
+"notebooks/**" = [
     "D100",
-    "S101",
+    "D101",
+    "D102",
+    "D103",
+    "D104",
+    "D107",
+    "ANN001",
+    "ANN002",
+    "ANN003",
+    "ANN201",
+    "ANN202",
+    "ANN204",
 ]
 
+[lint.pydocstyle]
+convention = "numpy"
+
 [format]
 # Like Black, use double quotes for strings.
 quote-style = "double"
-
 # Like Black, indent with spaces, rather than tabs.
 indent-style = "space"
-
 # Like Black, respect magic trailing commas.
 skip-magic-trailing-comma = false
-
 # Like Black, automatically detect the appropriate line ending.
 line-ending = "auto"
-
-# Enable auto-formatting of code examples in docstrings. Markdown,
-# reStructuredText code/literal blocks and doctests are all supported.
-#
-# This is currently disabled by default, but it is planned for this
-# to be opt-out in the future.
 docstring-code-format = true
-
-# Set the line length limit used when formatting code snippets in
-# docstrings.
-#
-# This only has an effect when the `docstring-code-format` setting is
-# enabled.
-docstring-code-line-length = "dynamic"
+docstring-code-line-length = 120
diff --git a/tests/medium_builder/presets/test_domain_background.py b/tests/medium_builder/presets/test_domain_background.py
index fd01c28..5de3e80 100644
--- a/tests/medium_builder/presets/test_domain_background.py
+++ b/tests/medium_builder/presets/test_domain_background.py
@@ -76,14 +76,14 @@ def named_material():
 
 def test_setup_base_geometry(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    geo = bd._setup_base_geometry()  # noqa: SLF001
+    geo = bd._setup_base_geometry()
     assert geo.shape == (grid.nx, grid.ny)
     np.testing.assert_array_equal(geo, np.ones((grid.nx, grid.ny)))
 
 
 def test_setup_sound_speed_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    sound_speed = bd._setup_sound_speed()  # noqa: SLF001
+    sound_speed = bd._setup_sound_speed()
     expected = np.ones((grid.nx, grid.ny)) * default_material.sound_speed
     np.testing.assert_array_almost_equal(sound_speed, expected)
 
@@ -94,7 +94,7 @@ def test_setup_sound_speed_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    sound_speed = bd._setup_sound_speed()  # noqa: SLF001
+    sound_speed = bd._setup_sound_speed()
     expected_value = named_material.custom["sound_speed"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(sound_speed, expected)
@@ -102,7 +102,7 @@ def test_setup_sound_speed_named(grid, named_material):
 
 def test_setup_density_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    density = bd._setup_density()  # noqa: SLF001
+    density = bd._setup_density()
     expected = np.ones((grid.nx, grid.ny)) * default_material.density
     np.testing.assert_array_almost_equal(density, expected)
 
@@ -113,7 +113,7 @@ def test_setup_density_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    density = bd._setup_density()  # noqa: SLF001
+    density = bd._setup_density()
     expected_value = named_material.custom["density"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(density, expected)
@@ -121,7 +121,7 @@ def test_setup_density_named(grid, named_material):
 
 def test_setup_alpha_coeff_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    alpha_coeff = bd._setup_alpha_coeff()  # noqa: SLF001
+    alpha_coeff = bd._setup_alpha_coeff()
     expected = np.ones((grid.nx, grid.ny)) * default_material.alpha_coeff
     np.testing.assert_array_almost_equal(alpha_coeff, expected)
 
@@ -132,7 +132,7 @@ def test_setup_alpha_coeff_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    alpha_coeff = bd._setup_alpha_coeff()  # noqa: SLF001
+    alpha_coeff = bd._setup_alpha_coeff()
     expected_value = named_material.custom["alpha_coeff"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(alpha_coeff, expected)
@@ -140,7 +140,7 @@ def test_setup_alpha_coeff_named(grid, named_material):
 
 def test_setup_alpha_power_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    alpha_power = bd._setup_alpha_power()  # noqa: SLF001
+    alpha_power = bd._setup_alpha_power()
     expected = np.ones((grid.nx, grid.ny)) * default_material.alpha_power
     np.testing.assert_array_almost_equal(alpha_power, expected)
 
@@ -151,7 +151,7 @@ def test_setup_alpha_power_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    alpha_power = bd._setup_alpha_power()  # noqa: SLF001
+    alpha_power = bd._setup_alpha_power()
     expected_value = named_material.custom["alpha_power"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(alpha_power, expected)
@@ -159,7 +159,7 @@ def test_setup_alpha_power_named(grid, named_material):
 
 def test_setup_beta_default(grid, default_material):
     bd = BackgroundDomain(grid=grid, material_properties=default_material)
-    beta = bd._setup_beta()  # noqa: SLF001
+    beta = bd._setup_beta()
     expected = np.ones((grid.nx, grid.ny)) * default_material.beta
     np.testing.assert_array_almost_equal(beta, expected)
 
@@ -170,7 +170,7 @@ def test_setup_beta_named(grid, named_material):
         background_property_name="custom",
         material_properties=named_material,
     )
-    beta = bd._setup_beta()  # noqa: SLF001
+    beta = bd._setup_beta()
     expected_value = named_material.custom["beta"]
     expected = np.ones((grid.nx, grid.ny)) * expected_value
     np.testing.assert_array_almost_equal(beta, expected)
diff --git a/tests/solver/test_launcher.py b/tests/solver/test_launcher.py
index b7f97b1..f52dac7 100644
--- a/tests/solver/test_launcher.py
+++ b/tests/solver/test_launcher.py
@@ -127,7 +127,7 @@ def test_configure_cuda_device_id_none(monkeypatch):
     )
 
     # Test with None input
-    result = Launcher._configure_cuda_device_id(None)  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id(None)
     assert result == "0"
 
 
@@ -141,7 +141,7 @@ def test_configure_cuda_device_id_int(monkeypatch):
         selective_run_wrapper(monkeypatch=monkeypatch, gpu_ids=gpu_ids),
     )
     # Test with integer input
-    result = Launcher._configure_cuda_device_id(0)  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id(0)
     assert result == "0"
 
 
@@ -156,7 +156,7 @@ def test_configure_cuda_device_id_string(monkeypatch):
     )
 
     # Test with string input
-    result = Launcher._configure_cuda_device_id("1")  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id("1")
     assert result == "1"
 
 
@@ -170,7 +170,7 @@ def test_configure_cuda_device_id_list(monkeypatch):
         selective_run_wrapper(monkeypatch=monkeypatch, gpu_ids=gpu_ids),
     )
     # Test with list input
-    result = Launcher._configure_cuda_device_id([0, 1, 2])  # noqa: SLF001
+    result = Launcher._configure_cuda_device_id([0, 1, 2])
     assert result == "0,1,2"
 
 
@@ -185,7 +185,7 @@ def test_configure_cuda_device_id_negative_int(monkeypatch):
     )
     # Test with negative integer
     with pytest.raises(ValueError, match="CUDA device ID must be a non-negative integer"):
-        Launcher._configure_cuda_device_id(-1)  # noqa: SLF001
+        Launcher._configure_cuda_device_id(-1)
 
 
 def test_configure_cuda_device_id_invalid_string(monkeypatch):
@@ -202,7 +202,7 @@ def test_configure_cuda_device_id_invalid_string(monkeypatch):
         ValueError,
         match="CUDA device ID string must represent a non-negative integer",
     ):
-        Launcher._configure_cuda_device_id("invalid")  # noqa: SLF001
+        Launcher._configure_cuda_device_id("invalid")
 
 
 def test_configure_cuda_device_id_list_with_negative(monkeypatch):
@@ -220,7 +220,7 @@ def test_configure_cuda_device_id_list_with_negative(monkeypatch):
         ValueError,
         match="All CUDA device IDs in the list must be non-negative integers",
     ):
-        Launcher._configure_cuda_device_id([0, -1, 2])  # noqa: SLF001
+        Launcher._configure_cuda_device_id([0, -1, 2])
 
 
 def test_configure_cuda_device_id_invalid_type(monkeypatch):
@@ -238,7 +238,7 @@ def test_configure_cuda_device_id_invalid_type(monkeypatch):
         ValueError,
         match="CUDA device ID must be an integer, string, list, or None",
     ):
-        Launcher._configure_cuda_device_id(math.pi)  # noqa: SLF001
+        Launcher._configure_cuda_device_id(math.pi)
 
 
 def test_launcher_init_invalid_bin_path(monkeypatch):
diff --git a/tests/test_source.py b/tests/test_source.py
index 4ae79dd..6195378 100644
--- a/tests/test_source.py
+++ b/tests/test_source.py
@@ -111,7 +111,7 @@ def test_coords_and_mask_raises():
 def test_no_mask_or_coords_raises():
     p0 = np.array([[1.0]])
     with pytest.raises(
-        ValueError,  # noqa: PT011
+        ValueError,
     ):
         Source(p0)
 
@@ -450,5 +450,5 @@ def test_velocity_only_validate_passes(self):
 
     def test_all_none_raises(self):
         """No p0, no p0_additive, no velocity → ValueError."""
-        with pytest.raises(ValueError):  # noqa: PT011
+        with pytest.raises(ValueError):
             Source(grid_shape=self._grid_shape)

From 3a3d6e98d8e014b154ff001fe46d6904109e0191 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Wed, 25 Feb 2026 10:35:56 -0500
Subject: [PATCH 10/31] =?UTF-8?q?Bump=20version:=201.2.6-dev2=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 126318e..d0cf5ad 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev2"
+current_version = "1.2.6-dev3"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index 59833f2..0400fa0 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev2"
+    __version__ = "1.2.6-dev3"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index a273ac6..726c789 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev2" # Update via bump-my-version, not manually
+version = "1.2.6-dev3" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index 0a4998f..b5dd97a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev2"
+version = "1.2.6.dev3"
 source = { editable = "." }
 dependencies = [
     { name = "joblib" },

From eab55ffd63eb6c82eaf3c2e75f06226307348140 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 12:13:37 -0500
Subject: [PATCH 11/31] Add GPU memory estimation and verify_gpu flag to Solver
 and Launcher classes

- Introduced `verify_gpu` parameter to `Launcher` and `Solver` constructors to check CUDA device availability.
- Added `_estimate_gpu_memory` method in `Solver` to estimate GPU memory usage before simulation execution.
---
 fullwave/solver/launcher.py              |  23 ++-
 fullwave/solver/solver.py                | 155 ++++++++++++++++++
 tests/solver/test_gpu_memory_estimate.py | 190 +++++++++++++++++++++++
 3 files changed, 365 insertions(+), 3 deletions(-)
 create mode 100644 tests/solver/test_gpu_memory_estimate.py

diff --git a/fullwave/solver/launcher.py b/fullwave/solver/launcher.py
index 80da170..06a6dcb 100644
--- a/fullwave/solver/launcher.py
+++ b/fullwave/solver/launcher.py
@@ -31,6 +31,7 @@ def __init__(
         use_gpu: bool = True,
         cuda_device_id: str | int | list | None = None,
         save_gpu_memory: bool = False,
+        verify_gpu: bool = True,
     ) -> None:
         """Initialize a FullwaveLauncher instance.
 
@@ -62,6 +63,11 @@ def __init__(
             depending on the hardware and the simulation settings.
             useful in 3D simulations with large grid sizes
             where GPU memory is a limiting factor.
+        verify_gpu : bool, optional
+            Whether to verify that the specified CUDA devices exist on the system.
+            Defaults to True. Set to False when generating input files only
+            (``generate_input_only=True``) on a machine that may not have
+            the target GPUs available.
 
         """
         self._path_fullwave_simulation_bin = path_fullwave_simulation_bin
@@ -69,7 +75,10 @@ def __init__(
         assert self._path_fullwave_simulation_bin.exists(), error_msg
         self.is_3d = is_3d
         self.use_gpu = use_gpu
-        self.cuda_device_id = self._configure_cuda_device_id(cuda_device_id)
+        self.cuda_device_id = self._configure_cuda_device_id(
+            cuda_device_id,
+            verify_gpu=verify_gpu,
+        )
         self.save_gpu_memory = save_gpu_memory
         logger.debug("Launcher instance created.")
 
@@ -152,13 +161,20 @@ def _verify_cuda_devices_exist(device_id_str: str) -> None:
                 raise ValueError(message)
 
     @staticmethod
-    def _configure_cuda_device_id(cuda_device_id: str | int | list | None) -> str:
+    def _configure_cuda_device_id(
+        cuda_device_id: str | int | list | None,
+        *,
+        verify_gpu: bool = True,
+    ) -> str:
         """Verify and assign the CUDA device ID.
 
         Parameters
         ----------
         cuda_device_id : str | int | None
             The CUDA device ID to verify and assign.
+        verify_gpu : bool, optional
+            Whether to verify that the specified CUDA devices exist.
+            Defaults to True.
 
         Returns
         -------
@@ -167,7 +183,8 @@ def _configure_cuda_device_id(cuda_device_id: str | int | list | None) -> str:
 
         """
         output = Launcher._parse_cuda_device_id(cuda_device_id)
-        Launcher._verify_cuda_devices_exist(output)
+        if verify_gpu:
+            Launcher._verify_cuda_devices_exist(output)
         return output
 
     def run(
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index faadb57..0d901ef 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -329,6 +329,7 @@ def __init__(  # noqa: PLR0912
         use_isotropic_relaxation: bool = True,
         cuda_device_id: str | int | list | None = None,
         save_gpu_memory: bool = False,
+        verify_gpu: bool = True,
     ) -> None:
         """Initialize a Solver instance for the fullwave simulation.
 
@@ -415,6 +416,11 @@ def __init__(  # noqa: PLR0912
             depending on the hardware and the simulation settings.
             useful in 3D simulations with large grid sizes
             where GPU memory is a limiting factor.
+        verify_gpu : bool, optional
+            Whether to verify that the specified CUDA devices exist on the system.
+            Defaults to True. Set to False when generating input files only
+            (``generate_input_only=True``) on a machine that may not have
+            the target GPUs available.
 
         Raises
         ------
@@ -555,6 +561,7 @@ def __init__(  # noqa: PLR0912
             use_gpu=self.use_gpu,
             cuda_device_id=self.cuda_device_id,
             save_gpu_memory=self.save_gpu_memory,
+            verify_gpu=verify_gpu,
         )
 
         if use_exponential_attenuation:
@@ -758,6 +765,7 @@ def run(
         release_after_write: bool = False,
         highpass_cutoff_mhz: float | None = None,
         bandpass_cutoff_mhz: tuple[float, float] | None = None,
+        gpu_memory_estimate: bool = True,
     ) -> NDArray[np.float64] | Path:
         r"""Run the fullwave simulation and return the result as a NumPy array.
 
@@ -829,6 +837,9 @@ def run(
             Uses cosine (Hann) tapers on both edges.
             Cannot be combined with ``highpass_cutoff_mhz``.
             Requires ``load_results=True``.  Default is ``None`` (no filtering).
+        gpu_memory_estimate : bool
+            Whether to estimate GPU memory usage before running the simulation.
+            Default is True. If True, it estimates the GPU memory usage.
 
         Returns
         -------
@@ -937,6 +948,9 @@ def run(
         )
         logger.debug(message)
 
+        if gpu_memory_estimate:
+            self._estimate_gpu_memory(sensor)
+
         if generate_input_only:
             logger.info(
                 "Input data generation completed in %s. Skipping simulation execution.",
@@ -975,6 +989,147 @@ def run(
         # which is a list of file names
         return sim_result
 
+    def _estimate_gpu_memory(
+        self,
+        sensor: fullwave.Sensor,
+    ) -> None:
+        """Estimate and log GPU memory usage per device.
+
+        Provides a pre-launch estimate so users can verify that the simulation
+        fits in GPU memory before the binary starts allocating.
+
+        Parameters
+        ----------
+        sensor : fullwave.Sensor
+            The sensor that will actually be written to the input files (may
+            differ from ``self.sensor`` when ``record_whole_domain=True``).
+
+        """
+        device_ids = self.fullwave_launcher.cuda_device_id.split(",")
+        n_gpus = len(device_ids)
+
+        grid = self.pml_builder.extended_grid
+        source = self.pml_builder.extended_source
+        medium = self.pml_builder.extended_medium
+
+        depth = grid.nx
+        lateral = grid.ny * grid.nz if self.is_3d else grid.ny
+        halo_depth = 8  # ghost cells per side for stencil exchange
+
+        float_sz = 4  # bytes per float32
+        int_sz = 4  # bytes per int32
+
+        # Sound-speed range determines the number of derivative-map levels
+        c_map = medium.sound_speed
+        c_range = int(np.rint(c_map.max()) - np.rint(c_map.min()))
+        n_deriv_levels = 1 if c_range == 0 else c_range + 1
+
+        n_time_steps_source = source.icmat.shape[1]
+
+        # --- Per-GPU domain decomposition along depth ---
+        base_depth = depth // n_gpus
+        remainder = depth % n_gpus
+
+        for rank, dev_id in enumerate(device_ids):
+            # First `remainder` ranks get one extra depth slice
+            depth_this_gpu = base_depth + (1 if rank < remainder else 0)
+
+            # Halo regions: first/last GPU has 1 side, middle GPUs have 2
+            if n_gpus == 1:
+                n_halo_sides = 0
+            elif rank == 0 or rank == n_gpus - 1:
+                n_halo_sides = 1
+            else:
+                n_halo_sides = 2
+            depth_with_halo = depth_this_gpu + n_halo_sides * halo_depth
+
+            slab = depth_with_halo * lateral
+
+            # Approximate per-GPU source / sensor counts
+            src_this_gpu = max(source.n_sources // n_gpus, 0)
+            sen_this_gpu = max(sensor.n_sensors // n_gpus, 0)
+
+            # --- Wave-field storage (pressure + 2 velocity, 2 time levels) ---
+            wave_fields = 3 * 2 * slab * float_sz
+
+            # --- Material property maps (3 maps) ---
+            material_maps = 3 * slab * float_sz
+
+            # --- Stencil coefficient storage ---
+            stencil_storage = 9 * 2 * n_deriv_levels * float_sz + slab * int_sz
+
+            # --- Source injection ---
+            if src_this_gpu > 0:
+                if self.save_gpu_memory:
+                    source_storage = src_this_gpu * float_sz
+                else:
+                    source_storage = src_this_gpu * n_time_steps_source * float_sz
+                source_storage += 2 * src_this_gpu * int_sz
+            else:
+                source_storage = 0
+
+            # --- Output / sensor gathering ---
+            if sen_this_gpu > 0:
+                sensor_storage = (
+                    sen_this_gpu * float_sz + 3 * sen_this_gpu * int_sz + sen_this_gpu * int_sz
+                )
+            else:
+                sensor_storage = 0
+
+            if self.use_exponential_attenuation:
+                attenuation_map = slab * float_sz
+                total = (
+                    wave_fields
+                    + material_maps
+                    + attenuation_map
+                    + stencil_storage
+                    + source_storage
+                    + sensor_storage
+                )
+            else:
+                n_relax = self.n_relax_mechanisms
+                # Relaxation memory fields (pressure + velocity directions,
+                # each with N_relax mechanisms and 2 time levels)
+                relaxation_fields = 2 * n_relax * 2 * slab * float_sz * 2
+                # PML absorption / dispersion coefficient maps
+                pml_coefficients = (
+                    2 * slab * float_sz
+                    + 2 * n_relax * slab * float_sz
+                    + 2 * n_relax * slab * float_sz
+                )
+                # Kernel dispatch tables (negligible but included)
+                dispatch_tables = 8 * n_relax * 8
+                # Zero-pressure (air) coordinate storage
+                n_air = medium.n_air
+                air_storage = 2 * n_air * int_sz if n_air > 0 else 0
+
+                total = (
+                    wave_fields
+                    + material_maps
+                    + relaxation_fields
+                    + pml_coefficients
+                    + dispatch_tables
+                    + stencil_storage
+                    + source_storage
+                    + air_storage
+                    + sensor_storage
+                )
+
+            gb = 1024.0**3
+            mode = "exponential" if self.use_exponential_attenuation else "relaxation"
+            saving = ", save_gpu_memory=True" if self.save_gpu_memory else ""
+            logger.info(
+                "GPU memory estimate [GPU %s] (%s mode%s): "
+                "%.2f GB  (depth=%d +%d halo, lateral=%d)",
+                dev_id.strip(),
+                mode,
+                saving,
+                total / gb,
+                depth_this_gpu,
+                n_halo_sides * halo_depth,
+                lateral,
+            )
+
     def print_info(self) -> None:
         """Print the Solver instance information."""
         print(str(self))
diff --git a/tests/solver/test_gpu_memory_estimate.py b/tests/solver/test_gpu_memory_estimate.py
new file mode 100644
index 0000000..5ad2fec
--- /dev/null
+++ b/tests/solver/test_gpu_memory_estimate.py
@@ -0,0 +1,190 @@
+from pathlib import Path
+from unittest.mock import patch
+
+import numpy as np
+import pytest
+
+import fullwave
+from fullwave.solver import solver as solver_module
+
+_FAKE_BINARY = (
+    Path(__file__).parent.parent.parent
+    / "fullwave"
+    / "solver"
+    / "bins"
+    / "gpu"
+    / "2d"
+    / "num_relax=2"
+    / "fullwave2_2d_2_relax_multi_gpu_cuda129"
+)
+
+
+def _build_solver(
+    tmp_path,
+    *,
+    save_gpu_memory=False,
+    cuda_device_id=None,
+):
+    domain_size = (1e-3, 1e-3)
+    f0 = 1e6
+    c0 = 1540
+    duration = domain_size[0] / c0 * 2
+
+    grid = fullwave.Grid(
+        domain_size=domain_size,
+        f0=f0,
+        duration=duration,
+        c0=c0,
+    )
+
+    shape = (grid.nx, grid.ny)
+    medium = fullwave.Medium(
+        grid=grid,
+        sound_speed=c0 * np.ones(shape),
+        density=1000 * np.ones(shape),
+        alpha_coeff=0.5 * np.ones(shape),
+        alpha_power=1.0 * np.ones(shape),
+        beta=np.zeros(shape),
+        use_isotropic_relaxation=True,
+    )
+
+    p_mask = np.zeros(shape, dtype=bool)
+    p_mask[grid.nx // 2, :] = True
+    source = fullwave.Source(np.ones((p_mask.sum(), grid.nt)), p_mask)
+
+    sensor_mask = np.ones(shape, dtype=bool)
+    sensor = fullwave.Sensor(mask=sensor_mask)
+
+    return fullwave.Solver(
+        work_dir=tmp_path / "work",
+        grid=grid,
+        medium=medium,
+        source=source,
+        sensor=sensor,
+        use_isotropic_relaxation=True,
+        save_gpu_memory=save_gpu_memory,
+        cuda_device_id=cuda_device_id,
+        verify_gpu=False,
+    )
+
+
+@pytest.fixture(autouse=True)
+def _patch_binary(monkeypatch):
+    monkeypatch.setattr(
+        solver_module,
+        "_retrieve_fullwave_simulation_path",
+        lambda **kwargs: _FAKE_BINARY,  # noqa: ARG005
+    )
+
+
+def test_single_gpu_relaxation_logs_info(tmp_path):
+    solver = _build_solver(tmp_path)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    mock_logger.info.assert_called_once()
+    fmt = mock_logger.info.call_args[0][0]
+    mode_arg = mock_logger.info.call_args[0][2]
+    assert "GPU memory estimate" in fmt
+    assert mode_arg == "relaxation"
+
+
+def test_single_gpu_no_halo(tmp_path):
+    solver = _build_solver(tmp_path)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    # halo argument: dev_id, mode, saving, GB, depth, halo, lateral
+    halo_value = mock_logger.info.call_args[0][6]
+    assert halo_value == 0
+
+
+def test_estimate_is_positive(tmp_path):
+    solver = _build_solver(tmp_path)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    gb_value = mock_logger.info.call_args[0][4]
+    assert gb_value > 0
+
+
+def test_save_gpu_memory_reduces_estimate(tmp_path):
+    solver_no_save = _build_solver(tmp_path, save_gpu_memory=False)
+    solver_save = _build_solver(tmp_path, save_gpu_memory=True)
+
+    with patch("fullwave.solver.solver.logger") as mock_no_save:
+        solver_no_save._estimate_gpu_memory(solver_no_save.pml_builder.extended_sensor)
+    gb_no_save = mock_no_save.info.call_args[0][4]
+
+    with patch("fullwave.solver.solver.logger") as mock_save:
+        solver_save._estimate_gpu_memory(solver_save.pml_builder.extended_sensor)
+    gb_save = mock_save.info.call_args[0][4]
+
+    assert gb_save <= gb_no_save
+
+
+def test_save_gpu_memory_label_in_log(tmp_path):
+    solver = _build_solver(tmp_path, save_gpu_memory=True)
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    saving_arg = mock_logger.info.call_args[0][3]
+    assert "save_gpu_memory" in saving_arg
+
+
+def test_multi_gpu_two_devices_logs_per_device(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1])
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    assert mock_logger.info.call_count == 2
+
+
+def test_multi_gpu_two_devices_each_has_one_halo_side(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1])
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    for call in mock_logger.info.call_args_list:
+        halo_value = call[0][6]
+        assert halo_value == 8  # 1 side * 8 ghost cells
+
+
+def test_multi_gpu_three_devices_middle_has_two_halo_sides(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1, 2])
+    sensor = solver.pml_builder.extended_sensor
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    assert mock_logger.info.call_count == 3
+    calls = mock_logger.info.call_args_list
+    assert calls[0][0][6] == 8  # first GPU: 1 halo side
+    assert calls[1][0][6] == 16  # middle GPU: 2 halo sides
+    assert calls[2][0][6] == 8  # last GPU: 1 halo side
+
+
+def test_multi_gpu_splits_depth(tmp_path):
+    solver = _build_solver(tmp_path, cuda_device_id=[0, 1])
+    sensor = solver.pml_builder.extended_sensor
+    total_depth = solver.pml_builder.extended_grid.nx
+
+    with patch("fullwave.solver.solver.logger") as mock_logger:
+        solver._estimate_gpu_memory(sensor)
+
+    calls = mock_logger.info.call_args_list
+    depth_gpu0 = calls[0][0][5]
+    depth_gpu1 = calls[1][0][5]
+    assert depth_gpu0 + depth_gpu1 == total_depth

From 844ff70e9ea8f628c7a9b9bd3a6c87f05bdd2c05 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 13:44:49 -0500
Subject: [PATCH 12/31] Refactor PMLBuilder to replace joblib with
 ThreadPoolExecutor for parallel processing; remove joblib dependency from
 project

---
 fullwave/solver/pml_builder.py | 88 ++++++++++++++++------------------
 pyproject.toml                 | 12 ++++-
 uv.lock                        | 11 -----
 3 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index 5fbd8eb..de52885 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -5,13 +5,11 @@
 from collections import OrderedDict
 from dataclasses import dataclass, field
 from functools import cached_property
-from itertools import starmap
 from pathlib import Path
 
 import matplotlib.pyplot as plt
 import numexpr as ne
 import numpy as np
-from joblib import Parallel, delayed
 from numpy.typing import NDArray
 
 import fullwave
@@ -834,21 +832,24 @@ def _compute_one(
 
         items = list(rename_dict.items())
 
-        results = Parallel(n_jobs=self.medium_org.n_jobs, backend="threading")(
-            delayed(_compute_one)(
-                key_fw2,
-                key_py,
-                relaxation_param_dict,
-                alpha_target_higher_nu,
-                d_target_higher_nu,
-                alpha_target_pml,
-                d_target_pml,
-                n_polynomial,
-                self.is_3d,
-                self._apply_transition_and_pml,
-            )
-            for key_fw2, key_py in items
-        )
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(
+                    _compute_one,
+                    key_fw2,
+                    key_py,
+                    relaxation_param_dict,
+                    alpha_target_higher_nu,
+                    d_target_higher_nu,
+                    alpha_target_pml,
+                    d_target_pml,
+                    n_polynomial,
+                    self.is_3d,
+                    self._apply_transition_and_pml,
+                )
+                for key_fw2, key_py in items
+            ]
+            results = [f.result() for f in futures]
         out_dict = dict(results)
 
         logger.debug("Calculating PML a and b coefficients...")
@@ -876,13 +877,9 @@ def _worker(
             # Return keys + values so parent can update dict safely
             return (f"a_pml_{axis}{nu}", a, f"b_pml_{axis}{nu}", b)
 
-        results = Parallel(
-            n_jobs=self.medium_org.n_jobs,  # use all cores
-            backend="loky",  # process-based; safe default for Python code
-            prefer="processes",
-        )(
-            starmap(delayed(_worker), tasks),
-        )
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
+            results = [f.result() for f in futures]
 
         for a_key, a_val, b_key, b_val in results:
             out_dict[a_key] = a_val
@@ -1119,21 +1116,24 @@ def _compute_one(
             raise ValueError(error_msg)
 
         items = list(rename_dict.items())
-        results = Parallel(n_jobs=self.medium_org.n_jobs, backend="threading")(
-            delayed(_compute_one)(
-                key_fw2,
-                key_py,
-                relaxation_param_dict,
-                alpha_target_higher_nu,
-                d_target_higher_nu,
-                alpha_target_pml,
-                d_target_pml,
-                n_polynomial,
-                self.is_3d,
-                self._apply_transition_and_pml,
-            )
-            for key_fw2, key_py in items
-        )
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(
+                    _compute_one,
+                    key_fw2,
+                    key_py,
+                    relaxation_param_dict,
+                    alpha_target_higher_nu,
+                    d_target_higher_nu,
+                    alpha_target_pml,
+                    d_target_pml,
+                    n_polynomial,
+                    self.is_3d,
+                    self._apply_transition_and_pml,
+                )
+                for key_fw2, key_py in items
+            ]
+            results = [f.result() for f in futures]
         out_dict = dict(results)
 
         logger.debug("Calculating PML a and b coefficients...")
@@ -1162,13 +1162,9 @@ def _worker(
             # Return keys + values so parent can update dict safely
             return (f"a_pml_{axis}{nu}", a, f"b_pml_{axis}{nu}", b)
 
-        results = Parallel(
-            n_jobs=self.medium_org.n_jobs,  # use all cores
-            backend="loky",  # process-based; safe default for Python code
-            prefer="processes",
-        )(
-            starmap(delayed(_worker), tasks),
-        )
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
+            results = [f.result() for f in futures]
 
         for a_key, a_val, b_key, b_val in results:
             out_dict[a_key] = a_val
diff --git a/pyproject.toml b/pyproject.toml
index 726c789..bf2f35f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,6 @@ dependencies = [
     "opencv-python-headless>=4.12.0.88",
     "tomli>=2.3.0",
     "numba>=0.63.1",
-    "joblib>=1.5.3",
     "numexpr>=2.14.1",
 ]
 authors = [{ name = "Masashi Sode" }, { name = "Gianmarco Pinton" }]
@@ -57,4 +56,13 @@ build-backend = "hatchling.build"
 packages = ["fullwave", "fullwave.utils"]
 
 [tool.hatch.build]
-exclude = ["tests", "figs", "examples", ".vscode", ".github", "fullwave/solver/bins/gpu", "fullwave/solver/bins/exponential_attenuation", "fullwave/solver/bins/_exponential_attenuation"]
+exclude = [
+    "tests",
+    "figs",
+    "examples",
+    ".vscode",
+    ".github",
+    "fullwave/solver/bins/gpu",
+    "fullwave/solver/bins/exponential_attenuation",
+    "fullwave/solver/bins/_exponential_attenuation",
+]
diff --git a/uv.lock b/uv.lock
index b5dd97a..a3cb3fd 100644
--- a/uv.lock
+++ b/uv.lock
@@ -738,7 +738,6 @@ name = "fullwave25"
 version = "1.2.6.dev3"
 source = { editable = "." }
 dependencies = [
-    { name = "joblib" },
     { name = "matplotlib" },
     { name = "numba" },
     { name = "numexpr" },
@@ -780,7 +779,6 @@ dev = [
 requires-dist = [
     { name = "cupy-cuda12x", marker = "extra == 'examples'", specifier = ">=13.6.0" },
     { name = "ipykernel", marker = "extra == 'examples'", specifier = ">=6.28.0" },
-    { name = "joblib", specifier = ">=1.5.3" },
     { name = "jupyter", marker = "extra == 'examples'", specifier = ">=1.0.0" },
     { name = "line-profiler", marker = "extra == 'dev'", specifier = ">=5.0.1" },
     { name = "mach-beamform", marker = "extra == 'examples'", specifier = ">=0.0.4" },
@@ -1034,15 +1032,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
 ]
 
-[[package]]
-name = "joblib"
-version = "1.5.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" },
-]
-
 [[package]]
 name = "json5"
 version = "0.12.1"

From 356e35896190e00dfb5bcf2f02749590e00ff1ab Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 14:10:24 -0500
Subject: [PATCH 13/31] Disable GPU memory estimation by default and add
 logging for experimental feature

---
 fullwave/solver/solver.py | 249 ++++++++++++++++++++++++++------------
 1 file changed, 170 insertions(+), 79 deletions(-)

diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 0d901ef..e30ca2d 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -765,7 +765,7 @@ def run(
         release_after_write: bool = False,
         highpass_cutoff_mhz: float | None = None,
         bandpass_cutoff_mhz: tuple[float, float] | None = None,
-        gpu_memory_estimate: bool = True,
+        gpu_memory_estimate: bool = False,
     ) -> NDArray[np.float64] | Path:
         r"""Run the fullwave simulation and return the result as a NumPy array.
 
@@ -1005,6 +1005,8 @@ def _estimate_gpu_memory(
             differ from ``self.sensor`` when ``record_whole_domain=True``).
 
         """
+        # show that this is an experimental feature
+        logger.info("Estimating GPU memory usage... (experimental feature, may be inaccurate)")
         device_ids = self.fullwave_launcher.cuda_device_id.split(",")
         n_gpus = len(device_ids)
 
@@ -1014,108 +1016,64 @@ def _estimate_gpu_memory(
 
         depth = grid.nx
         lateral = grid.ny * grid.nz if self.is_3d else grid.ny
-        halo_depth = 8  # ghost cells per side for stencil exchange
+        halo_depth = 8
 
-        float_sz = 4  # bytes per float32
-        int_sz = 4  # bytes per int32
+        float_bytes = 4
+        int_bytes = 4
 
-        # Sound-speed range determines the number of derivative-map levels
         c_map = medium.sound_speed
         c_range = int(np.rint(c_map.max()) - np.rint(c_map.min()))
         n_deriv_levels = 1 if c_range == 0 else c_range + 1
 
-        n_time_steps_source = source.icmat.shape[1]
+        n_source_timesteps = source.icmat.shape[1]
+
+        gb = 1024.0**3
 
-        # --- Per-GPU domain decomposition along depth ---
         base_depth = depth // n_gpus
         remainder = depth % n_gpus
 
         for rank, dev_id in enumerate(device_ids):
-            # First `remainder` ranks get one extra depth slice
-            depth_this_gpu = base_depth + (1 if rank < remainder else 0)
+            depth_this = base_depth + (1 if rank < remainder else 0)
 
-            # Halo regions: first/last GPU has 1 side, middle GPUs have 2
             if n_gpus == 1:
                 n_halo_sides = 0
             elif rank == 0 or rank == n_gpus - 1:
                 n_halo_sides = 1
             else:
                 n_halo_sides = 2
-            depth_with_halo = depth_this_gpu + n_halo_sides * halo_depth
-
-            slab = depth_with_halo * lateral
-
-            # Approximate per-GPU source / sensor counts
-            src_this_gpu = max(source.n_sources // n_gpus, 0)
-            sen_this_gpu = max(sensor.n_sensors // n_gpus, 0)
-
-            # --- Wave-field storage (pressure + 2 velocity, 2 time levels) ---
-            wave_fields = 3 * 2 * slab * float_sz
-
-            # --- Material property maps (3 maps) ---
-            material_maps = 3 * slab * float_sz
+            local_depth = depth_this + n_halo_sides * halo_depth
+            slab = local_depth * lateral
 
-            # --- Stencil coefficient storage ---
-            stencil_storage = 9 * 2 * n_deriv_levels * float_sz + slab * int_sz
-
-            # --- Source injection ---
-            if src_this_gpu > 0:
-                if self.save_gpu_memory:
-                    source_storage = src_this_gpu * float_sz
-                else:
-                    source_storage = src_this_gpu * n_time_steps_source * float_sz
-                source_storage += 2 * src_this_gpu * int_sz
-            else:
-                source_storage = 0
-
-            # --- Output / sensor gathering ---
-            if sen_this_gpu > 0:
-                sensor_storage = (
-                    sen_this_gpu * float_sz + 3 * sen_this_gpu * int_sz + sen_this_gpu * int_sz
-                )
-            else:
-                sensor_storage = 0
+            n_sources = max(source.n_sources // n_gpus, 0)
+            n_sensors = max(sensor.n_sensors // n_gpus, 0)
 
             if self.use_exponential_attenuation:
-                attenuation_map = slab * float_sz
-                total = (
-                    wave_fields
-                    + material_maps
-                    + attenuation_map
-                    + stencil_storage
-                    + source_storage
-                    + sensor_storage
+                total = self._mem_exponential(
+                    slab,
+                    n_deriv_levels,
+                    n_sources,
+                    n_source_timesteps,
+                    save_gpu_memory=self.save_gpu_memory,
+                    n_sensors=n_sensors,
+                    float_bytes=float_bytes,
+                    int_bytes=int_bytes,
+                    is_3d=self.is_3d,
                 )
             else:
-                n_relax = self.n_relax_mechanisms
-                # Relaxation memory fields (pressure + velocity directions,
-                # each with N_relax mechanisms and 2 time levels)
-                relaxation_fields = 2 * n_relax * 2 * slab * float_sz * 2
-                # PML absorption / dispersion coefficient maps
-                pml_coefficients = (
-                    2 * slab * float_sz
-                    + 2 * n_relax * slab * float_sz
-                    + 2 * n_relax * slab * float_sz
-                )
-                # Kernel dispatch tables (negligible but included)
-                dispatch_tables = 8 * n_relax * 8
-                # Zero-pressure (air) coordinate storage
-                n_air = medium.n_air
-                air_storage = 2 * n_air * int_sz if n_air > 0 else 0
-
-                total = (
-                    wave_fields
-                    + material_maps
-                    + relaxation_fields
-                    + pml_coefficients
-                    + dispatch_tables
-                    + stencil_storage
-                    + source_storage
-                    + air_storage
-                    + sensor_storage
+                total = self._mem_relaxation(
+                    slab,
+                    n_deriv_levels,
+                    n_sources,
+                    n_source_timesteps,
+                    save_gpu_memory=self.save_gpu_memory,
+                    n_air=medium.n_air,
+                    n_sensors=n_sensors,
+                    n_relax=self.n_relax_mechanisms,
+                    float_bytes=float_bytes,
+                    int_bytes=int_bytes,
+                    is_3d=self.is_3d,
                 )
 
-            gb = 1024.0**3
             mode = "exponential" if self.use_exponential_attenuation else "relaxation"
             saving = ", save_gpu_memory=True" if self.save_gpu_memory else ""
             logger.info(
@@ -1125,11 +1083,144 @@ def _estimate_gpu_memory(
                 mode,
                 saving,
                 total / gb,
-                depth_this_gpu,
+                depth_this,
                 n_halo_sides * halo_depth,
                 lateral,
             )
 
+    @staticmethod
+    def _mem_exponential(
+        slab: int,
+        n_deriv_levels: int,
+        n_sources: int,
+        n_source_timesteps: int,
+        *,
+        save_gpu_memory: bool,
+        n_sensors: int,
+        float_bytes: int,
+        int_bytes: int,
+        is_3d: bool,
+    ) -> int:
+        """Return estimated GPU bytes for exponential-attenuation solver.
+
+        Parameters
+        ----------
+        slab : int
+            Grid points per GPU slab (local_depth * lateral).
+        n_deriv_levels : int
+            Number of derivative-map levels.
+        n_sources : int
+            Approximate source count on this GPU.
+        n_source_timesteps : int
+            Number of source time steps.
+        save_gpu_memory : bool
+            Whether memory-saving mode is active.
+        n_sensors : int
+            Approximate sensor count on this GPU.
+        float_bytes : int
+            Bytes per float (4).
+        int_bytes : int
+            Bytes per int (4).
+        is_3d : bool
+            Whether the simulation is 3D (a 3D run counts one more wave field).
+
+        Returns
+        -------
+        int
+            Total estimated bytes.
+
+        """
+        fb = float_bytes
+        ib = int_bytes
+        # fields: 3 wave-field pairs (2 time levels each)
+        mem = 4 * 2 * slab * fb if is_3d else 3 * 2 * slab * fb
+        # material: 3 property maps + 1 attenuation map
+        mem += 4 * slab * fb
+        # stencil
+        mem += 9 * 2 * n_deriv_levels * fb + slab * ib
+        # source
+        if n_sources > 0:
+            mem += n_sources * fb if save_gpu_memory else n_sources * n_source_timesteps * fb
+            mem += 2 * n_sources * ib
+        # sensor
+        if n_sensors > 0:
+            mem += n_sensors * fb + 3 * n_sensors * ib + n_sensors * ib
+        return mem
+
+    @staticmethod
+    def _mem_relaxation(
+        slab: int,
+        n_deriv_levels: int,
+        n_sources: int,
+        n_source_timesteps: int,
+        *,
+        save_gpu_memory: bool,
+        n_air: int,
+        n_sensors: int,
+        n_relax: int,
+        float_bytes: int,
+        int_bytes: int,
+        is_3d: bool,
+    ) -> int:
+        """Return estimated GPU bytes for relaxation (power-law) solver.
+
+        Parameters
+        ----------
+        slab : int
+            Grid points per GPU slab (local_depth * lateral).
+        n_deriv_levels : int
+            Number of derivative-map levels.
+        n_sources : int
+            Approximate source count on this GPU.
+        n_source_timesteps : int
+            Number of source time steps.
+        save_gpu_memory : bool
+            Whether memory-saving mode is active.
+        n_air : int
+            Number of zero-pressure (air) coordinates.
+        n_sensors : int
+            Approximate sensor count on this GPU.
+        n_relax : int
+            Number of relaxation mechanisms.
+        float_bytes : int
+            Bytes per float (4).
+        int_bytes : int
+            Bytes per int (4).
+        is_3d : bool
+            Whether the simulation is 3D (a 3D run counts one more wave field).
+
+        Returns
+        -------
+        int
+            Total estimated bytes.
+
+        """
+        fb = float_bytes
+        ib = int_bytes
+        # fields: 3 wave-field pairs (2 time levels each)
+        mem = 4 * 2 * slab * fb if is_3d else 3 * 2 * slab * fb
+        # relaxation memory: pressure + velocity
+        mem += (2 * n_relax * 2 * slab * fb) * 12 if is_3d else (2 * n_relax * slab * fb) * 8
+        # material: 3 property maps
+        mem += 3 * slab * fb
+        # a/b coefficients
+        mem += (2 + 4 * n_relax) * slab * fb
+        # pointer tables
+        mem += 8 * n_relax * 8
+        # stencil
+        mem += 9 * 2 * n_deriv_levels * fb + slab * ib
+        # source
+        if n_sources > 0:
+            mem += n_sources * fb if save_gpu_memory else n_sources * n_source_timesteps * fb
+            mem += 2 * n_sources * ib
+        # air
+        if n_air > 0:
+            mem += 2 * n_air * ib
+        # sensor
+        if n_sensors > 0:
+            mem += n_sensors * fb + 3 * n_sensors * ib + n_sensors * ib
+        return mem
+
     def print_info(self) -> None:
         """Print the Solver instance information."""
         print(str(self))

From aa832ccb76b4af9fed2edd9f7118e6938edb9ea0 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 21:27:18 -0500
Subject: [PATCH 14/31] update GPU memory estimation

---
 fullwave/solver/solver.py                |  60 +++--
 ruff.toml                                |   1 +
 tests/solver/test_gpu_memory_estimate.py | 291 +++++++++++++++++++++--
 3 files changed, 310 insertions(+), 42 deletions(-)

diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index e30ca2d..8e53ce6 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -1046,6 +1046,7 @@ def _estimate_gpu_memory(
 
             n_sources = max(source.n_sources // n_gpus, 0)
             n_sensors = max(sensor.n_sensors // n_gpus, 0)
+            n_air_local = max(medium.n_air // n_gpus, 0)
 
             if self.use_exponential_attenuation:
                 total = self._mem_exponential(
@@ -1066,7 +1067,7 @@ def _estimate_gpu_memory(
                     n_sources,
                     n_source_timesteps,
                     save_gpu_memory=self.save_gpu_memory,
-                    n_air=medium.n_air,
+                    n_air=n_air_local,
                     n_sensors=n_sensors,
                     n_relax=self.n_relax_mechanisms,
                     float_bytes=float_bytes,
@@ -1132,19 +1133,24 @@ def _mem_exponential(
         """
         fb = float_bytes
         ib = int_bytes
-        # fields: 3 wave-field pairs (2 time levels each)
-        mem = 4 * 2 * slab * fb if is_3d else 3 * 2 * slab * fb
-        # material: 3 property maps + 1 attenuation map
+        ndim = 3 if is_3d else 2
+        n_fields = 4 if is_3d else 3  # p, u, [v], w
+
+        # wave fields: n_fields pairs x 2 time levels
+        mem = n_fields * 2 * slab * fb
+        # material: rho + K + beta + a_exp
         mem += 4 * slab * fb
-        # stencil
+        # derivative maps (dmap + dcmap)
         mem += 9 * 2 * n_deriv_levels * fb + slab * ib
-        # source
+        # source (icmat + coords)
         if n_sources > 0:
             mem += n_sources * fb if save_gpu_memory else n_sources * n_source_timesteps * fb
-            mem += 2 * n_sources * ib
-        # sensor
+            mem += ndim * n_sources * ib
+        # sensor (genoutframe + coordsout_local + p_idx_array)
         if n_sensors > 0:
-            mem += n_sensors * fb + 3 * n_sensors * ib + n_sensors * ib
+            mem += n_sensors * fb
+            mem += (ndim + 1) * n_sensors * ib
+            mem += n_sensors * ib
         return mem
 
     @staticmethod
@@ -1177,7 +1183,7 @@ def _mem_relaxation(
         save_gpu_memory : bool
             Whether memory-saving mode is active.
         n_air : int
-            Number of zero-pressure (air) coordinates.
+            Number of zero-pressure (air) coordinates on this GPU.
         n_sensors : int
             Approximate sensor count on this GPU.
         n_relax : int
@@ -1197,28 +1203,34 @@ def _mem_relaxation(
         """
         fb = float_bytes
         ib = int_bytes
-        # fields: 3 wave-field pairs (2 time levels each)
-        mem = 4 * 2 * slab * fb if is_3d else 3 * 2 * slab * fb
-        # relaxation memory: pressure + velocity
-        mem += (2 * n_relax * 2 * slab * fb) * 12 if is_3d else (2 * n_relax * slab * fb) * 8
-        # material: 3 property maps
+        ndim = 3 if is_3d else 2
+        n_fields = 4 if is_3d else 3  # p, u, [v], w
+
+        # wave fields: n_fields pairs x 2 time levels
+        mem = n_fields * 2 * slab * fb
+        # relaxation psi arrays: 2 x (ndim x n_relax x 2) slab-sized float arrays
+        mem += 2 * (ndim * n_relax * 2 * slab * fb)
+        # material: rho + K + beta
         mem += 3 * slab * fb
-        # a/b coefficients
-        mem += (2 + 4 * n_relax) * slab * fb
-        # pointer tables
-        mem += 8 * n_relax * 8
-        # stencil
+        # kappa: 2 arrays (kappa_x1, kappa_x2)
+        mem += 2 * slab * fb
+        # PML: pml_x1 + pml_x2, each has 2 * n_relax arrays
+        mem += 2 * (2 * n_relax) * slab * fb
+        # derivative maps (dmap + dcmap)
         mem += 9 * 2 * n_deriv_levels * fb + slab * ib
-        # source
+        # source (icmat + coords)
         if n_sources > 0:
             mem += n_sources * fb if save_gpu_memory else n_sources * n_source_timesteps * fb
-            mem += 2 * n_sources * ib
+            mem += ndim * n_sources * ib
+
         # air
         if n_air > 0:
-            mem += 2 * n_air * ib
+            mem += ndim * n_air * ib
         # sensor
         if n_sensors > 0:
-            mem += n_sensors * fb + 3 * n_sensors * ib + n_sensors * ib
+            mem += n_sensors * fb
+            mem += (ndim + 1) * n_sensors * ib
+            mem += n_sensors * ib
         return mem
 
     def print_info(self) -> None:
diff --git a/ruff.toml b/ruff.toml
index c65d0b6..f6b2d84 100644
--- a/ruff.toml
+++ b/ruff.toml
@@ -77,6 +77,7 @@ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 "tests/**" = [
     "D100",
     "D101",
+    "D102",
     "D103",
     "D104",
     "D107",
diff --git a/tests/solver/test_gpu_memory_estimate.py b/tests/solver/test_gpu_memory_estimate.py
index 5ad2fec..d9ebed5 100644
--- a/tests/solver/test_gpu_memory_estimate.py
+++ b/tests/solver/test_gpu_memory_estimate.py
@@ -68,6 +68,13 @@ def _build_solver(
     )
 
 
+def _gpu_log_calls(mock_logger):
+    """Return only per-GPU log calls (skip the experimental feature message)."""
+    return [
+        call for call in mock_logger.info.call_args_list if "GPU memory estimate" in str(call[0][0])
+    ]
+
+
 @pytest.fixture(autouse=True)
 def _patch_binary(monkeypatch):
     monkeypatch.setattr(
@@ -84,9 +91,10 @@ def test_single_gpu_relaxation_logs_info(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    mock_logger.info.assert_called_once()
-    fmt = mock_logger.info.call_args[0][0]
-    mode_arg = mock_logger.info.call_args[0][2]
+    gpu_calls = _gpu_log_calls(mock_logger)
+    assert len(gpu_calls) == 1
+    fmt = gpu_calls[0][0][0]
+    mode_arg = gpu_calls[0][0][2]
     assert "GPU memory estimate" in fmt
     assert mode_arg == "relaxation"
 
@@ -98,8 +106,9 @@ def test_single_gpu_no_halo(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
+    gpu_calls = _gpu_log_calls(mock_logger)
     # halo argument: dev_id, mode, saving, GB, depth, halo, lateral
-    halo_value = mock_logger.info.call_args[0][6]
+    halo_value = gpu_calls[0][0][6]
     assert halo_value == 0
 
 
@@ -110,7 +119,8 @@ def test_estimate_is_positive(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    gb_value = mock_logger.info.call_args[0][4]
+    gpu_calls = _gpu_log_calls(mock_logger)
+    gb_value = gpu_calls[0][0][4]
     assert gb_value > 0
 
 
@@ -120,11 +130,11 @@ def test_save_gpu_memory_reduces_estimate(tmp_path):
 
     with patch("fullwave.solver.solver.logger") as mock_no_save:
         solver_no_save._estimate_gpu_memory(solver_no_save.pml_builder.extended_sensor)
-    gb_no_save = mock_no_save.info.call_args[0][4]
+    gb_no_save = _gpu_log_calls(mock_no_save)[0][0][4]
 
     with patch("fullwave.solver.solver.logger") as mock_save:
         solver_save._estimate_gpu_memory(solver_save.pml_builder.extended_sensor)
-    gb_save = mock_save.info.call_args[0][4]
+    gb_save = _gpu_log_calls(mock_save)[0][0][4]
 
     assert gb_save <= gb_no_save
 
@@ -136,7 +146,8 @@ def test_save_gpu_memory_label_in_log(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    saving_arg = mock_logger.info.call_args[0][3]
+    gpu_calls = _gpu_log_calls(mock_logger)
+    saving_arg = gpu_calls[0][0][3]
     assert "save_gpu_memory" in saving_arg
 
 
@@ -147,7 +158,8 @@ def test_multi_gpu_two_devices_logs_per_device(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    assert mock_logger.info.call_count == 2
+    gpu_calls = _gpu_log_calls(mock_logger)
+    assert len(gpu_calls) == 2
 
 
 def test_multi_gpu_two_devices_each_has_one_halo_side(tmp_path):
@@ -157,7 +169,8 @@ def test_multi_gpu_two_devices_each_has_one_halo_side(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    for call in mock_logger.info.call_args_list:
+    gpu_calls = _gpu_log_calls(mock_logger)
+    for call in gpu_calls:
         halo_value = call[0][6]
         assert halo_value == 8  # 1 side * 8 ghost cells
 
@@ -169,11 +182,11 @@ def test_multi_gpu_three_devices_middle_has_two_halo_sides(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    assert mock_logger.info.call_count == 3
-    calls = mock_logger.info.call_args_list
-    assert calls[0][0][6] == 8  # first GPU: 1 halo side
-    assert calls[1][0][6] == 16  # middle GPU: 2 halo sides
-    assert calls[2][0][6] == 8  # last GPU: 1 halo side
+    gpu_calls = _gpu_log_calls(mock_logger)
+    assert len(gpu_calls) == 3
+    assert gpu_calls[0][0][6] == 8  # first GPU: 1 halo side
+    assert gpu_calls[1][0][6] == 16  # middle GPU: 2 halo sides
+    assert gpu_calls[2][0][6] == 8  # last GPU: 1 halo side
 
 
 def test_multi_gpu_splits_depth(tmp_path):
@@ -184,7 +197,249 @@ def test_multi_gpu_splits_depth(tmp_path):
     with patch("fullwave.solver.solver.logger") as mock_logger:
         solver._estimate_gpu_memory(sensor)
 
-    calls = mock_logger.info.call_args_list
-    depth_gpu0 = calls[0][0][5]
-    depth_gpu1 = calls[1][0][5]
+    gpu_calls = _gpu_log_calls(mock_logger)
+    depth_gpu0 = gpu_calls[0][0][5]
+    depth_gpu1 = gpu_calls[1][0][5]
     assert depth_gpu0 + depth_gpu1 == total_depth
+
+
+# ---------------------------------------------------------------------------
+# Direct formula-verification tests for _mem_exponential / _mem_relaxation
+# ---------------------------------------------------------------------------
+
+# Shared test parameters
+_SLAB = 1000
+_NDLEV = 5
+_NSRC = 100
+_NTIC = 200
+_NSEN = 50
+_NRELAX = 2
+_NAIR = 30
+_FB = 4  # float_bytes
+_IB = 4  # int_bytes
+
+
+class TestMemExponentialFormula:
+    """Verify _mem_exponential returns byte counts matching the C formulas."""
+
+    def test_3d_matches_c_formula(self):
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields (p, u, v, w) x 2 time levels
+            + 4 * _SLAB * _FB  # material: rho, K, beta, a_exp
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat (no save)
+            + 3 * _NSRC * _IB  # source coords (3D)
+            + _NSEN * _FB  # genoutframe
+            + (3 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+    def test_2d_matches_c_formula(self):
+        expected = (
+            3 * 2 * _SLAB * _FB  # fields (p, u, w) x 2 time levels
+            + 4 * _SLAB * _FB  # material
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat
+            + 2 * _NSRC * _IB  # source coords (2D)
+            + _NSEN * _FB  # genoutframe
+            + (2 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=False,
+        )
+        assert result == expected
+
+    def test_save_gpu_memory(self):
+        """save_gpu_memory uses single-slice icmat (n_src x fb) instead of full."""
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=True,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        result_no_save = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=_NSEN,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        # Difference should be exactly the saved icmat bytes
+        saved = _NSRC * _NTIC * _FB - _NSRC * _FB
+        assert result_no_save - result == saved
+
+    def test_no_sources_no_sensors(self):
+        """Source and sensor terms are zero when counts are 0."""
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields
+            + 4 * _SLAB * _FB  # material
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+        )
+        result = fullwave.Solver._mem_exponential(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=0,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_sensors=0,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+
+class TestMemRelaxationFormula:
+    """Verify _mem_relaxation returns byte counts matching the C formulas."""
+
+    def test_3d_matches_c_formula(self):
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields (p, u, v, w) x 2 time levels
+            + 2 * (3 * _NRELAX * 2 * _SLAB * _FB)  # psi arrays
+            + 3 * _SLAB * _FB  # material: rho, K, beta
+            + 2 * _SLAB * _FB  # kappa
+            + 2 * (2 * _NRELAX) * _SLAB * _FB  # PML
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat (no save)
+            + 3 * _NSRC * _IB  # source coords (3D)
+            + 3 * _NAIR * _IB  # air coords (3D)
+            + _NSEN * _FB  # genoutframe
+            + (3 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+    def test_2d_matches_c_formula(self):
+        expected = (
+            3 * 2 * _SLAB * _FB  # fields (p, u, w) x 2 time levels
+            + 2 * (2 * _NRELAX * 2 * _SLAB * _FB)  # psi arrays (ndim=2)
+            + 3 * _SLAB * _FB  # material
+            + 2 * _SLAB * _FB  # kappa
+            + 2 * (2 * _NRELAX) * _SLAB * _FB  # PML
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+            + _NSRC * _NTIC * _FB  # source icmat
+            + 2 * _NSRC * _IB  # source coords (2D)
+            + 2 * _NAIR * _IB  # air coords (2D)
+            + _NSEN * _FB  # genoutframe
+            + (2 + 1) * _NSEN * _IB  # coordsout_local
+            + _NSEN * _IB  # p_idx_array
+        )
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=False,
+        )
+        assert result == expected
+
+    def test_no_air_no_sources_no_sensors(self):
+        """Conditional terms are zero when counts are 0."""
+        expected = (
+            4 * 2 * _SLAB * _FB  # fields
+            + 2 * (3 * _NRELAX * 2 * _SLAB * _FB)  # psi
+            + 3 * _SLAB * _FB  # material
+            + 2 * _SLAB * _FB  # kappa
+            + 2 * (2 * _NRELAX) * _SLAB * _FB  # PML
+            + 9 * 2 * _NDLEV * _FB
+            + _SLAB * _IB  # dmap + dcmap
+        )
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=0,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=0,
+            n_sensors=0,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        assert result == expected
+
+    def test_save_gpu_memory(self):
+        """save_gpu_memory uses single-slice icmat (n_src x fb) instead of full."""
+        result = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=True,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        result_no_save = fullwave.Solver._mem_relaxation(
+            slab=_SLAB,
+            n_deriv_levels=_NDLEV,
+            n_sources=_NSRC,
+            n_source_timesteps=_NTIC,
+            save_gpu_memory=False,
+            n_air=_NAIR,
+            n_sensors=_NSEN,
+            n_relax=_NRELAX,
+            float_bytes=_FB,
+            int_bytes=_IB,
+            is_3d=True,
+        )
+        saved = _NSRC * _NTIC * _FB - _NSRC * _FB
+        assert result_no_save - result == saved

From ee33526a4f53ca7ffd5cf559d3fc02c21650c2aa Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 21:32:26 -0500
Subject: [PATCH 15/31] =?UTF-8?q?Bump=20version:=201.2.6-dev3=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index d0cf5ad..e6f6ab8 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev3"
+current_version = "1.2.6-dev4"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index 0400fa0..bbf90e0 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev3"
+    __version__ = "1.2.6-dev4"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index bf2f35f..2630eea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev3" # Update via bump-my-version, not manually
+version = "1.2.6-dev4" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index a3cb3fd..9ae6958 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev3"
+version = "1.2.6.dev4"
 source = { editable = "." }
 dependencies = [
     { name = "matplotlib" },

From 5bcfb5bc2debf16d7c3e1a3d1470f96f6107904f Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 21:36:37 -0500
Subject: [PATCH 16/31] Set default value of gpu_memory_estimate to True in
 Solver class

---
 fullwave/solver/solver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 8e53ce6..46bbb3d 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -765,7 +765,7 @@ def run(
         release_after_write: bool = False,
         highpass_cutoff_mhz: float | None = None,
         bandpass_cutoff_mhz: tuple[float, float] | None = None,
-        gpu_memory_estimate: bool = False,
+        gpu_memory_estimate: bool = True,
     ) -> NDArray[np.float64] | Path:
         r"""Run the fullwave simulation and return the result as a NumPy array.
 

From d93db484b65bcb3f1b26acc1ae58b4f998a64e19 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 26 Feb 2026 21:36:43 -0500
Subject: [PATCH 17/31] =?UTF-8?q?Bump=20version:=201.2.6-dev4=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index e6f6ab8..9d28346 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev4"
+current_version = "1.2.6-dev5"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index bbf90e0..fdc07e7 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev4"
+    __version__ = "1.2.6-dev5"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 2630eea..d43c440 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev4" # Update via bump-my-version, not manually
+version = "1.2.6-dev5" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index 9ae6958..759c91c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev4"
+version = "1.2.6.dev5"
 source = { editable = "." }
 dependencies = [
     { name = "matplotlib" },

From e66c506c1ebc16398b8fa33904b36d17e1e5b930 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 10:48:14 -0500
Subject: [PATCH 18/31] Add GPU support for PML computation using CuPy in
 PMLBuilder and Solver classes

---
 fullwave/solver/pml_builder.py | 458 ++++++++++++++++++++++-----------
 fullwave/solver/solver.py      |   8 +
 2 files changed, 318 insertions(+), 148 deletions(-)

diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index de52885..13773de 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -1,44 +1,96 @@
 """Perfectly Matched Layer (PML) setup for Fullwave."""
 
+from __future__ import annotations
+
 import concurrent.futures
 import logging
 from collections import OrderedDict
 from dataclasses import dataclass, field
 from functools import cached_property
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 import matplotlib.pyplot as plt
 import numexpr as ne
 import numpy as np
-from numpy.typing import NDArray
 
 import fullwave
+
+if TYPE_CHECKING:
+    from types import ModuleType
+
+    from numpy.typing import NDArray
 from fullwave.solver.utils import initialize_relaxation_param_dict
 from fullwave.utils import check_functions, plot_utils
 
 logger = logging.getLogger("__main__." + __name__)
 
+_CUPY_AVAILABLE: bool | None = None
+
+
+def _check_cupy() -> bool:
+    """Return True if CuPy is importable; result is cached after the first call."""
+    global _CUPY_AVAILABLE  # noqa: PLW0603
+    if _CUPY_AVAILABLE is None:
+        try:
+            import cupy  # noqa: F401, PLC0415
+
+            _CUPY_AVAILABLE = True
+        except ImportError:
+            _CUPY_AVAILABLE = False
+    return _CUPY_AVAILABLE
+
+
+def _get_array_module(*, use_gpu: bool) -> ModuleType:
+    """Return ``cupy`` when *use_gpu* is True and CuPy is available, else ``numpy``."""
+    if use_gpu and _check_cupy():
+        import cupy  # noqa: PLC0415
 
-def _smooth_transition_function_part(x: NDArray[np.float64]) -> NDArray[np.float64]:
-    return np.where(x > 0, np.exp(-1 / (x + 1e-20)), 0)
+        return cupy
+    return np
 
 
-def _smooth_transition_function(x: NDArray[np.float64]) -> NDArray[np.float64]:
-    return _smooth_transition_function_part(x) / (
-        _smooth_transition_function_part(x) + _smooth_transition_function_part(1 - x)
+def _smooth_transition_function_part(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,
+) -> NDArray[np.float64]:
+    """Smooth bump helper (works with numpy or cupy)."""
+    return xp.where(x > 0, xp.exp(-1 / (x + 1e-20)), 0)
+
+
+def _smooth_transition_function(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,
+) -> NDArray[np.float64]:
+    """Smooth transition function (works with numpy or cupy)."""
+    return _smooth_transition_function_part(x, xp=xp) / (
+        _smooth_transition_function_part(x, xp=xp) + _smooth_transition_function_part(1 - x, xp=xp)
     )
 
 
-def _linear_transition_function(x: NDArray[np.float64]) -> NDArray[np.float64]:
+def _linear_transition_function(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,  # noqa: ARG001
+) -> NDArray[np.float64]:
+    """Linear transition function."""
     return x
 
 
-def _n_th_deg_polynomial_function(x: NDArray[np.float64], n: int = 2) -> NDArray[np.float64]:
+def _n_th_deg_polynomial_function(
+    x: NDArray[np.float64],
+    n: int = 2,
+    xp: ModuleType = np,  # noqa: ARG001
+) -> NDArray[np.float64]:
+    """N-th degree polynomial transition function."""
     return x**n
 
 
-def _cosine_transition_function(x: NDArray[np.float64]) -> NDArray[np.float64]:
-    return 0.5 * (1 - np.cos(np.pi * x))
+def _cosine_transition_function(
+    x: NDArray[np.float64],
+    xp: ModuleType = np,
+) -> NDArray[np.float64]:
+    """Cosine transition function (works with numpy or cupy)."""
+    return 0.5 * (1 - xp.cos(xp.pi * x))
 
 
 def _obtain_relax_var_rename_dict(
@@ -113,7 +165,7 @@ class PMLBuilder:
     pml_mask_x: NDArray[np.float64] = field(init=False)
     pml_mask_y: NDArray[np.float64] = field(init=False)
 
-    def __init__(
+    def __init__(  # noqa: PLR0912
         self,
         grid: fullwave.Grid,
         medium: fullwave.Medium,
@@ -124,6 +176,7 @@ def __init__(
         n_pml_layer: int = 40,
         n_transition_layer: int = 40,
         use_isotropic_relaxation: bool = False,
+        use_gpu: bool = False,
         # pml_alpha_target: float = 1.1,
         # pml_alpha_power_target: float = 1.6,
         # pml_strength_factor: float = 2.0,
@@ -158,6 +211,9 @@ def __init__(
             This option omits the anisotropic relaxation mechanisms to model the attenuation.
             We usually recommend using isotropic relaxation mechanisms
             unless the anisotropic attenuation is required for the simulation.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated PML computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_instance(
@@ -184,6 +240,15 @@ def __init__(
         self.is_3d = grid.is_3d
         self.use_isotropic_relaxation = use_isotropic_relaxation
 
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("PMLBuilder: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "PMLBuilder: use_gpu=True but CuPy is not available. Falling back to CPU (numpy)."
+            )
+
         self.m_spatial_order = m_spatial_order
         self.n_pml_layer = n_pml_layer
         self.n_transition_layer = n_transition_layer
@@ -219,40 +284,52 @@ def __init__(
 
         logger.debug("building extended medium for pml...")
         if isinstance(self.medium_org, fullwave.MediumRelaxationMaps):
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future_sound_speed = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.sound_speed,
-                )
-                future_density = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.density,
-                )
-                future_beta = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.beta,
-                )
-                future_alpha_coeff = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_coeff,
-                )
-                future_alpha_power = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_power,
-                )
-                future_relaxation_param_dict = {
-                    key: executor.submit(self._extend_map_for_pml, value)
-                    for key, value in self.medium_org.relaxation_param_dict.items()
-                }
-
-                extended_sound_speed = future_sound_speed.result()
-                extended_density = future_density.result()
-                extended_beta = future_beta.result()
-                extended_alpha_coeff = future_alpha_coeff.result()
-                extended_alpha_power = future_alpha_power.result()
+            if self.xp is not np:
+                # GPU path: run sequentially to avoid CuPy multi-thread issues
+                extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
+                extended_density = self._extend_map_for_pml(self.medium_org.density)
+                extended_beta = self._extend_map_for_pml(self.medium_org.beta)
+                extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
+                extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
                 extended_relaxation_param_dict = {
-                    key: future.result() for key, future in future_relaxation_param_dict.items()
+                    key: self._extend_map_for_pml(value)
+                    for key, value in self.medium_org.relaxation_param_dict.items()
                 }
+            else:
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future_sound_speed = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.sound_speed,
+                    )
+                    future_density = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.density,
+                    )
+                    future_beta = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.beta,
+                    )
+                    future_alpha_coeff = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.alpha_coeff,
+                    )
+                    future_alpha_power = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.alpha_power,
+                    )
+                    future_relaxation_param_dict = {
+                        key: executor.submit(self._extend_map_for_pml, value)
+                        for key, value in self.medium_org.relaxation_param_dict.items()
+                    }
+
+                    extended_sound_speed = future_sound_speed.result()
+                    extended_density = future_density.result()
+                    extended_beta = future_beta.result()
+                    extended_alpha_coeff = future_alpha_coeff.result()
+                    extended_alpha_power = future_alpha_power.result()
+                    extended_relaxation_param_dict = {
+                        key: future.result() for key, future in future_relaxation_param_dict.items()
+                    }
 
             self.extended_medium = fullwave.MediumRelaxationMaps(
                 grid=self.extended_grid,
@@ -268,33 +345,41 @@ def __init__(
                 dtype=getattr(self.medium_org, "dtype", np.float64),
             )
         else:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future_sound_speed = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.sound_speed,
-                )
-                future_density = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.density,
-                )
-                future_beta = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.beta,
-                )
-                future_alpha_coeff = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_coeff,
-                )
-                future_alpha_power = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_power,
-                )
-
-                extended_sound_speed = future_sound_speed.result()
-                extended_density = future_density.result()
-                extended_beta = future_beta.result()
-                extended_alpha_coeff = future_alpha_coeff.result()
-                extended_alpha_power = future_alpha_power.result()
+            if self.xp is not np:
+                # GPU path: run sequentially to avoid CuPy multi-thread issues
+                extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
+                extended_density = self._extend_map_for_pml(self.medium_org.density)
+                extended_beta = self._extend_map_for_pml(self.medium_org.beta)
+                extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
+                extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
+            else:
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future_sound_speed = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.sound_speed,
+                    )
+                    future_density = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.density,
+                    )
+                    future_beta = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.beta,
+                    )
+                    future_alpha_coeff = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.alpha_coeff,
+                    )
+                    future_alpha_power = executor.submit(
+                        self._extend_map_for_pml,
+                        self.medium_org.alpha_power,
+                    )
+
+                    extended_sound_speed = future_sound_speed.result()
+                    extended_density = future_density.result()
+                    extended_beta = future_beta.result()
+                    extended_alpha_coeff = future_alpha_coeff.result()
+                    extended_alpha_power = future_alpha_power.result()
             self.extended_medium = fullwave.Medium(
                 grid=self.extended_grid,
                 sound_speed=extended_sound_speed,
@@ -456,22 +541,30 @@ def _extend_map_for_pml(
         *,
         fill_edge: bool = True,
     ) -> NDArray[np.float64 | np.int64 | np.bool_]:
-        """Fast version using pre-allocation and direct assignment instead of np.pad."""
+        """Fast version using pre-allocation and direct assignment instead of np.pad.
+
+        When ``self.use_gpu`` is True and CuPy is available, the computation
+        runs on the GPU and the result is returned as a numpy array.
+        """
+        xp = self.xp
         pad = self.num_boundary_points
 
+        # Transfer to GPU if needed
+        input_gpu = xp.asarray(input_map) if xp is not np else input_map
+
         # Pre-allocate output array with correct dtype
         if self.is_3d:
-            nx, ny, nz = input_map.shape
-            output = np.empty((nx + 2 * pad, ny + 2 * pad, nz + 2 * pad), dtype=input_map.dtype)
+            nx, ny, nz = input_gpu.shape
+            output = xp.empty((nx + 2 * pad, ny + 2 * pad, nz + 2 * pad), dtype=input_gpu.dtype)
 
             # Fill center with original data (single copy)
-            output[pad : pad + nx, pad : pad + ny, pad : pad + nz] = input_map
+            output[pad : pad + nx, pad : pad + ny, pad : pad + nz] = input_gpu
 
             if fill_edge:
                 # Fill edges efficiently using broadcasting
                 # X boundaries
-                output[:pad, pad : pad + ny, pad : pad + nz] = input_map[0:1, :, :]
-                output[pad + nx :, pad : pad + ny, pad : pad + nz] = input_map[-1:, :, :]
+                output[:pad, pad : pad + ny, pad : pad + nz] = input_gpu[0:1, :, :]
+                output[pad + nx :, pad : pad + ny, pad : pad + nz] = input_gpu[-1:, :, :]
 
                 # Y boundaries (now includes X corners)
                 output[:, :pad, pad : pad + nz] = output[:, pad : pad + 1, pad : pad + nz]
@@ -493,16 +586,16 @@ def _extend_map_for_pml(
                 output[:, :, :pad] = 0
                 output[:, :, pad + nz :] = 0
         else:  # 2D case
-            nx, ny = input_map.shape
-            output = np.empty((nx + 2 * pad, ny + 2 * pad), dtype=input_map.dtype)
+            nx, ny = input_gpu.shape
+            output = xp.empty((nx + 2 * pad, ny + 2 * pad), dtype=input_gpu.dtype)
 
             # Fill center
-            output[pad : pad + nx, pad : pad + ny] = input_map
+            output[pad : pad + nx, pad : pad + ny] = input_gpu
 
             if fill_edge:
                 # Fill edges
-                output[:pad, pad : pad + ny] = input_map[0:1, :]
-                output[pad + nx :, pad : pad + ny] = input_map[-1:, :]
+                output[:pad, pad : pad + ny] = input_gpu[0:1, :]
+                output[pad + nx :, pad : pad + ny] = input_gpu[-1:, :]
                 output[:, :pad] = output[:, pad : pad + 1]
                 output[:, pad + ny :] = output[:, pad + ny - 1 : pad + ny]
             else:
@@ -511,6 +604,9 @@ def _extend_map_for_pml(
                 output[:, :pad] = 0
                 output[:, pad + ny :] = 0
 
+        # Transfer back to CPU if needed
+        if xp is not np:
+            return xp.asnumpy(output)
         return output
 
     def _localize_pml_region(self) -> tuple[NDArray[np.float64], ...]:
@@ -587,34 +683,43 @@ def _localize_pml_region(self) -> tuple[NDArray[np.float64], ...]:
 
         return pml_mask_x, pml_mask_y
 
-    @staticmethod
     def _calc_a_and_b(
+        self,
         d_x: float | NDArray[np.float64],
         kappa_x: float | NDArray[np.float64],
         alpha_x: float | NDArray[np.float64],
         dt: float | NDArray[np.float64],
         output_dtype: np.dtype | None = None,
     ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-        d_x = np.asarray(d_x, dtype=np.float64)
-        kappa_x = np.asarray(kappa_x, dtype=np.float64)
-        alpha_x = np.asarray(alpha_x, dtype=np.float64)
-        dt = np.asarray(dt, dtype=np.float64)
-
-        eps = np.finfo(np.float64).eps  # noqa: F841
+        xp = self.xp
+        use_gpu = xp is not np
 
-        # b = exp(-(dx/kappa_x + alpha_x) * dt)
-        b = ne.evaluate("exp(-(d_x/kappa_x + alpha_x) * dt)")
+        d_x = xp.asarray(d_x, dtype=xp.float64)
+        kappa_x = xp.asarray(kappa_x, dtype=xp.float64)
+        alpha_x = xp.asarray(alpha_x, dtype=xp.float64)
+        dt = xp.asarray(dt, dtype=xp.float64)
 
-        # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
-        denom = ne.evaluate("kappa_x*(d_x + kappa_x*alpha_x) + eps")  # noqa: F841
+        eps = xp.finfo(xp.float64).eps
 
-        # a = dx/denom*(b - 1)
-        a = ne.evaluate("d_x/denom*(b - 1)")
-
-        if output_dtype is not None and output_dtype != np.float64:
+        if use_gpu:
+            b = xp.exp(-(d_x / kappa_x + alpha_x) * dt)
+            denom = kappa_x * (d_x + kappa_x * alpha_x) + eps
+            a = d_x / denom * (b - 1)
+        else:
+            eps_local = eps  # noqa: F841
+            # b = exp(-(dx/kappa_x + alpha_x) * dt)
+            b = ne.evaluate("exp(-(d_x/kappa_x + alpha_x) * dt)")
+            # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
+            denom = ne.evaluate("kappa_x*(d_x + kappa_x*alpha_x) + eps_local")
+            # a = dx/denom*(b - 1)
+            a = ne.evaluate("d_x/denom*(b - 1)")
+
+        if output_dtype is not None and output_dtype != xp.float64:
             a = a.astype(output_dtype, copy=False)
             b = b.astype(output_dtype, copy=False)
 
+        if use_gpu:
+            return xp.asnumpy(a), xp.asnumpy(b)
         return a, b
 
     def run(self, *, use_pml: bool = True) -> fullwave.MediumRelaxationMaps:
@@ -1237,7 +1342,9 @@ def _apply_transition_and_pml(  # noqa: PLR0912
             layer_offset = 0
 
         # Compute transition function once
-        transition_linspace = np.linspace(0, 1, layer_thickness + 1)
+        xp = self.xp
+        use_gpu = xp is not np
+        transition_linspace = xp.linspace(0, 1, layer_thickness + 1)
         transition_map = {
             "smooth": _smooth_transition_function,
             "linear": _linear_transition_function,
@@ -1256,9 +1363,10 @@ def _apply_transition_and_pml(  # noqa: PLR0912
             transition_function = transition_map[transition_type](
                 transition_linspace,
                 n=n_polynomial,
+                xp=xp,
             )
         else:
-            transition_function = transition_map[transition_type](transition_linspace)
+            transition_function = transition_map[transition_type](transition_linspace, xp=xp)
 
         n_axis_extended = array_shape[axis]
         m_offset = self.m_spatial_order + layer_offset
@@ -1267,10 +1375,18 @@ def _apply_transition_and_pml(  # noqa: PLR0912
         up_end = m_offset + layer_thickness
         down_start = n_axis_extended - m_offset - layer_thickness - 1
 
+        # Transfer to GPU if needed
+        if use_gpu:
+            input_array = xp.asarray(input_array)
+
         # Move axis to 0 for uniform processing
-        working_array = np.moveaxis(input_array, axis, 0)
-        # make working_array writeable
-        working_array.setflags(write=True)
+        working_array = xp.moveaxis(input_array, axis, 0)
+        if not use_gpu:
+            # make working_array writeable (numpy-specific)
+            working_array.setflags(write=True)
+        else:
+            # CuPy arrays are always writeable; ensure contiguous copy
+            working_array = working_array.copy()
 
         # Apply boundary conditions
         working_array[: m_offset + layer_thickness] = value_target
@@ -1303,7 +1419,10 @@ def _apply_transition_and_pml(  # noqa: PLR0912
         working_array[down_start:down_end] = down_vals - trans_down * (down_vals - value_target)
 
         # Move axis back
-        return np.moveaxis(working_array, 0, axis)
+        result = xp.moveaxis(working_array, 0, axis)
+        if use_gpu:
+            return xp.asnumpy(result)
+        return result
 
     @staticmethod
     def _calc_time_constants(
@@ -1470,6 +1589,7 @@ def __init__(
         *,
         m_spatial_order: int = 8,
         n_pml_layer: int = 40,
+        use_gpu: bool = False,
         # n_transition_layer: int = 40,
         # pml_alpha_target: float = 1.1,
         # pml_alpha_power_target: float = 1.6,
@@ -1495,6 +1615,9 @@ def __init__(
             see Pinton, G. (2021) http://arxiv.org/abs/2106.11476 for more detail.
         n_pml_layer : int, optional
             PML layer thickness (default is 40).
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated PML computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
         n_transition_layer : int, optional
             Number of transition layers (default is 40).
         pml_alpha_target : float, optional
@@ -1534,6 +1657,16 @@ def __init__(
         self.sensor_org = sensor
         self.is_3d = grid.is_3d
 
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("PMLBuilderExponentialAttenuation: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "PMLBuilderExponentialAttenuation: use_gpu=True but CuPy is not available. "
+                "Falling back to CPU (numpy)."
+            )
+
         self.m_spatial_order = m_spatial_order
         self.n_pml_layer = n_pml_layer
         # self.n_transition_layer = n_transition_layer
@@ -1569,34 +1702,42 @@ def __init__(
         )
 
         logger.debug("building extended medium for pml...")
-        # run _extend_map_for_pml in parallel for all medium properties since it is a bottleneck
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            future_sound_speed = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.sound_speed,
-            )
-            future_density = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.density,
-            )
-            future_beta = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.beta,
-            )
-            future_alpha_coeff = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.alpha_coeff,
-            )
-            future_alpha_power = executor.submit(
-                self._extend_map_for_pml,
-                self.medium_org.alpha_power,
-            )
+        if self.xp is not np:
+            # GPU path: run sequentially to avoid CuPy multi-thread issues
+            extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
+            extended_density = self._extend_map_for_pml(self.medium_org.density)
+            extended_beta = self._extend_map_for_pml(self.medium_org.beta)
+            extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
+            extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
+        else:
+            # CPU path: run in parallel for all medium properties since it is a bottleneck
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_sound_speed = executor.submit(
+                    self._extend_map_for_pml,
+                    self.medium_org.sound_speed,
+                )
+                future_density = executor.submit(
+                    self._extend_map_for_pml,
+                    self.medium_org.density,
+                )
+                future_beta = executor.submit(
+                    self._extend_map_for_pml,
+                    self.medium_org.beta,
+                )
+                future_alpha_coeff = executor.submit(
+                    self._extend_map_for_pml,
+                    self.medium_org.alpha_coeff,
+                )
+                future_alpha_power = executor.submit(
+                    self._extend_map_for_pml,
+                    self.medium_org.alpha_power,
+                )
 
-            extended_sound_speed = future_sound_speed.result()
-            extended_density = future_density.result()
-            extended_beta = future_beta.result()
-            extended_alpha_coeff = future_alpha_coeff.result()
-            extended_alpha_power = future_alpha_power.result()
+                extended_sound_speed = future_sound_speed.result()
+                extended_density = future_density.result()
+                extended_beta = future_beta.result()
+                extended_alpha_coeff = future_alpha_coeff.result()
+                extended_alpha_power = future_alpha_power.result()
 
         self.extended_medium = fullwave.Medium(
             grid=self.extended_grid,
@@ -1727,8 +1868,7 @@ def run(self, *, use_pml: bool = True) -> fullwave.MediumExponentialAttenuation:
         logger.debug("Extended medium built successfully without applying PML.")
         return extended_medium
 
-    @staticmethod
-    def _mask_body_2d(nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
+    def _mask_body_2d(self, nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
         """Create a mask for the PML region.
 
         Parameters
@@ -1747,28 +1887,37 @@ def _mask_body_2d(nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
             Interior (body) region is 1, PML boundary approaches 0.
 
         """
+        xp = self.xp
+        use_gpu = xp is not np
 
         def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
-            d = np.zeros(n, dtype=np.float32)
+            d = xp.zeros(n, dtype=xp.float32)
             if n_body <= 0:
                 return d
-            d[:n_body] = np.arange(n_body, 0, -1, dtype=np.float32)
-            d[-n_body:] = np.arange(1, n_body + 1, dtype=np.float32)
+            d[:n_body] = xp.arange(n_body, 0, -1, dtype=xp.float32)
+            d[-n_body:] = xp.arange(1, n_body + 1, dtype=xp.float32)
             return d
 
-        rx = edge_distance_1d(nx, n_body)[:, None]  # noqa: F841
-        ry = edge_distance_1d(ny, n_body)[None, :]  # noqa: F841
+        rx = edge_distance_1d(nx, n_body)[:, None]
+        ry = edge_distance_1d(ny, n_body)[None, :]
 
-        mask_sq = ne.evaluate("rx*rx + ry*ry")
+        if use_gpu:
+            mask_sq = rx * rx + ry * ry
+            mmax = float(xp.sqrt(mask_sq.max()))
+            if mmax > 0.0:
+                mask_sq = mask_sq / (mmax * mmax)
+            result = 1 - xp.sqrt(mask_sq)
+            return xp.asnumpy(result)
 
+        rx_np = rx  # noqa: F841
+        ry_np = ry  # noqa: F841
+        mask_sq = ne.evaluate("rx_np*rx_np + ry_np*ry_np")
         mmax = float(np.sqrt(mask_sq.max()))
         if mmax > 0.0:
             mask_sq = ne.evaluate("mask_sq / (mmax*mmax)")
-
         return ne.evaluate("1 - sqrt(mask_sq)")
 
-    @staticmethod
-    def _mask_body_3d(nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]:
+    def _mask_body_3d(self, nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]:
         """Create a mask for the PML region.
 
         Parameters
@@ -1788,21 +1937,34 @@ def _mask_body_3d(nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]
             A 3D numpy array representing the PML mask.
 
         """
+        xp = self.xp
+        use_gpu = xp is not np
 
         def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
-            d = np.zeros(n, dtype=np.float32)
+            d = xp.zeros(n, dtype=xp.float32)
             if n_body <= 0:
                 return d
-            d[:n_body] = np.arange(n_body, 0, -1, dtype=np.float32)
-            d[-n_body:] = np.arange(1, n_body + 1, dtype=np.float32)
+            d[:n_body] = xp.arange(n_body, 0, -1, dtype=xp.float32)
+            d[-n_body:] = xp.arange(1, n_body + 1, dtype=xp.float32)
             return d
 
-        rx = edge_distance_1d(nx, n_body)[:, None, None]  # noqa: F841
-        ry = edge_distance_1d(ny, n_body)[None, :, None]  # noqa: F841
-        rz = edge_distance_1d(nz, n_body)[None, None, :]  # noqa: F841
-
+        rx = edge_distance_1d(nx, n_body)[:, None, None]
+        ry = edge_distance_1d(ny, n_body)[None, :, None]
+        rz = edge_distance_1d(nz, n_body)[None, None, :]
+
+        if use_gpu:
+            mask_sq = rx * rx + ry * ry + rz * rz
+            mmax = float(xp.sqrt(mask_sq.max()))
+            if mmax > 0.0:
+                mask_sq = mask_sq / (mmax * mmax)
+            result = 1 - xp.sqrt(mask_sq)
+            return xp.asnumpy(result)
+
+        rx_np = rx  # noqa: F841
+        ry_np = ry  # noqa: F841
+        rz_np = rz  # noqa: F841
         # 1) compute squared distance with numexpr (no reduction here)
-        mask_sq = ne.evaluate("rx*rx + ry*ry + rz*rz")  # shape (nx, ny, nz), float32
+        mask_sq = ne.evaluate("rx_np*rx_np + ry_np*ry_np + rz_np*rz_np")
 
         # 2) reduction done by NumPy, then scalar sqrt
         mmax = float(np.sqrt(mask_sq.max()))
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 46bbb3d..2410c86 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -330,6 +330,7 @@ def __init__(  # noqa: PLR0912
         cuda_device_id: str | int | list | None = None,
         save_gpu_memory: bool = False,
         verify_gpu: bool = True,
+        use_gpu_pml: bool = False,
     ) -> None:
         """Initialize a Solver instance for the fullwave simulation.
 
@@ -407,6 +408,11 @@ def __init__(  # noqa: PLR0912
             example 2: 2 for using GPU 2 or "2" as a string.
+        use_gpu_pml : bool, optional
+            Whether to use CuPy for GPU-accelerated PML computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
+            This accelerates the PML array padding and transition computations
+            using the GPU, which is especially beneficial for large 3D grids.
         save_gpu_memory : bool, optional
             Whether to save GPU memory by using ICMAT_MEMORY_SAVING flag in the simulation.
             The simulation does not load initial conditions into GPU memory and
             it loads the slice of the wavefield needed for the current time step
             from CPU memory at each time step.
@@ -572,6 +578,7 @@ def __init__(  # noqa: PLR0912
                 sensor=self.sensor,
                 m_spatial_order=m_spatial_order,
                 n_pml_layer=pml_layer_thickness_px,
+                use_gpu=use_gpu_pml,
             )
         else:
             self.pml_builder = PMLBuilder(
@@ -583,6 +590,7 @@ def __init__(  # noqa: PLR0912
                 n_pml_layer=pml_layer_thickness_px,
                 n_transition_layer=n_transition_layer,
                 use_isotropic_relaxation=use_isotropic_relaxation,
+                use_gpu=use_gpu_pml,
             )
 
     @staticmethod

From fd7c6dc05cb0f429e5deb36def1f08c4256005ec Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 11:47:58 -0500
Subject: [PATCH 19/31] Add GPU support and tests for Medium and PMLBuilder
 classes using CuPy

---
 fullwave/medium.py             | 148 +++++++--
 fullwave/solver/pml_builder.py |  13 +-
 tests/test_cupy_equivalence.py | 532 +++++++++++++++++++++++++++++++++
 3 files changed, 665 insertions(+), 28 deletions(-)
 create mode 100644 tests/test_cupy_equivalence.py

diff --git a/fullwave/medium.py b/fullwave/medium.py
index d7feaa2..1d0a8d1 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -1,15 +1,17 @@
 """Medium class for Fullwave."""
 
+from __future__ import annotations
+
 import logging
 from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 import matplotlib.pyplot as plt
 import numexpr as ne
 import numpy as np
-from numpy.typing import NDArray
 
 from fullwave import Grid
 from fullwave.solver.utils import initialize_relaxation_param_dict
@@ -17,8 +19,37 @@
 from fullwave.utils.coordinates import coords_to_map, map_to_coords
 from fullwave.utils.relaxation_parameters import generate_relaxation_params
 
+if TYPE_CHECKING:
+    from types import ModuleType
+
+    from numpy.typing import NDArray
+
 logger = logging.getLogger("__main__." + __name__)
 
+_CUPY_AVAILABLE: bool | None = None
+
+
+def _check_cupy() -> bool:
+    """Return True if CuPy is importable; result is cached after the first call."""
+    global _CUPY_AVAILABLE  # noqa: PLW0603
+    if _CUPY_AVAILABLE is None:
+        try:
+            import cupy  # noqa: F401, PLC0415
+
+            _CUPY_AVAILABLE = True
+        except ImportError:
+            _CUPY_AVAILABLE = False
+    return _CUPY_AVAILABLE
+
+
+def _get_array_module(*, use_gpu: bool) -> ModuleType:
+    """Return ``cupy`` when *use_gpu* is True and CuPy is available, else ``numpy``."""
+    if use_gpu and _check_cupy():
+        import cupy  # noqa: PLC0415
+
+        return cupy
+    return np
+
 
 @dataclass
 class MediumRelaxationMaps:
@@ -47,6 +78,7 @@ def __init__(
         use_isotropic_relaxation: bool = True,
         n_jobs: int = -1,
         dtype: type = np.float64,
+        use_gpu: bool = False,
     ) -> None:
         """Medium class for Fullwave.
 
@@ -91,6 +123,9 @@ def __init__(
         dtype : type, optional
             Data type for medium arrays. Default is np.float64.
             Use np.float32 to reduce Python-side memory usage by ~50%.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_compatible_value(
@@ -98,6 +133,15 @@ def __init__(
             [2],
             "Only n_relaxation_mechanisms=2 are supported currently.",
         )
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("MediumRelaxationMaps: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "MediumRelaxationMaps: use_gpu=True but CuPy is not available. "
+                "Falling back to CPU (numpy)."
+            )
         self.n_relaxation_mechanisms = n_relaxation_mechanisms
         self.dtype = np.dtype(dtype)
         self.relaxation_param_dict = initialize_relaxation_param_dict(
@@ -169,7 +213,10 @@ def _update_relaxation_param_dict(
         # SIMD/cache-friendly pass over contiguous arrays.
         # x1 and x2 directions are independent, so run in parallel threads
         # (numpy/numexpr release the GIL during computation).
-        def _sort_by_time_const(
+        xp = self.xp
+        use_gpu = xp is not np
+
+        def _sort_by_time_const_cpu(
             d_arrays: list[NDArray[np.float64]],
             a_arrays: list[NDArray[np.float64]],
             kappa: NDArray[np.float64],  # noqa: ARG001
@@ -184,11 +231,38 @@ def _sort_by_time_const(
                     a_arrays[i] = np.where(swap, aj, ai)
                     a_arrays[j] = np.where(swap, ai, aj)
 
-        with ThreadPoolExecutor(max_workers=2) as pool:
-            fut_x1 = pool.submit(_sort_by_time_const, d_x1, a_x1, kappa_x1)
-            fut_x2 = pool.submit(_sort_by_time_const, d_x2, a_x2, kappa_x2)
-            fut_x1.result()
-            fut_x2.result()
+        def _sort_by_time_const_gpu(
+            d_arrays: list,
+            a_arrays: list,
+            kappa: NDArray[np.float64],
+        ) -> None:  # results are written back into d_arrays / a_arrays in place
+            kappa_gpu = xp.asarray(kappa)  # xp is not numpy here (only called when use_gpu)
+            d_gpu = [xp.asarray(d) for d in d_arrays]
+            a_gpu = [xp.asarray(a) for a in a_arrays]
+            for i in range(n_nu):
+                for j in range(i + 1, n_nu):  # element-wise compare-and-swap of pair (i, j)
+                    swap = d_gpu[i] / kappa_gpu + a_gpu[i] > d_gpu[j] / kappa_gpu + a_gpu[j]
+                    d_gpu[i], d_gpu[j] = (
+                        xp.where(swap, d_gpu[j], d_gpu[i]),
+                        xp.where(swap, d_gpu[i], d_gpu[j]),
+                    )
+                    a_gpu[i], a_gpu[j] = (  # a follows the same swap mask as d
+                        xp.where(swap, a_gpu[j], a_gpu[i]),
+                        xp.where(swap, a_gpu[i], a_gpu[j]),
+                    )
+            for i in range(n_nu):  # convert back with xp.asnumpy and overwrite the caller's lists
+                d_arrays[i] = xp.asnumpy(d_gpu[i])
+                a_arrays[i] = xp.asnumpy(a_gpu[i])
+
+        if use_gpu:
+            _sort_by_time_const_gpu(d_x1, a_x1, kappa_x1)
+            _sort_by_time_const_gpu(d_x2, a_x2, kappa_x2)
+        else:
+            with ThreadPoolExecutor(max_workers=2) as pool:
+                fut_x1 = pool.submit(_sort_by_time_const_cpu, d_x1, a_x1, kappa_x1)
+                fut_x2 = pool.submit(_sort_by_time_const_cpu, d_x2, a_x2, kappa_x2)
+                fut_x1.result()
+                fut_x2.result()
 
         # Write results into relaxation_param_dict
         param_dict = self.relaxation_param_dict
@@ -288,34 +362,43 @@ def n_air(self) -> int:
         """Return the number of air coordinates."""
         return self.air_coords.shape[0]
 
-    @staticmethod
     def _calc_a_and_b(
+        self,
         dx: float | NDArray[np.float64],
         kappa_x: float | NDArray[np.float64],
         alpha_x: float | NDArray[np.float64],
         dt: float | NDArray[np.float64],
         output_dtype: np.dtype | None = None,
     ) -> tuple[NDArray[np.float64], NDArray[np.float64]]:
-        dx = np.asarray(dx, dtype=np.float64)
-        kappa_x = np.asarray(kappa_x, dtype=np.float64)
-        alpha_x = np.asarray(alpha_x, dtype=np.float64)
-        dt = np.asarray(dt, dtype=np.float64)
-
-        eps = np.finfo(np.float64).eps  # noqa: F841
-
-        # b = exp(-(dx/kappa_x + alpha_x) * dt)
-        b = ne.evaluate("exp(-(dx/kappa_x + alpha_x) * dt)")
+        xp = self.xp  # array module chosen in __init__ (numpy or CuPy)
+        use_gpu = xp is not np  # any module other than numpy takes the array-expression branch
 
-        # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
-        denom = ne.evaluate("kappa_x*(dx + kappa_x*alpha_x) + eps")  # noqa: F841
+        dx = xp.asarray(dx, dtype=xp.float64)
+        kappa_x = xp.asarray(kappa_x, dtype=xp.float64)
+        alpha_x = xp.asarray(alpha_x, dtype=xp.float64)
+        dt = xp.asarray(dt, dtype=xp.float64)
 
-        # a = dx/denom*(b - 1)
-        a = ne.evaluate("dx/denom*(b - 1)")
+        eps = xp.finfo(xp.float64).eps
 
-        if output_dtype is not None and output_dtype != np.float64:
+        if use_gpu:  # same three formulas as the numexpr branch, written as array operations
+            b = xp.exp(-(dx / kappa_x + alpha_x) * dt)
+            denom = kappa_x * (dx + kappa_x * alpha_x) + eps
+            a = dx / denom * (b - 1)
+        else:
+            eps_local = eps  # noqa: F841
+            # b = exp(-(dx/kappa_x + alpha_x) * dt)
+            b = ne.evaluate("exp(-(dx/kappa_x + alpha_x) * dt)")
+            # denom = kappa_x*(dx + kappa_x*alpha_x) + eps
+            denom = ne.evaluate("kappa_x*(dx + kappa_x*alpha_x) + eps_local")
+            # a = dx/denom*(b - 1)
+            a = ne.evaluate("dx/denom*(b - 1)")
+
+        if output_dtype is not None and output_dtype != xp.float64:
             a = a.astype(output_dtype, copy=False)
             b = b.astype(output_dtype, copy=False)
 
+        if use_gpu:  # convert with xp.asnumpy so both branches return NumPy arrays
+            return xp.asnumpy(a), xp.asnumpy(b)
         return a, b
 
     @staticmethod
@@ -582,7 +665,7 @@ def __repr__(self) -> str:
         """
         return str(self)
 
-    def build(self) -> "MediumRelaxationMaps":
+    def build(self) -> MediumRelaxationMaps:
         """Build the MediumRelaxationMaps instance.
 
         It returns self for compatibility with Solver class.
@@ -859,6 +942,7 @@ def __init__(
         use_isotropic_relaxation: bool = True,
         n_jobs: int = -1,
         dtype: type = np.float64,
+        use_gpu: bool = False,
     ) -> None:
         """Medium class for Fullwave.
 
@@ -913,6 +997,9 @@ def __init__(
             Use np.float32 to reduce Python-side memory usage by ~50%.
             The CUDA solver reads all data as float32, so float32 storage
             avoids redundant conversion copies.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_compatible_value(
@@ -922,6 +1009,14 @@ def __init__(
         )
         check_functions.check_instance(grid, Grid)
         check_functions.check_path_exists(path_relaxation_parameters_database)
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("Medium: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "Medium: use_gpu=True but CuPy is not available. Falling back to CPU (numpy)."
+            )
         self.grid = grid
         self.is_3d = grid.is_3d
         self.dtype = np.dtype(dtype)
@@ -1187,6 +1282,7 @@ def build(self) -> MediumRelaxationMaps:
             use_isotropic_relaxation=self.use_isotropic_relaxation,
             n_jobs=self.n_jobs,
             dtype=self.dtype,
+            use_gpu=self.use_gpu,
         )
 
     def _db_mhz_cm_to_a_exp(
@@ -1213,7 +1309,11 @@ def _db_mhz_cm_to_a_exp(
         f0 = self.grid.omega / (2.0 * np.pi * 1e6)  # scalar
         att_factor_dt = -self.grid.dt * 0.5 * f0 * self.grid.c0 / (1e-2 * np_factor)
 
-        # numexpr: exp(att_factor_dt * alpha_coeff)
+        xp = self.xp
+        if xp is not np:
+            alpha_gpu = xp.asarray(alpha_coeff)
+            result = xp.exp(att_factor_dt * alpha_gpu)
+            return xp.asnumpy(result)
         return ne.evaluate("exp(att * a)", local_dict={"a": alpha_coeff, "att": att_factor_dt})
 
     def build_exponential(self) -> MediumExponentialAttenuation:
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index 13773de..f3c0650 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -343,6 +343,7 @@ def __init__(  # noqa: PLR0912
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
                 n_jobs=self.medium_org.n_jobs,
                 dtype=getattr(self.medium_org, "dtype", np.float64),
+                use_gpu=self.use_gpu,
             )
         else:
             if self.xp is not np:
@@ -393,6 +394,7 @@ def __init__(  # noqa: PLR0912
                 attenuation_builder=self.medium_org.attenuation_builder,
                 n_jobs=self.medium_org.n_jobs,
                 dtype=getattr(self.medium_org, "dtype", np.float64),
+                use_gpu=self.use_gpu,
             )
         logger.debug("building extended medium for pml...done")
 
@@ -1751,6 +1753,7 @@ def __init__(
             path_relaxation_parameters_database=self.medium_org.path_relaxation_parameters_database,
             attenuation_builder=self.medium_org.attenuation_builder,
             dtype=getattr(self.medium_org, "dtype", np.float64),
+            use_gpu=self.use_gpu,
         )
         logger.debug("Extended medium for PML built successfully.")
 
@@ -1906,7 +1909,7 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
             mmax = float(xp.sqrt(mask_sq.max()))
             if mmax > 0.0:
                 mask_sq = mask_sq / (mmax * mmax)
-            result = 1 - xp.sqrt(mask_sq)
+            result = xp.maximum(1 - xp.sqrt(mask_sq), 0)
             return xp.asnumpy(result)
 
         rx_np = rx  # noqa: F841
@@ -1915,7 +1918,8 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
         mmax = float(np.sqrt(mask_sq.max()))
         if mmax > 0.0:
             mask_sq = ne.evaluate("mask_sq / (mmax*mmax)")
-        return ne.evaluate("1 - sqrt(mask_sq)")
+        result = ne.evaluate("1 - sqrt(mask_sq)")
+        return np.maximum(result, 0)
 
     def _mask_body_3d(self, nx: int, ny: int, nz: int, n_body: int) -> NDArray[np.float32]:
         """Create a mask for the PML region.
@@ -1957,7 +1961,7 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
             mmax = float(xp.sqrt(mask_sq.max()))
             if mmax > 0.0:
                 mask_sq = mask_sq / (mmax * mmax)
-            result = 1 - xp.sqrt(mask_sq)
+            result = xp.maximum(1 - xp.sqrt(mask_sq), 0)
             return xp.asnumpy(result)
 
         rx_np = rx  # noqa: F841
@@ -1973,7 +1977,8 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
             mask_sq = ne.evaluate("mask_sq / (mmax*mmax)")
 
         # 4) final sqrt elementwise
-        return ne.evaluate("1 - sqrt(mask_sq)")
+        result = ne.evaluate("1 - sqrt(mask_sq)")
+        return np.maximum(result, 0)
 
     def _apply_pml_3d(
         self,
diff --git a/tests/test_cupy_equivalence.py b/tests/test_cupy_equivalence.py
new file mode 100644
index 0000000..f2581fb
--- /dev/null
+++ b/tests/test_cupy_equivalence.py
@@ -0,0 +1,532 @@
+"""Tests verifying that CuPy (GPU) and NumPy (CPU) paths produce identical results.
+
+Every test in this module is automatically skipped when CuPy is not installed
+or when no CUDA device is available.
+"""
+
+import numpy as np
+import pytest
+
+import fullwave.medium as medium_module
+from fullwave.medium import Medium, MediumRelaxationMaps
+from fullwave.solver.pml_builder import PMLBuilder, PMLBuilderExponentialAttenuation
+from fullwave.solver.utils import initialize_relaxation_param_dict
+
+# ---------------------------------------------------------------------------
+# Skip the entire module when CuPy / CUDA is unavailable
+# ---------------------------------------------------------------------------
+try:
+    import cupy as cp
+
+    cp.cuda.runtime.getDeviceCount()  # raises if no device
+    _CUPY_AVAILABLE = True
+except Exception:  # any failure (missing CuPy, failed device query) marks GPU as unavailable
+    _CUPY_AVAILABLE = False
+
+pytestmark = pytest.mark.skipif(not _CUPY_AVAILABLE, reason="CuPy or CUDA device not available")
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+class DummyGrid2D:  # lightweight 2D stand-in that only stores grid attributes
+    def __init__(self, nx, ny, dt=1e-8, f0=1e6, c0=1540.0, ppw=12, cfl=0.4):
+        self.nx = nx
+        self.ny = ny
+        self.nz = 1  # fixed to 1 for the 2D stub
+        self.dx = c0 / (f0 * ppw)  # spacing derived from c0, f0 and ppw; reused for dy and dz
+        self.dy = self.dx
+        self.dz = self.dx
+        self.dt = dt
+        self.f0 = f0
+        self.c0 = c0
+        self.ppw = ppw
+        self.cfl = cfl
+        self.duration = dt * 100  # hard-coded to 100 * dt
+        self.omega = 2.0 * np.pi * f0  # angular frequency for f0
+        self.is_3d = False
+
+
+class DummyGrid3D:  # lightweight 3D stand-in that only stores grid attributes
+    def __init__(self, nx, ny, nz, dt=1e-8, f0=1e6, c0=1540.0, ppw=12, cfl=0.4):
+        self.nx = nx
+        self.ny = ny
+        self.nz = nz
+        self.dx = c0 / (f0 * ppw)  # spacing derived from c0, f0 and ppw; reused for dy and dz
+        self.dy = self.dx
+        self.dz = self.dx
+        self.dt = dt
+        self.f0 = f0
+        self.c0 = c0
+        self.ppw = ppw
+        self.cfl = cfl
+        self.duration = dt * 100  # hard-coded to 100 * dt
+        self.omega = 2.0 * np.pi * f0  # angular frequency for f0
+        self.is_3d = True
+
+
+def _dummy_check_functions():
+    """Build a stand-in for ``check_functions`` whose three checks are no-ops."""
+
+    def _noop(*_args):
+        return None
+
+    # Same attribute names as before; every check shares one callable that swallows any
+    # positional arguments (including the bound instance) and returns None.
+    hook_names = ("check_instance", "check_path_exists", "check_compatible_value")
+    return type("dummy", (), dict.fromkeys(hook_names, _noop))()
+
+
+def _get_relaxation_dict(shape, n_relaxation_mechanisms=2):
+    draw = np.random.default_rng(42).uniform  # fixed seed: reproducible draws
+    template = initialize_relaxation_param_dict(n_relaxation_mechanisms=n_relaxation_mechanisms)
+    return {name: draw(0.5, 2.0, size=shape) for name in template}
+
+
+# ---------------------------------------------------------------------------
+# Medium tests
+# ---------------------------------------------------------------------------
+class TestMediumCupyEquivalence:
+    """Check that ``Medium._db_mhz_cm_to_a_exp`` agrees between the CPU and GPU backends."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):  # replace medium's check_functions with no-op stubs
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_medium_pair(self, grid_shape, grid):  # returns (cpu, gpu) built from equal inputs
+        rng = np.random.default_rng(123)
+        sound_speed = rng.uniform(1400, 1600, grid_shape)
+        density = rng.uniform(900, 1100, grid_shape)
+        alpha_coeff = rng.uniform(0.1, 1.0, grid_shape)
+        alpha_power = rng.uniform(1.0, 2.0, grid_shape)
+        beta = rng.uniform(0.5, 1.5, grid_shape)
+
+        cpu = Medium(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_coeff.copy(),
+            alpha_power.copy(),
+            beta.copy(),
+            use_gpu=False,
+        )
+        gpu = Medium(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_coeff.copy(),
+            alpha_power.copy(),
+            beta.copy(),
+            use_gpu=True,
+        )
+        return cpu, gpu
+
+    def test_db_mhz_cm_to_a_exp_2d(self):  # 32 x 32 grid
+        shape = (32, 32)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_medium_pair(shape, grid)
+
+        cpu_result = cpu._db_mhz_cm_to_a_exp(cpu.alpha_coeff)
+        gpu_result = gpu._db_mhz_cm_to_a_exp(gpu.alpha_coeff)
+
+        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-12)
+
+    def test_db_mhz_cm_to_a_exp_3d(self):  # 16 x 16 x 16 grid
+        shape = (16, 16, 16)
+        grid = DummyGrid3D(nx=shape[0], ny=shape[1], nz=shape[2])
+        cpu, gpu = self._make_medium_pair(shape, grid)
+
+        cpu_result = cpu._db_mhz_cm_to_a_exp(cpu.alpha_coeff)
+        gpu_result = gpu._db_mhz_cm_to_a_exp(gpu.alpha_coeff)
+
+        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-12)
+
+
+class TestMediumRelaxationMapsCupyEquivalence:
+    """Compare CPU vs GPU relaxation dictionaries and ``_calc_a_and_b`` of MediumRelaxationMaps."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):  # replace medium's check_functions with no-op stubs
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_pair(self, grid_shape, grid):  # returns (cpu, gpu) built from equal inputs
+        rng = np.random.default_rng(456)
+        sound_speed = rng.uniform(1400, 1600, grid_shape)
+        density = rng.uniform(900, 1100, grid_shape)
+        beta = rng.uniform(0.5, 1.5, grid_shape)
+        relax = _get_relaxation_dict(grid_shape)
+
+        cpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=False,
+        )
+        gpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=True,
+        )
+        return cpu, gpu
+
+    def test_relaxation_param_dict_2d(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        for key in cpu.relaxation_param_dict:  # compared key by key against the CPU dict
+            np.testing.assert_allclose(
+                gpu.relaxation_param_dict[key],
+                cpu.relaxation_param_dict[key],
+                rtol=1e-10,
+                err_msg=f"relaxation_param_dict[{key}] mismatch",
+            )
+
+    def test_relaxation_param_dict_for_fw2_2d(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        for key in cpu.relaxation_param_dict_for_fw2:  # compared key by key against the CPU dict
+            np.testing.assert_allclose(
+                gpu.relaxation_param_dict_for_fw2[key],
+                cpu.relaxation_param_dict_for_fw2[key],
+                rtol=1e-10,
+                err_msg=f"relaxation_param_dict_for_fw2[{key}] mismatch",
+            )
+
+    def test_calc_a_and_b(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        rng = np.random.default_rng(789)
+        dx = rng.uniform(0.01, 0.5, shape)
+        kappa = rng.uniform(0.5, 3.0, shape)
+        alpha = rng.uniform(0.01, 0.5, shape)
+        dt = 1e-8
+
+        a_cpu, b_cpu = cpu._calc_a_and_b(dx, kappa, alpha, dt)  # same inputs for both backends
+        a_gpu, b_gpu = gpu._calc_a_and_b(dx, kappa, alpha, dt)
+
+        np.testing.assert_allclose(a_gpu, a_cpu, rtol=1e-12)
+        np.testing.assert_allclose(b_gpu, b_cpu, rtol=1e-12)
+
+
+# ---------------------------------------------------------------------------
+# PMLBuilder tests
+# ---------------------------------------------------------------------------
+class TestPMLBuilderCupyEquivalence:
+    """Compare CPU vs GPU for PMLBuilder helpers and the extended medium it builds."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):  # replace medium's check_functions with no-op stubs
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_pml_pair(self, grid, medium, source, sensor, **kwargs):  # differ only in use_gpu
+        cpu = PMLBuilder(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+            **kwargs,
+        )
+        gpu = PMLBuilder(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+            **kwargs,
+        )
+        return cpu, gpu
+
+    @pytest.fixture()
+    def setup_2d(self):  # random 2D medium, source and sensor on a real fullwave.Grid
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(0.01, 0.01),
+            f0=1e6,
+            duration=1e-6,
+            c0=1540.0,
+            ppw=12,
+            cfl=0.4,
+        )
+        rng = np.random.default_rng(111)
+        shape = (grid.nx, grid.ny)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        alpha_coeff = rng.uniform(0.1, 1.0, shape)
+        alpha_power = rng.uniform(1.0, 2.0, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+
+        medium = fullwave.Medium(
+            grid=grid,
+            sound_speed=sound_speed,
+            density=density,
+            alpha_coeff=alpha_coeff,
+            alpha_power=alpha_power,
+            beta=beta,
+        )
+        src_coords = np.array([[grid.nx // 2, y] for y in range(grid.ny)])
+        source = fullwave.Source(
+            p0=np.ones((src_coords.shape[0], grid.nt)),
+            coords=src_coords,
+            grid_shape=shape,
+        )
+        sen_coords = np.array([[grid.nx // 2 + 5, y] for y in range(grid.ny)])
+        sensor = fullwave.Sensor(
+            coords=sen_coords,
+            grid_shape=shape,
+        )
+        return grid, medium, source, sensor
+
+    def test_extend_map_for_pml_2d_fill_edge(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        rng = np.random.default_rng(222)
+        arr = rng.uniform(0.0, 100.0, (grid.nx, grid.ny))
+
+        cpu_result = cpu._extend_map_for_pml(arr.copy(), fill_edge=True)
+        gpu_result = gpu._extend_map_for_pml(arr.copy(), fill_edge=True)
+
+        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-14)
+
+    def test_extend_map_for_pml_2d_zero_fill(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        rng = np.random.default_rng(333)
+        arr = rng.uniform(0.0, 100.0, (grid.nx, grid.ny))
+
+        cpu_result = cpu._extend_map_for_pml(arr.copy(), fill_edge=False)
+        gpu_result = gpu._extend_map_for_pml(arr.copy(), fill_edge=False)
+
+        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-14)
+
+    def test_apply_transition_and_pml_2d(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        ext_shape = cpu.extended_medium.sound_speed.shape
+        rng = np.random.default_rng(444)
+        arr = rng.uniform(0.1, 5.0, ext_shape)
+
+        for axis in [0, 1]:  # every axis / transition_type combination is compared
+            for transition_type in ["smooth", "linear", "cosine", "polynomial"]:
+                cpu_result = cpu._apply_transition_and_pml(
+                    arr.copy(),
+                    value_target=0.0,
+                    array_shape=ext_shape,
+                    axis=axis,
+                    transition_type=transition_type,
+                    is_3d=False,
+                )
+                gpu_result = gpu._apply_transition_and_pml(
+                    arr.copy(),
+                    value_target=0.0,
+                    array_shape=ext_shape,
+                    axis=axis,
+                    transition_type=transition_type,
+                    is_3d=False,
+                )
+                np.testing.assert_allclose(
+                    gpu_result,
+                    cpu_result,
+                    rtol=1e-12,
+                    err_msg=f"axis={axis}, transition_type={transition_type}",
+                )
+
+    def test_calc_a_and_b(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        rng = np.random.default_rng(555)
+        shape = cpu.extended_medium.sound_speed.shape
+        dx = rng.uniform(0.01, 0.5, shape)
+        kappa = rng.uniform(0.5, 3.0, shape)
+        alpha = rng.uniform(0.01, 0.5, shape)
+        dt = grid.dt
+
+        a_cpu, b_cpu = cpu._calc_a_and_b(dx, kappa, alpha, dt)  # same inputs for both backends
+        a_gpu, b_gpu = gpu._calc_a_and_b(dx, kappa, alpha, dt)
+
+        np.testing.assert_allclose(a_gpu, a_cpu, rtol=1e-12)
+        np.testing.assert_allclose(b_gpu, b_cpu, rtol=1e-12)
+
+    def test_extended_medium_identical(self, setup_2d):
+        """The extended medium (after __init__) should be identical CPU vs GPU."""
+        grid, medium, source, sensor = setup_2d
+        cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
+
+        np.testing.assert_allclose(
+            gpu.extended_medium.sound_speed,
+            cpu.extended_medium.sound_speed,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            gpu.extended_medium.density,
+            cpu.extended_medium.density,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            gpu.extended_medium.alpha_coeff,
+            cpu.extended_medium.alpha_coeff,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            gpu.extended_medium.alpha_power,
+            cpu.extended_medium.alpha_power,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            gpu.extended_medium.beta,
+            cpu.extended_medium.beta,
+            rtol=1e-14,
+        )
+
+
+class TestPMLBuilderExponentialAttenuationCupyEquivalence:
+    """Compare CPU vs GPU for PMLBuilderExponentialAttenuation (mask, extended medium, run)."""
+
+    @pytest.fixture()
+    def setup_2d(self):  # random 2D medium, source and sensor on a real fullwave.Grid
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(0.01, 0.01),
+            f0=1e6,
+            duration=1e-6,
+            c0=1540.0,
+            ppw=12,
+            cfl=0.4,
+        )
+        rng = np.random.default_rng(666)
+        shape = (grid.nx, grid.ny)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        alpha_coeff = rng.uniform(0.1, 1.0, shape)
+        alpha_power = rng.uniform(1.0, 2.0, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+
+        medium = fullwave.Medium(
+            grid=grid,
+            sound_speed=sound_speed,
+            density=density,
+            alpha_coeff=alpha_coeff,
+            alpha_power=alpha_power,
+            beta=beta,
+        )
+        src_coords = np.array([[grid.nx // 2, y] for y in range(grid.ny)])
+        source = fullwave.Source(
+            p0=np.ones((src_coords.shape[0], grid.nt)),
+            coords=src_coords,
+            grid_shape=shape,
+        )
+        sen_coords = np.array([[grid.nx // 2 + 5, y] for y in range(grid.ny)])
+        sensor = fullwave.Sensor(
+            coords=sen_coords,
+            grid_shape=shape,
+        )
+        return grid, medium, source, sensor
+
+    def test_mask_body_2d(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+        )
+        gpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+        )
+
+        nx, ny = cpu.extended_medium.sound_speed.shape[:2]
+        cpu_mask = cpu._mask_body_2d(nx, ny, cpu.num_boundary_points)
+        gpu_mask = gpu._mask_body_2d(nx, ny, gpu.num_boundary_points)
+
+        np.testing.assert_allclose(gpu_mask, cpu_mask, rtol=1e-5, atol=1e-7)
+
+    def test_extended_medium_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+        )
+        gpu = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+        )
+
+        np.testing.assert_allclose(
+            gpu.extended_medium.sound_speed,
+            cpu.extended_medium.sound_speed,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            gpu.extended_medium.density,
+            cpu.extended_medium.density,
+            rtol=1e-14,
+        )
+
+    def test_run_identical(self, setup_2d):  # compares the outputs of run(use_pml=True)
+        grid, medium, source, sensor = setup_2d
+        cpu_builder = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=False,
+        )
+        gpu_builder = PMLBuilderExponentialAttenuation(
+            grid,
+            medium,
+            source,
+            sensor,
+            use_gpu=True,
+        )
+
+        cpu_result = cpu_builder.run(use_pml=True)
+        gpu_result = gpu_builder.run(use_pml=True)
+
+        np.testing.assert_allclose(
+            gpu_result.sound_speed,
+            cpu_result.sound_speed,
+            rtol=1e-12,
+        )
+        np.testing.assert_allclose(
+            gpu_result.density,
+            cpu_result.density,
+            rtol=1e-12,
+        )
+        np.testing.assert_allclose(  # alpha_exp uses a looser tolerance than the other fields
+            gpu_result.alpha_exp,
+            cpu_result.alpha_exp,
+            rtol=1e-5,
+            atol=1e-7,
+        )
+        np.testing.assert_allclose(
+            gpu_result.beta,
+            cpu_result.beta,
+            rtol=1e-12,
+        )

From ad26b9bda0939ac9291f5a24b878f3d4fa341149 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 12:50:28 -0500
Subject: [PATCH 20/31] Add GPU support for Medium and PMLBuilder classes using
 CuPy

---
 fullwave/medium.py                   |  32 ++++
 fullwave/solver/input_file_writer.py |  65 ++++++-
 fullwave/solver/pml_builder.py       |  91 ++++++---
 fullwave/solver/solver.py            |   2 +
 tests/test_cupy_equivalence.py       | 273 ++++++++++++++++++++++++++-
 5 files changed, 423 insertions(+), 40 deletions(-)

diff --git a/fullwave/medium.py b/fullwave/medium.py
index 1d0a8d1..81e3172 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -332,6 +332,12 @@ def check_relaxation_param_dict(
     @property
     def bulk_modulus(self) -> NDArray[np.float64]:
         """Return the bulk_modulus."""
+        xp = getattr(self, "xp", np)  # default to numpy if ``xp`` was never set on the instance
+        if xp is not np:
+            c_gpu = xp.asarray(self.sound_speed)
+            rho_gpu = xp.asarray(self.density)
+            result = c_gpu * c_gpu * rho_gpu  # same product as sound_speed**2 * density below
+            return xp.asnumpy(result)  # converted so both branches return a NumPy array
         return np.multiply(self.sound_speed**2, self.density)
 
     @property
@@ -706,6 +712,7 @@ def __init__(
         air_map: NDArray[np.int64] | None = None,
         air_coords: NDArray[np.int64] | None = None,
         dtype: type = np.float64,
+        use_gpu: bool = False,
     ) -> None:
         """Medium class for Fullwave.
 
@@ -736,9 +743,21 @@ def __init__(
         dtype : type, optional
             Data type for medium arrays. Default is np.float64.
             Use np.float32 to reduce Python-side memory usage by ~50%.
+        use_gpu : bool, optional
+            If True, use CuPy for GPU-accelerated computation (default is False).
+            Requires CuPy to be installed. Falls back to CPU if CuPy is unavailable.
 
         """
         check_functions.check_instance(grid, Grid)
+        self.use_gpu = use_gpu
+        self.xp: ModuleType = _get_array_module(use_gpu=use_gpu)
+        if self.xp is not np:
+            logger.info("MediumExponentialAttenuation: using CuPy GPU backend")
+        elif use_gpu:
+            logger.warning(
+                "MediumExponentialAttenuation: use_gpu=True but CuPy is not available. "
+                "Falling back to CPU (numpy)."
+            )
         self.grid = grid
         self.is_3d = grid.is_3d
         self.dtype = np.dtype(dtype)
@@ -806,6 +825,12 @@ def air_map(self) -> NDArray[np.int64]:
     @property
     def bulk_modulus(self) -> NDArray[np.float64]:
         """Return the bulk_modulus."""
+        xp = getattr(self, "xp", np)  # numpy is used when the instance has no ``xp`` attribute
+        if xp is not np:
+            c_gpu = xp.asarray(self.sound_speed)
+            rho_gpu = xp.asarray(self.density)
+            result = c_gpu * c_gpu * rho_gpu  # same product as sound_speed**2 * density below
+            return xp.asnumpy(result)  # converted so both branches return a NumPy array
         return np.multiply(self.sound_speed**2, self.density)
 
     @property
@@ -1104,6 +1129,12 @@ def air_map(self) -> NDArray[np.int64]:
     @property
     def bulk_modulus(self) -> NDArray[np.float64]:
         """Return the bulk_modulus."""
+        xp = getattr(self, "xp", np)  # numpy is used when the instance has no ``xp`` attribute
+        if xp is not np:
+            c_gpu = xp.asarray(self.sound_speed)
+            rho_gpu = xp.asarray(self.density)
+            result = c_gpu * c_gpu * rho_gpu  # same product as sound_speed**2 * density below
+            return xp.asnumpy(result)  # converted so both branches return a NumPy array
         return np.multiply(self.sound_speed**2, self.density)
 
     @property
@@ -1337,6 +1368,7 @@ def build_exponential(self) -> MediumExponentialAttenuation:
             beta=self.beta,
             air_coords=self.air_coords,
             dtype=self.dtype,
+            use_gpu=self.use_gpu,
         )
 
     def print_info(self) -> None:
diff --git a/fullwave/solver/input_file_writer.py b/fullwave/solver/input_file_writer.py
index ccbdb14..bee8459 100644
--- a/fullwave/solver/input_file_writer.py
+++ b/fullwave/solver/input_file_writer.py
@@ -37,6 +37,7 @@ def __init__(
         use_isotropic_relaxation: bool = False,
         release_after_write: bool = False,
         pml_thickness: int = 0,
+        use_gpu: bool = False,
     ) -> None:
         """Initialize the InputGeneratorBase instance.
 
@@ -86,6 +87,7 @@ def __init__(
         self._work_dir = Path(work_dir)
         self.path_fullwave_simulation_bin = path_fullwave_simulation_bin
         self.use_isotropic_relaxation = use_isotropic_relaxation
+        self.use_gpu = use_gpu
 
         if validate_input:
             check_functions.check_path_exists(self.path_fullwave_simulation_bin)
@@ -106,13 +108,41 @@ def __init__(
         self.release_after_write = release_after_write
         self.pml_thickness = pml_thickness
 
-        self._dim = int(
-            np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
-        )
+        if self.use_gpu:
+            try:
+                import cupy as cp  # noqa: PLC0415
+
+                c_gpu = cp.asarray(self.medium.sound_speed, dtype=cp.float64)
+                c_min_val = float(c_gpu.min())
+                c_max_val = float(c_gpu.max())
+                self._dim = int(cp.rint(cp.float64(c_max_val)) - cp.rint(cp.float64(c_min_val)))
+
+                # Compute dc_map in the same GPU pass (avoids a second H2D transfer)
+                c_min_rounded = float(matlab_round(c_min_val))
+                offset = -c_min_rounded + 1
+                c_gpu += 1e-9
+                cp.rint(c_gpu, out=c_gpu)
+                c_gpu += offset
+                self._dc_map = cp.asnumpy(c_gpu.astype(cp.int32))
+                logger.debug("dc map for stencil coefficients set (GPU, fused).")
+                self._dc_map_ready = True
+            except ImportError:
+                self._dim = int(
+                    np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
+                )
+                c_min_val = float(self.medium.sound_speed.min())
+                self._dc_map_ready = False
+        else:
+            self._dim = int(
+                np.rint(self.medium.sound_speed.max()) - np.rint(self.medium.sound_speed.min()),
+            )
+            c_min_val = float(self.medium.sound_speed.min())
+            self._dc_map_ready = False
 
         self._set_d_mat()
-        self._set_d_map(self._dim, self.medium.sound_speed)
-        self._set_dc_map(self.medium.sound_speed)
+        self._set_d_map(self._dim, self.medium.sound_speed, c_min=c_min_val)
+        if not self._dc_map_ready:
+            self._set_dc_map(self.medium.sound_speed)
         logger.debug("InputFileWriter instance created.")
 
     def run(
@@ -468,10 +498,10 @@ def _horner7(
         # (((((((a7*x + a6)*x + a5)*x + a4)*x + a3)*x + a2)*x + a1)*x + a0)
         return ((((((a7 * x + a6) * x + a5) * x + a4) * x + a3) * x + a2) * x + a1) * x + a0
 
-    def _set_d_map(self, dim: int, c_map: np.ndarray) -> None:
+    def _set_d_map(self, dim: int, c_map: np.ndarray, *, c_min: float | None = None) -> None:
         self._d_map = np.zeros((9, 2, dim + 1), dtype=np.float64)
 
-        cmin = float(np.min(c_map))  # compute once (was inside loop)
+        cmin = c_min if c_min is not None else float(np.min(c_map))
         scale = self.grid.dt / self.grid.dx  # compute once
         i = np.arange(dim + 1, dtype=np.float64)
         r = (i + cmin) * scale
@@ -682,10 +712,27 @@ def _set_dc_map(self, c_map: np.ndarray) -> None:
 
         For large 3-D maps the naive approach allocates a full float64 copy
         and makes several sequential passes.
-        This version processes the first axis in chunks using a thread pool so
-        that (a) peak memory stays bounded and (b) multiple cores share the work.
+        GPU path: copies the map to the device once and rounds/offsets it there with CuPy.
+        CPU path: processes the first axis in chunks using a thread pool.
         """
         logger.debug("Setting dc map for stencil coefficients.")
+
+        if self.use_gpu:
+            try:
+                import cupy as cp  # noqa: PLC0415
+
+                c_gpu = cp.asarray(c_map, dtype=cp.float64)
+                c_min_rounded = float(matlab_round(float(c_gpu.min())))
+                offset = -c_min_rounded + 1
+                c_gpu += 1e-9
+                cp.rint(c_gpu, out=c_gpu)
+                c_gpu += offset
+                self._dc_map = cp.asnumpy(c_gpu.astype(cp.int32))
+                logger.debug("dc map for stencil coefficients set (GPU).")
+                return
+            except ImportError:
+                pass
+
         c_min_rounded = matlab_round(c_map.min())
         offset = float(-c_min_rounded + 1)
 
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index f3c0650..fe401d6 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -289,8 +289,6 @@ def __init__(  # noqa: PLR0912
                 extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
                 extended_density = self._extend_map_for_pml(self.medium_org.density)
                 extended_beta = self._extend_map_for_pml(self.medium_org.beta)
-                extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
-                extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
                 extended_relaxation_param_dict = {
                     key: self._extend_map_for_pml(value)
                     for key, value in self.medium_org.relaxation_param_dict.items()
@@ -309,14 +307,6 @@ def __init__(  # noqa: PLR0912
                         self._extend_map_for_pml,
                         self.medium_org.beta,
                     )
-                    future_alpha_coeff = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.alpha_coeff,
-                    )
-                    future_alpha_power = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.alpha_power,
-                    )
                     future_relaxation_param_dict = {
                         key: executor.submit(self._extend_map_for_pml, value)
                         for key, value in self.medium_org.relaxation_param_dict.items()
@@ -325,8 +315,6 @@ def __init__(  # noqa: PLR0912
                     extended_sound_speed = future_sound_speed.result()
                     extended_density = future_density.result()
                     extended_beta = future_beta.result()
-                    extended_alpha_coeff = future_alpha_coeff.result()
-                    extended_alpha_power = future_alpha_power.result()
                     extended_relaxation_param_dict = {
                         key: future.result() for key, future in future_relaxation_param_dict.items()
                     }
@@ -336,8 +324,6 @@ def __init__(  # noqa: PLR0912
                 sound_speed=extended_sound_speed,
                 density=extended_density,
                 beta=extended_beta,
-                alpha_coeff=extended_alpha_coeff,
-                alpha_power=extended_alpha_power,
                 relaxation_param_dict=extended_relaxation_param_dict,
                 air_coords=self.medium_org.air_coords + self.num_boundary_points,
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
@@ -939,10 +925,10 @@ def _compute_one(
 
         items = list(rename_dict.items())
 
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(
-                    _compute_one,
+        if self.xp is not np:
+            # GPU path: run sequentially to avoid CuPy multi-thread CUDA context issues
+            results = [
+                _compute_one(
                     key_fw2,
                     key_py,
                     relaxation_param_dict,
@@ -956,7 +942,25 @@ def _compute_one(
                 )
                 for key_fw2, key_py in items
             ]
-            results = [f.result() for f in futures]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [
+                    executor.submit(
+                        _compute_one,
+                        key_fw2,
+                        key_py,
+                        relaxation_param_dict,
+                        alpha_target_higher_nu,
+                        d_target_higher_nu,
+                        alpha_target_pml,
+                        d_target_pml,
+                        n_polynomial,
+                        self.is_3d,
+                        self._apply_transition_and_pml,
+                    )
+                    for key_fw2, key_py in items
+                ]
+                results = [f.result() for f in futures]
         out_dict = dict(results)
 
         logger.debug("Calculating PML a and b coefficients...")
@@ -984,9 +988,13 @@ def _worker(
             # Return keys + values so parent can update dict safely
             return (f"a_pml_{axis}{nu}", a, f"b_pml_{axis}{nu}", b)
 
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
-            results = [f.result() for f in futures]
+        if self.xp is not np:
+            # GPU path: run sequentially
+            results = [_worker(nu, axis) for nu, axis in tasks]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
+                results = [f.result() for f in futures]
 
         for a_key, a_val, b_key, b_val in results:
             out_dict[a_key] = a_val
@@ -1223,10 +1231,11 @@ def _compute_one(
             raise ValueError(error_msg)
 
         items = list(rename_dict.items())
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [
-                executor.submit(
-                    _compute_one,
+
+        if self.xp is not np:
+            # GPU path: run sequentially to avoid CuPy multi-thread CUDA context issues
+            results = [
+                _compute_one(
                     key_fw2,
                     key_py,
                     relaxation_param_dict,
@@ -1240,7 +1249,25 @@ def _compute_one(
                 )
                 for key_fw2, key_py in items
             ]
-            results = [f.result() for f in futures]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [
+                    executor.submit(
+                        _compute_one,
+                        key_fw2,
+                        key_py,
+                        relaxation_param_dict,
+                        alpha_target_higher_nu,
+                        d_target_higher_nu,
+                        alpha_target_pml,
+                        d_target_pml,
+                        n_polynomial,
+                        self.is_3d,
+                        self._apply_transition_and_pml,
+                    )
+                    for key_fw2, key_py in items
+                ]
+                results = [f.result() for f in futures]
         out_dict = dict(results)
 
         logger.debug("Calculating PML a and b coefficients...")
@@ -1269,9 +1296,13 @@ def _worker(
             # Return keys + values so parent can update dict safely
             return (f"a_pml_{axis}{nu}", a, f"b_pml_{axis}{nu}", b)
 
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
-            results = [f.result() for f in futures]
+        if self.xp is not np:
+            # GPU path: run sequentially
+            results = [_worker(nu, axis) for nu, axis in tasks]
+        else:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = [executor.submit(_worker, nu, axis) for nu, axis in tasks]
+                results = [f.result() for f in futures]
 
         for a_key, a_val, b_key, b_val in results:
             out_dict[a_key] = a_val
diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 2410c86..a6f32f4 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -560,6 +560,7 @@ def __init__(  # noqa: PLR0912
 
         self.path_fullwave_simulation_bin = path_fullwave_simulation_bin
         self.cuda_device_id = cuda_device_id
+        self.use_gpu_pml = use_gpu_pml
 
         self.fullwave_launcher = Launcher(
             path_fullwave_simulation_bin,
@@ -943,6 +944,7 @@ def run(
             use_isotropic_relaxation=self.use_isotropic_relaxation,
             release_after_write=release_after_write,
             pml_thickness=pml_thickness,
+            use_gpu=self.use_gpu_pml,
         )
         simulation_dir = input_file_writer.run(
             simulation_dir_name,
diff --git a/tests/test_cupy_equivalence.py b/tests/test_cupy_equivalence.py
index f2581fb..3b09d37 100644
--- a/tests/test_cupy_equivalence.py
+++ b/tests/test_cupy_equivalence.py
@@ -8,7 +8,8 @@
 import pytest
 
 import fullwave.medium as medium_module
-from fullwave.medium import Medium, MediumRelaxationMaps
+from fullwave.medium import Medium, MediumExponentialAttenuation, MediumRelaxationMaps
+from fullwave.solver.input_file_writer import InputFileWriter
 from fullwave.solver.pml_builder import PMLBuilder, PMLBuilderExponentialAttenuation
 from fullwave.solver.utils import initialize_relaxation_param_dict
 
@@ -530,3 +531,273 @@ def test_run_identical(self, setup_2d):
             cpu_result.beta,
             rtol=1e-12,
         )
+
+
+class TestMediumExponentialAttenuationCupyEquivalence:
+    """Compare CPU vs GPU for MediumExponentialAttenuation."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def _make_pair(self, grid_shape, grid):
+        rng = np.random.default_rng(999)
+        sound_speed = rng.uniform(1400, 1600, grid_shape)
+        density = rng.uniform(900, 1100, grid_shape)
+        alpha_exp = rng.uniform(0.9, 1.0, grid_shape)
+        beta = rng.uniform(0.5, 1.5, grid_shape)
+
+        cpu = MediumExponentialAttenuation(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_exp.copy(),
+            beta.copy(),
+            use_gpu=False,
+        )
+        gpu = MediumExponentialAttenuation(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            alpha_exp.copy(),
+            beta.copy(),
+            use_gpu=True,
+        )
+        return cpu, gpu
+
+    def test_bulk_modulus_2d(self):
+        shape = (32, 32)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        np.testing.assert_allclose(gpu.bulk_modulus, cpu.bulk_modulus, rtol=1e-12)
+
+    def test_bulk_modulus_3d(self):
+        shape = (16, 16, 16)
+        grid = DummyGrid3D(nx=shape[0], ny=shape[1], nz=shape[2])
+        cpu, gpu = self._make_pair(shape, grid)
+
+        np.testing.assert_allclose(gpu.bulk_modulus, cpu.bulk_modulus, rtol=1e-12)
+
+
+class TestMediumRelaxationMapsBulkModulusCupyEquivalence:
+    """Compare CPU vs GPU for MediumRelaxationMaps.bulk_modulus."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    def test_bulk_modulus_2d(self):
+        shape = (20, 20)
+        grid = DummyGrid2D(nx=shape[0], ny=shape[1])
+        rng = np.random.default_rng(456)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+        relax = _get_relaxation_dict(shape)
+
+        cpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=False,
+        )
+        gpu = MediumRelaxationMaps(
+            grid,
+            sound_speed.copy(),
+            density.copy(),
+            beta.copy(),
+            {k: v.copy() for k, v in relax.items()},
+            use_gpu=True,
+        )
+        np.testing.assert_allclose(gpu.bulk_modulus, cpu.bulk_modulus, rtol=1e-12)
+
+
+class TestInputFileWriterCupyEquivalence:
+    """Compare CPU vs GPU for the InputFileWriter dc map and the dim calculation."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    @pytest.fixture()
+    def setup_2d(self):
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(1e-2, 1e-2),
+            f0=1e6,
+            duration=1e-5,
+            c0=1540.0,
+            ppw=6,
+            cfl=0.4,
+        )
+        shape = (grid.nx, grid.ny)
+
+        rng = np.random.default_rng(777)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        alpha_exp = rng.uniform(0.9, 1.0, shape)
+        beta = np.zeros(shape)
+
+        medium = fullwave.MediumExponentialAttenuation(
+            grid,
+            sound_speed,
+            density,
+            alpha_exp,
+            beta,
+            use_gpu=False,
+        )
+        return grid, medium
+
+    def test_dim_calc(self, setup_2d):
+        grid, medium = setup_2d
+        # CPU dim
+        cpu_dim = int(
+            np.rint(medium.sound_speed.max()) - np.rint(medium.sound_speed.min()),
+        )
+        # GPU dim
+        import cupy as cp
+
+        c_gpu = cp.asarray(medium.sound_speed)
+        gpu_dim = int(cp.rint(c_gpu.max()) - cp.rint(c_gpu.min()))
+
+        assert cpu_dim == gpu_dim
+
+    def test_dc_map(self, setup_2d, tmp_path):
+        import fullwave
+
+        grid, medium = setup_2d
+
+        src_coords = np.array([[grid.nx // 2, grid.ny // 2]])
+        source = fullwave.Source(
+            p0=np.ones((1, 10)),
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+        sensor = fullwave.Sensor(
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+
+        cpu_writer = InputFileWriter(
+            work_dir=tmp_path / "cpu",
+            grid=grid,
+            medium=medium,
+            source=source,
+            sensor=sensor,
+            validate_input=False,
+            use_exponential_attenuation=True,
+            use_gpu=False,
+        )
+        gpu_writer = InputFileWriter(
+            work_dir=tmp_path / "gpu",
+            grid=grid,
+            medium=medium,
+            source=source,
+            sensor=sensor,
+            validate_input=False,
+            use_exponential_attenuation=True,
+            use_gpu=True,
+        )
+        np.testing.assert_array_equal(gpu_writer._dc_map, cpu_writer._dc_map)
+
+
+class TestPMLBuilderRelaxationCupyEquivalence:
+    """Compare CPU vs GPU for PMLBuilder (multiple relaxation path)."""
+
+    @pytest.fixture(autouse=True)
+    def _patch(self, monkeypatch):
+        monkeypatch.setattr(medium_module, "check_functions", _dummy_check_functions())
+
+    @pytest.fixture()
+    def setup_2d(self):
+        import fullwave
+
+        grid = fullwave.Grid(
+            domain_size=(1e-2, 1e-2),
+            f0=1e6,
+            duration=1e-5,
+            c0=1540.0,
+            ppw=6,
+            cfl=0.4,
+        )
+        shape = (grid.nx, grid.ny)
+
+        rng = np.random.default_rng(321)
+        sound_speed = rng.uniform(1400, 1600, shape)
+        density = rng.uniform(900, 1100, shape)
+        beta = rng.uniform(0.5, 1.5, shape)
+        relax = _get_relaxation_dict(shape)
+
+        medium = fullwave.MediumRelaxationMaps(
+            grid,
+            sound_speed,
+            density,
+            beta,
+            relax,
+            use_gpu=False,
+        )
+        src_coords = np.array([[grid.nx // 2, i] for i in range(grid.ny)])
+        sen_coords = np.array([[0, i] for i in range(grid.ny)])
+        source = fullwave.Source(
+            p0=np.ones((src_coords.shape[0], 10)),
+            coords=src_coords,
+            grid_shape=shape,
+        )
+        sensor = fullwave.Sensor(
+            coords=sen_coords,
+            grid_shape=shape,
+        )
+        return grid, medium, source, sensor
+
+    def test_extended_medium_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu = PMLBuilder(grid, medium, source, sensor, use_gpu=False)
+        gpu = PMLBuilder(grid, medium, source, sensor, use_gpu=True)
+
+        np.testing.assert_allclose(
+            gpu.extended_medium.sound_speed,
+            cpu.extended_medium.sound_speed,
+            rtol=1e-14,
+        )
+        np.testing.assert_allclose(
+            gpu.extended_medium.density,
+            cpu.extended_medium.density,
+            rtol=1e-14,
+        )
+        for key in cpu.extended_medium.relaxation_param_dict:
+            np.testing.assert_allclose(
+                gpu.extended_medium.relaxation_param_dict[key],
+                cpu.extended_medium.relaxation_param_dict[key],
+                rtol=1e-10,
+                err_msg=f"extended relaxation_param_dict[{key}] mismatch",
+            )
+
+    def test_run_identical(self, setup_2d):
+        grid, medium, source, sensor = setup_2d
+        cpu_builder = PMLBuilder(grid, medium, source, sensor, use_gpu=False)
+        gpu_builder = PMLBuilder(grid, medium, source, sensor, use_gpu=True)
+
+        cpu_result = cpu_builder.run(use_pml=True)
+        gpu_result = gpu_builder.run(use_pml=True)
+
+        np.testing.assert_allclose(
+            gpu_result.sound_speed,
+            cpu_result.sound_speed,
+            rtol=1e-12,
+        )
+        np.testing.assert_allclose(
+            gpu_result.density,
+            cpu_result.density,
+            rtol=1e-12,
+        )
+        for key in cpu_result.relaxation_param_dict_for_fw2:
+            np.testing.assert_allclose(
+                gpu_result.relaxation_param_dict_for_fw2[key],
+                cpu_result.relaxation_param_dict_for_fw2[key],
+                rtol=1e-10,
+                err_msg=f"run() relaxation_param_dict_for_fw2[{key}] mismatch",
+            )

From 018db00301d25709e0c677db77ba49f96aa71823 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 13:19:38 -0500
Subject: [PATCH 21/31] Enhance GPU support across Medium, PMLBuilder, and
 InputFileWriter classes; streamline array handling and ensure compatibility
 with CuPy.

---
 fullwave/medium.py                   | 220 +++++++++++++--------------
 fullwave/solver/input_file_writer.py |  19 ++-
 fullwave/solver/pml_builder.py       |  20 +--
 tests/test_cupy_equivalence.py       |  73 +++++----
 4 files changed, 164 insertions(+), 168 deletions(-)

diff --git a/fullwave/medium.py b/fullwave/medium.py
index 81e3172..5caaa1e 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -144,16 +144,17 @@ def __init__(
             )
         self.n_relaxation_mechanisms = n_relaxation_mechanisms
         self.dtype = np.dtype(dtype)
+        xp = self.xp
         self.relaxation_param_dict = initialize_relaxation_param_dict(
             n_relaxation_mechanisms=n_relaxation_mechanisms,
-            value=np.zeros_like(sound_speed, dtype=self.dtype),
+            value=xp.zeros_like(xp.asarray(sound_speed), dtype=self.dtype),
         )
         self.grid = grid
         self.is_3d = grid.is_3d
 
-        self.sound_speed = sound_speed
-        self.density = density
-        self.beta = beta
+        self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+        self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+        self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
@@ -167,7 +168,6 @@ def __init__(
             self.air_coords = np.empty((0, ndim), dtype=np.int64)
 
         self.n_jobs = n_jobs
-        self.__post_init__()
 
         self._update_relaxation_param_dict(
             relaxation_param_updates=relaxation_param_dict,
@@ -179,12 +179,6 @@ def __init__(
         self.check_fields()
         logger.debug("MediumRelaxationMaps instance created.")
 
-    def __post_init__(self) -> None:
-        """Post-initialization processing for Medium."""
-        self.sound_speed = np.atleast_2d(self.sound_speed).astype(self.dtype, copy=False)
-        self.density = np.atleast_2d(self.density).astype(self.dtype, copy=False)
-        self.beta = np.atleast_2d(self.beta).astype(self.dtype, copy=False)
-
     def _update_relaxation_param_dict(
         self,
         relaxation_param_updates: dict[str, NDArray[np.float64]],
@@ -251,8 +245,8 @@ def _sort_by_time_const_gpu(
                         xp.where(swap, a_gpu[i], a_gpu[j]),
                     )
             for i in range(n_nu):
-                d_arrays[i] = xp.asnumpy(d_gpu[i])
-                a_arrays[i] = xp.asnumpy(a_gpu[i])
+                d_arrays[i] = d_gpu[i]
+                a_arrays[i] = a_gpu[i]
 
         if use_gpu:
             _sort_by_time_const_gpu(d_x1, a_x1, kappa_x1)
@@ -266,15 +260,15 @@ def _sort_by_time_const_gpu(
 
         # Write results into relaxation_param_dict
         param_dict = self.relaxation_param_dict
-        param_dict["kappa_x1"] = np.atleast_2d(kappa_x1)
-        param_dict["kappa_x2"] = np.atleast_2d(kappa_x2)
+        param_dict["kappa_x1"] = xp.atleast_2d(xp.asarray(kappa_x1))
+        param_dict["kappa_x2"] = xp.atleast_2d(xp.asarray(kappa_x2))
 
         for i in range(n_nu):
             nu = i + 1
-            param_dict[f"d_x1_nu{nu}"] = np.atleast_2d(d_x1[i])
-            param_dict[f"alpha_x1_nu{nu}"] = np.atleast_2d(a_x1[i])
-            param_dict[f"d_x2_nu{nu}"] = np.atleast_2d(d_x2[i])
-            param_dict[f"alpha_x2_nu{nu}"] = np.atleast_2d(a_x2[i])
+            param_dict[f"d_x1_nu{nu}"] = xp.atleast_2d(xp.asarray(d_x1[i]))
+            param_dict[f"alpha_x1_nu{nu}"] = xp.atleast_2d(xp.asarray(a_x1[i]))
+            param_dict[f"d_x2_nu{nu}"] = xp.atleast_2d(xp.asarray(d_x2[i]))
+            param_dict[f"alpha_x2_nu{nu}"] = xp.atleast_2d(xp.asarray(a_x2[i]))
 
         # Cache and check keys
         desired_key_set = getattr(
@@ -329,16 +323,17 @@ def check_relaxation_param_dict(
                 )
                 raise ValueError(error_msg)
 
+    def _to_numpy(self, arr: NDArray) -> NDArray:
+        """Transfer array to CPU numpy. No-op if already numpy."""
+        if self.xp is not np:
+            return self.xp.asnumpy(arr)
+        return arr
+
     @property
-    def bulk_modulus(self) -> NDArray[np.float64]:
-        """Return the bulk_modulus."""
+    def bulk_modulus(self) -> NDArray:
+        """Return the bulk_modulus (stays on same device as source arrays)."""
         xp = self.xp
-        if xp is not np:
-            c_gpu = xp.asarray(self.sound_speed)
-            rho_gpu = xp.asarray(self.density)
-            result = c_gpu * c_gpu * rho_gpu
-            return xp.asnumpy(result)
-        return np.multiply(self.sound_speed**2, self.density)
+        return xp.multiply(self.sound_speed**2, self.density)
 
     @property
     def air_map(self) -> NDArray[np.int64]:
@@ -403,8 +398,6 @@ def _calc_a_and_b(
             a = a.astype(output_dtype, copy=False)
             b = b.astype(output_dtype, copy=False)
 
-        if use_gpu:
-            return xp.asnumpy(a), xp.asnumpy(b)
         return a, b
 
     @staticmethod
@@ -571,17 +564,18 @@ def plot(
             error_msg = "3D plotting is not implemented yet."
             raise NotImplementedError(error_msg)
 
+        _np = self._to_numpy
         if plot_fw2_params:
             target_map_dict: OrderedDict = OrderedDict(
                 [
-                    ("Sound speed", self.sound_speed),
-                    ("Density", self.density),
-                    ("Beta", self.beta),
+                    ("Sound speed", _np(self.sound_speed)),
+                    ("Density", _np(self.density)),
+                    ("Beta", _np(self.beta)),
                     ("Air map", self.air_map),
                 ],
             )
             for key in self.relaxation_param_dict_for_fw2:
-                target_map_dict[key] = self.relaxation_param_dict_for_fw2[key]
+                target_map_dict[key] = _np(self.relaxation_param_dict_for_fw2[key])
         else:
             relaxation_param_dict_keys = initialize_relaxation_param_dict(
                 n_relaxation_mechanisms=self.n_relaxation_mechanisms,
@@ -589,14 +583,14 @@ def plot(
 
             target_map_dict: OrderedDict = OrderedDict(
                 [
-                    ("Sound speed", self.sound_speed),
-                    ("Density", self.density),
-                    ("Beta", self.beta),
+                    ("Sound speed", _np(self.sound_speed)),
+                    ("Density", _np(self.density)),
+                    ("Beta", _np(self.beta)),
                     ("Air map", self.air_map),
                 ],
             )
             for key in relaxation_param_dict_keys:
-                target_map_dict[key] = self.relaxation_param_dict[key]
+                target_map_dict[key] = _np(self.relaxation_param_dict[key])
 
         num_plots = len(target_map_dict)
         # calculate subplot shape to make a square
@@ -641,21 +635,22 @@ def __str__(self) -> str:
             A string summarizing the Medium properties.
 
         """
+        xp = self.xp
         return (
             f"Relaxation Medium:\n"
             f"  Grid: {self.grid}\n"
             "\n"
-            f"  Sound speed: min {np.min(self.sound_speed):.2f} m/s, "
-            f"max {np.max(self.sound_speed):.2f} m/s\n"
-            f"  Density: min {np.min(self.density):.2f} kg/m^3, "
-            f"max {np.max(self.density):.2f} kg/m^3\n"
-            f"  Beta: min {np.min(self.beta):.2f}, max {np.max(self.beta):.2f}\n"
+            f"  Sound speed: min {float(xp.min(self.sound_speed)):.2f} m/s, "
+            f"max {float(xp.max(self.sound_speed)):.2f} m/s\n"
+            f"  Density: min {float(xp.min(self.density)):.2f} kg/m^3, "
+            f"max {float(xp.max(self.density)):.2f} kg/m^3\n"
+            f"  Beta: min {float(xp.min(self.beta)):.2f}, max {float(xp.max(self.beta)):.2f}\n"
             f"  Number of air coordinates: {self.n_air}\n"
             f"  Number of relaxation mechanisms: {self.n_relaxation_mechanisms}\n"
             f"  Relaxation parameters:\n"
         ) + "".join(
             [
-                f"    {key}: min {np.min(value):.4e}, max {np.max(value):.4e}\n"
+                f"    {key}: min {float(xp.min(value)):.4e}, max {float(xp.max(value)):.4e}\n"
                 for key, value in self.relaxation_param_dict.items()
             ],
         )
@@ -762,10 +757,11 @@ def __init__(
         self.is_3d = grid.is_3d
         self.dtype = np.dtype(dtype)
 
-        self.sound_speed = sound_speed
-        self.density = density
-        self.alpha_exp = alpha_exp
-        self.beta = beta
+        xp = self.xp
+        self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+        self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+        self.alpha_exp = xp.atleast_2d(xp.asarray(alpha_exp)).astype(self.dtype, copy=False)
+        self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
@@ -778,16 +774,8 @@ def __init__(
             ndim = 3 if self.is_3d else 2
             self.air_coords = np.empty((0, ndim), dtype=np.int64)
 
-        self.__post_init__()
         self.check_fields()
 
-    def __post_init__(self) -> None:
-        """Post-initialization processing for Medium."""
-        self.sound_speed = np.atleast_2d(self.sound_speed).astype(self.dtype, copy=False)
-        self.density = np.atleast_2d(self.density).astype(self.dtype, copy=False)
-        self.alpha_exp = np.atleast_2d(self.alpha_exp).astype(self.dtype, copy=False)
-        self.beta = np.atleast_2d(self.beta).astype(self.dtype, copy=False)
-
     def check_fields(self) -> None:
         """Check if the fields have the correct shape."""
         grid_shape: tuple[int, ...]
@@ -807,6 +795,12 @@ def _error_msg(
         assert self.alpha_exp.shape == grid_shape, _error_msg(self.alpha_exp, grid_shape)
         assert self.beta.shape == grid_shape, _error_msg(self.beta, grid_shape)
 
+    def _to_numpy(self, arr: NDArray) -> NDArray:
+        """Transfer array to CPU numpy. No-op when the backend (self.xp) is numpy."""
+        if self.xp is not np:
+            return self.xp.asnumpy(arr)
+        return arr
+
     @property
     def air_map(self) -> NDArray[np.int64]:
         """Returns the air map.
@@ -823,15 +817,10 @@ def air_map(self) -> NDArray[np.int64]:
         return coords_to_map(self.air_coords, grid_shape=grid_shape, is_3d=self.is_3d)
 
     @property
-    def bulk_modulus(self) -> NDArray[np.float64]:
-        """Return the bulk_modulus."""
-        xp = getattr(self, "xp", np)
-        if xp is not np:
-            c_gpu = xp.asarray(self.sound_speed)
-            rho_gpu = xp.asarray(self.density)
-            result = c_gpu * c_gpu * rho_gpu
-            return xp.asnumpy(result)
-        return np.multiply(self.sound_speed**2, self.density)
+    def bulk_modulus(self) -> NDArray:
+        """Return the bulk_modulus (stays on same device as source arrays)."""
+        xp = self.xp
+        return xp.multiply(self.sound_speed**2, self.density)
 
     @property
     def n_coords_zero(self) -> int:
@@ -1046,11 +1035,12 @@ def __init__(
         self.is_3d = grid.is_3d
         self.dtype = np.dtype(dtype)
 
-        self.sound_speed = sound_speed
-        self.density = density
-        self.alpha_coeff = alpha_coeff
-        self.alpha_power = alpha_power
-        self.beta = beta
+        xp = self.xp
+        self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+        self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+        self.alpha_coeff = xp.atleast_2d(xp.asarray(alpha_coeff)).astype(self.dtype, copy=False)
+        self.alpha_power = xp.atleast_2d(xp.asarray(alpha_power)).astype(self.dtype, copy=False)
+        self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
@@ -1078,18 +1068,9 @@ def __init__(
 
         self.attenuation_builder = attenuation_builder
         self.n_jobs = n_jobs
-        self.__post_init__()
         self.check_fields()
         logger.debug("Medium instance created.")
 
-    def __post_init__(self) -> None:
-        """Post-initialization processing for Medium."""
-        self.sound_speed = np.atleast_2d(self.sound_speed).astype(self.dtype, copy=False)
-        self.density = np.atleast_2d(self.density).astype(self.dtype, copy=False)
-        self.alpha_coeff = np.atleast_2d(self.alpha_coeff).astype(self.dtype, copy=False)
-        self.alpha_power = np.atleast_2d(self.alpha_power).astype(self.dtype, copy=False)
-        self.beta = np.atleast_2d(self.beta).astype(self.dtype, copy=False)
-
     def check_fields(self) -> None:
         """Check if the fields have the correct shape."""
         grid_shape: tuple[int, ...]
@@ -1126,16 +1107,17 @@ def air_map(self) -> NDArray[np.int64]:
             return np.zeros(grid_shape, dtype=int)
         return coords_to_map(self.air_coords, grid_shape=grid_shape, is_3d=self.is_3d)
 
+    def _to_numpy(self, arr: NDArray) -> NDArray:
+        """Transfer array to CPU numpy. No-op when the backend (self.xp) is numpy."""
+        if self.xp is not np:
+            return self.xp.asnumpy(arr)
+        return arr
+
     @property
-    def bulk_modulus(self) -> NDArray[np.float64]:
-        """Return the bulk_modulus."""
-        xp = getattr(self, "xp", np)
-        if xp is not np:
-            c_gpu = xp.asarray(self.sound_speed)
-            rho_gpu = xp.asarray(self.density)
-            result = c_gpu * c_gpu * rho_gpu
-            return xp.asnumpy(result)
-        return np.multiply(self.sound_speed**2, self.density)
+    def bulk_modulus(self) -> NDArray:
+        """Return the bulk_modulus (stays on same device as source arrays)."""
+        xp = self.xp
+        return xp.multiply(self.sound_speed**2, self.density)
 
     @property
     def n_coords_zero(self) -> int:
@@ -1161,6 +1143,7 @@ def plot(
         dpi: int = 300,
     ) -> None:
         """Plot the medium fields using matplotlib."""
+        _np = self._to_numpy
         if self.is_3d:
             plt.close("all")
             _, axes = plt.subplots(2, 6, figsize=figsize)
@@ -1168,16 +1151,16 @@ def plot(
             for ax, map_data, title in zip(
                 axes.flatten(),
                 [
-                    self.sound_speed[:, :, self.grid.nz // 2],
-                    self.sound_speed[:, self.grid.ny // 2, :],
-                    self.density[:, :, self.grid.nz // 2],
-                    self.density[:, self.grid.ny // 2, :],
-                    self.alpha_coeff[:, :, self.grid.nz // 2],
-                    self.alpha_coeff[:, self.grid.ny // 2, :],
-                    self.alpha_power[:, :, self.grid.nz // 2],
-                    self.alpha_power[:, self.grid.ny // 2, :],
-                    self.beta[:, :, self.grid.nz // 2],
-                    self.beta[:, self.grid.ny // 2, :],
+                    _np(self.sound_speed[:, :, self.grid.nz // 2]),
+                    _np(self.sound_speed[:, self.grid.ny // 2, :]),
+                    _np(self.density[:, :, self.grid.nz // 2]),
+                    _np(self.density[:, self.grid.ny // 2, :]),
+                    _np(self.alpha_coeff[:, :, self.grid.nz // 2]),
+                    _np(self.alpha_coeff[:, self.grid.ny // 2, :]),
+                    _np(self.alpha_power[:, :, self.grid.nz // 2]),
+                    _np(self.alpha_power[:, self.grid.ny // 2, :]),
+                    _np(self.beta[:, :, self.grid.nz // 2]),
+                    _np(self.beta[:, self.grid.ny // 2, :]),
                     self.air_map[:, :, self.grid.nz // 2],
                     self.air_map[:, self.grid.ny // 2, :],
                 ],
@@ -1216,11 +1199,11 @@ def plot(
             for ax, map_data, title in zip(
                 axes.flatten(),
                 [
-                    self.sound_speed,
-                    self.density,
-                    self.alpha_coeff,
-                    self.alpha_power,
-                    self.beta,
+                    _np(self.sound_speed),
+                    _np(self.density),
+                    _np(self.alpha_coeff),
+                    _np(self.alpha_power),
+                    _np(self.beta),
                     self.air_map,
                 ],
                 [
@@ -1285,11 +1268,15 @@ def build(self) -> MediumRelaxationMaps:
 
         """
         logger.debug("Building MediumRelaxationMaps from alpha and power maps.")
+        xp = self.xp
+        # generate_relaxation_params uses Numba and requires numpy arrays
+        alpha_coeff_np = xp.asnumpy(self.alpha_coeff) if xp is not np else self.alpha_coeff
+        alpha_power_np = xp.asnumpy(self.alpha_power) if xp is not np else self.alpha_power
         if self.attenuation_builder == "lookup":
             relaxation_param_dict = generate_relaxation_params(
                 n_relaxation_mechanisms=self.n_relaxation_mechanisms,
-                alpha_coeff=self.alpha_coeff,
-                alpha_power=self.alpha_power,
+                alpha_coeff=alpha_coeff_np,
+                alpha_power=alpha_power_np,
                 path_database=self.path_relaxation_parameters_database,
             )
         else:
@@ -1302,6 +1289,8 @@ def build(self) -> MediumRelaxationMaps:
             relaxation_param_dict = {
                 k: v.astype(self.dtype, copy=False) for k, v in relaxation_param_dict.items()
             }
+        # No explicit host-to-device copy is needed here: MediumRelaxationMaps
+        # is expected to call xp.asarray on the relaxation params in __init__.
         return MediumRelaxationMaps(
             grid=self.grid,
             sound_speed=self.sound_speed,
@@ -1342,9 +1331,9 @@ def _db_mhz_cm_to_a_exp(
 
         xp = self.xp
         if xp is not np:
+            # alpha_coeff may already be a CuPy array; asarray is a no-op in that case
             alpha_gpu = xp.asarray(alpha_coeff)
-            result = xp.exp(att_factor_dt * alpha_gpu)
-            return xp.asnumpy(result)
+            return xp.exp(att_factor_dt * alpha_gpu)
         return ne.evaluate("exp(att * a)", local_dict={"a": alpha_coeff, "att": att_factor_dt})
 
     def build_exponential(self) -> MediumExponentialAttenuation:
@@ -1388,19 +1377,20 @@ def __str__(self) -> str:
             A string summarizing the Medium properties.
 
         """
+        xp = self.xp
         return (
             f"Medium: \n"
             f"  Grid: {self.grid}\n"
             "\n"
-            f"  Sound speed: min={np.min(self.sound_speed):.2f}, "
-            f"max={np.max(self.sound_speed):.2f}\n"
-            f"  Density: min={np.min(self.density):.2f}, "
-            f"max={np.max(self.density):.2f}\n"
-            f"  Alpha coeff: min={np.min(self.alpha_coeff):.2f}, "
-            f"max={np.max(self.alpha_coeff):.2f}\n"
-            f"  Alpha power: min={np.min(self.alpha_power):.2f}, "
-            f"max={np.max(self.alpha_power):.2f}\n"
-            f"  Beta: min={np.min(self.beta):.2f}, max={np.max(self.beta):.2f}\n"
+            f"  Sound speed: min={float(xp.min(self.sound_speed)):.2f}, "
+            f"max={float(xp.max(self.sound_speed)):.2f}\n"
+            f"  Density: min={float(xp.min(self.density)):.2f}, "
+            f"max={float(xp.max(self.density)):.2f}\n"
+            f"  Alpha coeff: min={float(xp.min(self.alpha_coeff)):.2f}, "
+            f"max={float(xp.max(self.alpha_coeff)):.2f}\n"
+            f"  Alpha power: min={float(xp.min(self.alpha_power)):.2f}, "
+            f"max={float(xp.max(self.alpha_power)):.2f}\n"
+            f"  Beta: min={float(xp.min(self.beta)):.2f}, max={float(xp.max(self.beta)):.2f}\n"
             f"  Number of air coords: {self.n_air}\n"
             f"  Attenuation builder: {self.attenuation_builder}\n"
         )
diff --git a/fullwave/solver/input_file_writer.py b/fullwave/solver/input_file_writer.py
index bee8459..ac82b7a 100644
--- a/fullwave/solver/input_file_writer.py
+++ b/fullwave/solver/input_file_writer.py
@@ -112,18 +112,20 @@ def __init__(
             try:
                 import cupy as cp  # noqa: PLC0415
 
+                # sound_speed may already be CuPy; asarray reuses it only if already float64
                 c_gpu = cp.asarray(self.medium.sound_speed, dtype=cp.float64)
                 c_min_val = float(c_gpu.min())
                 c_max_val = float(c_gpu.max())
                 self._dim = int(cp.rint(cp.float64(c_max_val)) - cp.rint(cp.float64(c_min_val)))
 
-                # Compute dc_map in the same GPU pass (avoids a second H2D transfer)
+                # Compute dc_map on GPU (use a copy to avoid mutating medium data)
+                c_tmp = c_gpu.copy()
                 c_min_rounded = float(matlab_round(c_min_val))
                 offset = -c_min_rounded + 1
-                c_gpu += 1e-9
-                cp.rint(c_gpu, out=c_gpu)
-                c_gpu += offset
-                self._dc_map = cp.asnumpy(c_gpu.astype(cp.int32))
+                c_tmp += 1e-9
+                cp.rint(c_tmp, out=c_tmp)
+                c_tmp += offset
+                self._dc_map = cp.asnumpy(c_tmp.astype(cp.int32))
                 logger.debug("dc map for stencil coefficients set (GPU, fused).")
                 self._dc_map_ready = True
             except ImportError:
@@ -1244,6 +1246,13 @@ def _write_matrix(
 
         dtype = np.dtype(var_type)
 
+        # Transfer CuPy arrays to CPU for file writing
+        if not isinstance(variable_mat, np.ndarray):
+            try:
+                variable_mat = variable_mat.get()  # CuPy → numpy
+            except AttributeError:
+                variable_mat = np.asarray(variable_mat)
+
         # Fast path: no conversion, no reorder.
         if variable_mat.dtype == dtype and variable_mat.flags.c_contiguous:
             variable_mat.tofile(save_path)  # writes in C order
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index fe401d6..0c44d7f 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -537,8 +537,8 @@ def _extend_map_for_pml(
         xp = self.xp
         pad = self.num_boundary_points
 
-        # Transfer to GPU if needed
-        input_gpu = xp.asarray(input_map) if xp is not np else input_map
+        # Ensure array is on the correct device (no-op if already there)
+        input_gpu = xp.asarray(input_map)
 
         # Pre-allocate output array with correct dtype
         if self.is_3d:
@@ -592,9 +592,6 @@ def _extend_map_for_pml(
                 output[:, :pad] = 0
                 output[:, pad + ny :] = 0
 
-        # Transfer back to CPU if needed
-        if xp is not np:
-            return xp.asnumpy(output)
         return output
 
     def _localize_pml_region(self) -> tuple[NDArray[np.float64], ...]:
@@ -706,8 +703,6 @@ def _calc_a_and_b(
             a = a.astype(output_dtype, copy=False)
             b = b.astype(output_dtype, copy=False)
 
-        if use_gpu:
-            return xp.asnumpy(a), xp.asnumpy(b)
         return a, b
 
     def run(self, *, use_pml: bool = True) -> fullwave.MediumRelaxationMaps:
@@ -1452,10 +1447,7 @@ def _apply_transition_and_pml(  # noqa: PLR0912
         working_array[down_start:down_end] = down_vals - trans_down * (down_vals - value_target)
 
         # Move axis back
-        result = xp.moveaxis(working_array, 0, axis)
-        if use_gpu:
-            return xp.asnumpy(result)
-        return result
+        return xp.moveaxis(working_array, 0, axis)
 
     @staticmethod
     def _calc_time_constants(
@@ -1940,8 +1932,7 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
             mmax = float(xp.sqrt(mask_sq.max()))
             if mmax > 0.0:
                 mask_sq = mask_sq / (mmax * mmax)
-            result = xp.maximum(1 - xp.sqrt(mask_sq), 0)
-            return xp.asnumpy(result)
+            return xp.maximum(1 - xp.sqrt(mask_sq), 0)
 
         rx_np = rx  # noqa: F841
         ry_np = ry  # noqa: F841
@@ -1992,8 +1983,7 @@ def edge_distance_1d(n: int, n_body: int) -> NDArray[np.float32]:
             mmax = float(xp.sqrt(mask_sq.max()))
             if mmax > 0.0:
                 mask_sq = mask_sq / (mmax * mmax)
-            result = xp.maximum(1 - xp.sqrt(mask_sq), 0)
-            return xp.asnumpy(result)
+            return xp.maximum(1 - xp.sqrt(mask_sq), 0)
 
         rx_np = rx  # noqa: F841
         ry_np = ry  # noqa: F841
diff --git a/tests/test_cupy_equivalence.py b/tests/test_cupy_equivalence.py
index 3b09d37..7a02b68 100644
--- a/tests/test_cupy_equivalence.py
+++ b/tests/test_cupy_equivalence.py
@@ -66,6 +66,13 @@ def __init__(self, nx, ny, nz, dt=1e-8, f0=1e6, c0=1540.0, ppw=12, cfl=0.4):
         self.is_3d = True
 
 
+def _to_np(arr):
+    """Convert a non-numpy (CuPy) array via ``.get()``; numpy arrays are returned as-is."""
+    if isinstance(arr, np.ndarray):
+        return arr
+    return arr.get()
+
+
 def _dummy_check_functions():
     return type(
         "dummy",
@@ -130,7 +137,7 @@ def test_db_mhz_cm_to_a_exp_2d(self):
         cpu_result = cpu._db_mhz_cm_to_a_exp(cpu.alpha_coeff)
         gpu_result = gpu._db_mhz_cm_to_a_exp(gpu.alpha_coeff)
 
-        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-12)
 
     def test_db_mhz_cm_to_a_exp_3d(self):
         shape = (16, 16, 16)
@@ -140,7 +147,7 @@ def test_db_mhz_cm_to_a_exp_3d(self):
         cpu_result = cpu._db_mhz_cm_to_a_exp(cpu.alpha_coeff)
         gpu_result = gpu._db_mhz_cm_to_a_exp(gpu.alpha_coeff)
 
-        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-12)
 
 
 class TestMediumRelaxationMapsCupyEquivalence:
@@ -182,7 +189,7 @@ def test_relaxation_param_dict_2d(self):
 
         for key in cpu.relaxation_param_dict:
             np.testing.assert_allclose(
-                gpu.relaxation_param_dict[key],
+                _to_np(gpu.relaxation_param_dict[key]),
                 cpu.relaxation_param_dict[key],
                 rtol=1e-10,
                 err_msg=f"relaxation_param_dict[{key}] mismatch",
@@ -195,7 +202,7 @@ def test_relaxation_param_dict_for_fw2_2d(self):
 
         for key in cpu.relaxation_param_dict_for_fw2:
             np.testing.assert_allclose(
-                gpu.relaxation_param_dict_for_fw2[key],
+                _to_np(gpu.relaxation_param_dict_for_fw2[key]),
                 cpu.relaxation_param_dict_for_fw2[key],
                 rtol=1e-10,
                 err_msg=f"relaxation_param_dict_for_fw2[{key}] mismatch",
@@ -215,8 +222,8 @@ def test_calc_a_and_b(self):
         a_cpu, b_cpu = cpu._calc_a_and_b(dx, kappa, alpha, dt)
         a_gpu, b_gpu = gpu._calc_a_and_b(dx, kappa, alpha, dt)
 
-        np.testing.assert_allclose(a_gpu, a_cpu, rtol=1e-12)
-        np.testing.assert_allclose(b_gpu, b_cpu, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(a_gpu), a_cpu, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(b_gpu), b_cpu, rtol=1e-12)
 
 
 # ---------------------------------------------------------------------------
@@ -299,7 +306,7 @@ def test_extend_map_for_pml_2d_fill_edge(self, setup_2d):
         cpu_result = cpu._extend_map_for_pml(arr.copy(), fill_edge=True)
         gpu_result = gpu._extend_map_for_pml(arr.copy(), fill_edge=True)
 
-        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-14)
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-14)
 
     def test_extend_map_for_pml_2d_zero_fill(self, setup_2d):
         grid, medium, source, sensor = setup_2d
@@ -311,7 +318,7 @@ def test_extend_map_for_pml_2d_zero_fill(self, setup_2d):
         cpu_result = cpu._extend_map_for_pml(arr.copy(), fill_edge=False)
         gpu_result = gpu._extend_map_for_pml(arr.copy(), fill_edge=False)
 
-        np.testing.assert_allclose(gpu_result, cpu_result, rtol=1e-14)
+        np.testing.assert_allclose(_to_np(gpu_result), cpu_result, rtol=1e-14)
 
     def test_apply_transition_and_pml_2d(self, setup_2d):
         grid, medium, source, sensor = setup_2d
@@ -340,7 +347,7 @@ def test_apply_transition_and_pml_2d(self, setup_2d):
                     is_3d=False,
                 )
                 np.testing.assert_allclose(
-                    gpu_result,
+                    _to_np(gpu_result),
                     cpu_result,
                     rtol=1e-12,
                     err_msg=f"axis={axis}, transition_type={transition_type}",
@@ -360,8 +367,8 @@ def test_calc_a_and_b(self, setup_2d):
         a_cpu, b_cpu = cpu._calc_a_and_b(dx, kappa, alpha, dt)
         a_gpu, b_gpu = gpu._calc_a_and_b(dx, kappa, alpha, dt)
 
-        np.testing.assert_allclose(a_gpu, a_cpu, rtol=1e-12)
-        np.testing.assert_allclose(b_gpu, b_cpu, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(a_gpu), a_cpu, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(b_gpu), b_cpu, rtol=1e-12)
 
     def test_extended_medium_identical(self, setup_2d):
         """The extended medium (after __init__) should be identical CPU vs GPU."""
@@ -369,27 +376,27 @@ def test_extended_medium_identical(self, setup_2d):
         cpu, gpu = self._make_pml_pair(grid, medium, source, sensor)
 
         np.testing.assert_allclose(
-            gpu.extended_medium.sound_speed,
+            _to_np(gpu.extended_medium.sound_speed),
             cpu.extended_medium.sound_speed,
             rtol=1e-14,
         )
         np.testing.assert_allclose(
-            gpu.extended_medium.density,
+            _to_np(gpu.extended_medium.density),
             cpu.extended_medium.density,
             rtol=1e-14,
         )
         np.testing.assert_allclose(
-            gpu.extended_medium.alpha_coeff,
+            _to_np(gpu.extended_medium.alpha_coeff),
             cpu.extended_medium.alpha_coeff,
             rtol=1e-14,
         )
         np.testing.assert_allclose(
-            gpu.extended_medium.alpha_power,
+            _to_np(gpu.extended_medium.alpha_power),
             cpu.extended_medium.alpha_power,
             rtol=1e-14,
         )
         np.testing.assert_allclose(
-            gpu.extended_medium.beta,
+            _to_np(gpu.extended_medium.beta),
             cpu.extended_medium.beta,
             rtol=1e-14,
         )
@@ -460,7 +467,7 @@ def test_mask_body_2d(self, setup_2d):
         cpu_mask = cpu._mask_body_2d(nx, ny, cpu.num_boundary_points)
         gpu_mask = gpu._mask_body_2d(nx, ny, gpu.num_boundary_points)
 
-        np.testing.assert_allclose(gpu_mask, cpu_mask, rtol=1e-5, atol=1e-7)
+        np.testing.assert_allclose(_to_np(gpu_mask), cpu_mask, rtol=1e-5, atol=1e-7)
 
     def test_extended_medium_identical(self, setup_2d):
         grid, medium, source, sensor = setup_2d
@@ -480,12 +487,12 @@ def test_extended_medium_identical(self, setup_2d):
         )
 
         np.testing.assert_allclose(
-            gpu.extended_medium.sound_speed,
+            _to_np(gpu.extended_medium.sound_speed),
             cpu.extended_medium.sound_speed,
             rtol=1e-14,
         )
         np.testing.assert_allclose(
-            gpu.extended_medium.density,
+            _to_np(gpu.extended_medium.density),
             cpu.extended_medium.density,
             rtol=1e-14,
         )
@@ -511,23 +518,23 @@ def test_run_identical(self, setup_2d):
         gpu_result = gpu_builder.run(use_pml=True)
 
         np.testing.assert_allclose(
-            gpu_result.sound_speed,
+            _to_np(gpu_result.sound_speed),
             cpu_result.sound_speed,
             rtol=1e-12,
         )
         np.testing.assert_allclose(
-            gpu_result.density,
+            _to_np(gpu_result.density),
             cpu_result.density,
             rtol=1e-12,
         )
         np.testing.assert_allclose(
-            gpu_result.alpha_exp,
+            _to_np(gpu_result.alpha_exp),
             cpu_result.alpha_exp,
             rtol=1e-5,
             atol=1e-7,
         )
         np.testing.assert_allclose(
-            gpu_result.beta,
+            _to_np(gpu_result.beta),
             cpu_result.beta,
             rtol=1e-12,
         )
@@ -570,14 +577,14 @@ def test_bulk_modulus_2d(self):
         grid = DummyGrid2D(nx=shape[0], ny=shape[1])
         cpu, gpu = self._make_pair(shape, grid)
 
-        np.testing.assert_allclose(gpu.bulk_modulus, cpu.bulk_modulus, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(gpu.bulk_modulus), cpu.bulk_modulus, rtol=1e-12)
 
     def test_bulk_modulus_3d(self):
         shape = (16, 16, 16)
         grid = DummyGrid3D(nx=shape[0], ny=shape[1], nz=shape[2])
         cpu, gpu = self._make_pair(shape, grid)
 
-        np.testing.assert_allclose(gpu.bulk_modulus, cpu.bulk_modulus, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(gpu.bulk_modulus), cpu.bulk_modulus, rtol=1e-12)
 
 
 class TestMediumRelaxationMapsBulkModulusCupyEquivalence:
@@ -612,7 +619,7 @@ def test_bulk_modulus_2d(self):
             {k: v.copy() for k, v in relax.items()},
             use_gpu=True,
         )
-        np.testing.assert_allclose(gpu.bulk_modulus, cpu.bulk_modulus, rtol=1e-12)
+        np.testing.assert_allclose(_to_np(gpu.bulk_modulus), cpu.bulk_modulus, rtol=1e-12)
 
 
 class TestInputFileWriterCupyEquivalence:
@@ -702,7 +709,7 @@ def test_dc_map(self, setup_2d, tmp_path):
             use_exponential_attenuation=True,
             use_gpu=True,
         )
-        np.testing.assert_array_equal(gpu_writer._dc_map, cpu_writer._dc_map)
+        np.testing.assert_array_equal(_to_np(gpu_writer._dc_map), cpu_writer._dc_map)
 
 
 class TestPMLBuilderRelaxationCupyEquivalence:
@@ -759,18 +766,18 @@ def test_extended_medium_identical(self, setup_2d):
         gpu = PMLBuilder(grid, medium, source, sensor, use_gpu=True)
 
         np.testing.assert_allclose(
-            gpu.extended_medium.sound_speed,
+            _to_np(gpu.extended_medium.sound_speed),
             cpu.extended_medium.sound_speed,
             rtol=1e-14,
         )
         np.testing.assert_allclose(
-            gpu.extended_medium.density,
+            _to_np(gpu.extended_medium.density),
             cpu.extended_medium.density,
             rtol=1e-14,
         )
         for key in cpu.extended_medium.relaxation_param_dict:
             np.testing.assert_allclose(
-                gpu.extended_medium.relaxation_param_dict[key],
+                _to_np(gpu.extended_medium.relaxation_param_dict[key]),
                 cpu.extended_medium.relaxation_param_dict[key],
                 rtol=1e-10,
                 err_msg=f"extended relaxation_param_dict[{key}] mismatch",
@@ -785,18 +792,18 @@ def test_run_identical(self, setup_2d):
         gpu_result = gpu_builder.run(use_pml=True)
 
         np.testing.assert_allclose(
-            gpu_result.sound_speed,
+            _to_np(gpu_result.sound_speed),
             cpu_result.sound_speed,
             rtol=1e-12,
         )
         np.testing.assert_allclose(
-            gpu_result.density,
+            _to_np(gpu_result.density),
             cpu_result.density,
             rtol=1e-12,
         )
         for key in cpu_result.relaxation_param_dict_for_fw2:
             np.testing.assert_allclose(
-                gpu_result.relaxation_param_dict_for_fw2[key],
+                _to_np(gpu_result.relaxation_param_dict_for_fw2[key]),
                 cpu_result.relaxation_param_dict_for_fw2[key],
                 rtol=1e-10,
                 err_msg=f"run() relaxation_param_dict_for_fw2[{key}] mismatch",

From f6aa4781b9c9f5d1675ce29d60622701dd1c6b42 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 13:32:04 -0500
Subject: [PATCH 22/31] Implement GPU support for relaxation parameter
 generation using CuPy; add tests for CPU/GPU equivalence.

---
 fullwave/medium.py                      |   8 +-
 fullwave/utils/relaxation_parameters.py | 168 +++++++++++++++++++++---
 tests/test_cupy_equivalence.py          |  76 +++++++++++
 3 files changed, 225 insertions(+), 27 deletions(-)

diff --git a/fullwave/medium.py b/fullwave/medium.py
index 5caaa1e..878b3df 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -1268,15 +1268,11 @@ def build(self) -> MediumRelaxationMaps:
 
         """
         logger.debug("Building MediumRelaxationMaps from alpha and power maps.")
-        xp = self.xp
-        # generate_relaxation_params uses Numba and requires numpy arrays
-        alpha_coeff_np = xp.asnumpy(self.alpha_coeff) if xp is not np else self.alpha_coeff
-        alpha_power_np = xp.asnumpy(self.alpha_power) if xp is not np else self.alpha_power
         if self.attenuation_builder == "lookup":
             relaxation_param_dict = generate_relaxation_params(
                 n_relaxation_mechanisms=self.n_relaxation_mechanisms,
-                alpha_coeff=alpha_coeff_np,
-                alpha_power=alpha_power_np,
+                alpha_coeff=self.alpha_coeff,
+                alpha_power=self.alpha_power,
                 path_database=self.path_relaxation_parameters_database,
             )
         else:
diff --git a/fullwave/utils/relaxation_parameters.py b/fullwave/utils/relaxation_parameters.py
index e820888..17a80c3 100644
--- a/fullwave/utils/relaxation_parameters.py
+++ b/fullwave/utils/relaxation_parameters.py
@@ -247,6 +247,91 @@ def _map_parameters_search(
     return output_flat.reshape(*spatial_shape, n_params)
 
 
+def _map_parameters_search_gpu(
+    input_tensor: NDArray[np.float64],
+    look_up_table: NDArray[np.float64],
+    alpha_list: NDArray[np.float64],
+    power_list: NDArray[np.float64],
+    invalid_matrix: NDArray[np.bool_],
+) -> NDArray[np.float64]:
+    """GPU version of _map_parameters_search using CuPy searchsorted + fancy indexing.
+
+    Parameters
+    ----------
+    input_tensor : cp.ndarray
+        Input tensor with shape (..., 2) where last dim is (alpha, power).
+    look_up_table : NDArray[np.float64]
+        Precomputed parameter table shape (B1, B2, 4 * n_relaxation + 2).
+    alpha_list : NDArray[np.float64]
+        Alpha values of the lookup table; only ``alpha_list[0]`` is searched.
+    power_list : NDArray[np.float64]
+        Power values of the lookup table; only ``power_list[0]`` is searched.
+    invalid_matrix : NDArray[np.bool_]
+        Matrix indicating invalid (alpha, power) combinations.
+
+    Returns
+    -------
+    cp.ndarray
+        Output tensor with shape (..., 4 * n_relaxation + 2).
+
+    """
+    import cupy as cp  # noqa: PLC0415
+
+    logger.debug("Mapping parameters using CuPy GPU kernel.")
+    time_start = time.time()
+
+    spatial_shape = input_tensor.shape[:-1]
+
+    # Transfer small LUT arrays to GPU (these are tiny, ~KB)
+    alpha_sorted = cp.asarray(alpha_list[0].round(10))
+    power_sorted = cp.asarray(power_list[0].round(10))
+    lut_gpu = cp.asarray(look_up_table)
+
+    # Flatten spatial dims; the element count comes from host-side shape metadata
+    n_elements = int(np.prod(spatial_shape))
+    input_flat = input_tensor.reshape(n_elements, 2)
+
+    # Searchsorted on GPU
+    alpha_indices = cp.searchsorted(alpha_sorted, input_flat[:, 0], side="left")
+    power_indices = cp.searchsorted(power_sorted, input_flat[:, 1], side="left")
+
+    # Clip to valid range
+    alpha_indices = cp.clip(alpha_indices, 0, len(alpha_sorted) - 1)
+    power_indices = cp.clip(power_indices, 0, len(power_sorted) - 1)
+
+    # LUT lookup via fancy indexing
+    output_flat = lut_gpu[alpha_indices, power_indices, :]
+
+    # NOTE(review): wall-clock without an explicit device sync; may under-report GPU time
+    time_end = time.time()
+    logger.debug("CuPy GPU kernel time: %.4f seconds.", time_end - time_start)
+
+    # Check for invalid combinations; gather the flags once and reuse them
+    invalid_gpu = cp.asarray(invalid_matrix)
+    invalid_flat = invalid_gpu[alpha_indices, power_indices]
+    if bool(cp.any(invalid_flat)):
+        # Warning path only: copy just the offending (alpha, power) rows to the
+        # host instead of the whole input tensor. flatnonzero keeps the C-order
+        # traversal, and np.unique sorts the rows, so the message is unchanged.
+        invalid_rows = cp.asnumpy(input_flat[cp.flatnonzero(invalid_flat)])
+        # For a boolean invalid_matrix the row count equals the sum of the flags.
+        n_invalid = int(invalid_rows.shape[0])
+        invalid_alpha_power = np.unique(invalid_rows, axis=0)
+        invalid_attenuation = ", ".join(
+            [f"({a:.4f}, {p:.4f})" for a, p in invalid_alpha_power],
+        )
+        message = (
+            "Warning: Some attenuation values correspond to invalid relaxation parameters. "
+            "This is due to the limitations of the precomputed lookup table. "
+            "Please change the attenuation values.\n"
+            f"Number of invalid points: {n_invalid}.\n"
+            f"Invalid attenuation values (alpha, power): {invalid_attenuation}\n"
+        )
+        logger.warning(message)
+
+    return output_flat.reshape(*spatial_shape, look_up_table.shape[2])
+
+
 def generate_relaxation_params(
     alpha_coeff: NDArray[np.float64],
     alpha_power: NDArray[np.float64],
@@ -361,6 +446,9 @@ def generate(
     ) -> dict[str, NDArray[np.float64]]:
         """Generate relaxation parameters based on attenuation values.
 
+        Dispatches to CuPy GPU path when inputs are CuPy arrays,
+        otherwise uses the Numba CPU path.
+
         Parameters
         ----------
         alpha_coeff : NDArray[np.float64]
@@ -374,32 +462,24 @@ def generate(
             A dictionary containing the computed relaxation parameters.
 
         """
-        if np.any(alpha_coeff < self.alpha_min) or np.any(alpha_power < self.power_min):
-            error_msg = (
-                "attenuation is out of range."
-                "the out-of-range values will be clipped to the min value."
-                f"alpha minimum: {self.alpha_min}, "
-                f"power minimum: {self.power_min}"
-            )
-            logger.warning(error_msg)
-        if np.any(alpha_coeff > self.alpha_max) or np.any(alpha_power > self.power_max):
-            error_msg = (
-                "attenuation is out of range."
-                "the out-of-range values will be clipped to the max value."
-                f"alpha maximum: {self.alpha_max}, "
-                f"power maximum: {self.power_max}"
-            )
-            logger.warning(error_msg)
+        # Only CUDA device arrays take the CuPy path; NumPy scalars/host inputs stay on CPU.
+        use_gpu = hasattr(alpha_coeff, "__cuda_array_interface__")
+        if use_gpu:
+            return self._generate_gpu(alpha_coeff, alpha_power)
+        return self._generate_cpu(alpha_coeff, alpha_power)
+
+    def _generate_cpu(
+        self,
+        alpha_coeff: NDArray[np.float64],
+        alpha_power: NDArray[np.float64],
+    ) -> dict[str, NDArray[np.float64]]:
+        """CPU path using Numba fused kernel."""
+        self._warn_out_of_range(alpha_coeff, alpha_power, xp=np)
 
         alpha_coeff = np.clip(alpha_coeff, self.alpha_min, self.alpha_max)
         alpha_power = np.clip(alpha_power, self.power_min, self.power_max)
 
-        # # Normalize to [0, 1] for the lookup table
-        # alpha_coeff = (alpha_coeff - self.alpha_min) / (self.alpha_max - self.alpha_min)
-        # alpha_power = (alpha_power - self.power_min) / (self.power_max - self.power_min)
-
         input_data = np.stack([alpha_coeff, alpha_power], axis=-1)
-        # output = _map_parameters(input_data, self.look_up_table, self.alpha_list, self.power_list)
         output = _map_parameters_search(
             input_data,
             self.look_up_table,
@@ -412,3 +492,49 @@ def generate(
         for i, key in enumerate(relaxation_param_dict.keys()):
             relaxation_param_dict[key] = output[..., i]
         return relaxation_param_dict
+
+    def _generate_gpu(
+        self,
+        alpha_coeff: NDArray[np.float64],
+        alpha_power: NDArray[np.float64],
+    ) -> dict[str, NDArray[np.float64]]:
+        """GPU path: clip inputs, then LUT lookup with CuPy searchsorted + fancy indexing."""
+        import cupy as cp  # noqa: PLC0415
+
+        self._warn_out_of_range(alpha_coeff, alpha_power, xp=cp)
+        # Clamp both inputs to the generator's [min, max] bounds before the lookup.
+        alpha_coeff = cp.clip(alpha_coeff, self.alpha_min, self.alpha_max)
+        alpha_power = cp.clip(alpha_power, self.power_min, self.power_max)
+        # Stack (alpha, power) pairs on a new trailing axis for the LUT search.
+        input_data = cp.stack([alpha_coeff, alpha_power], axis=-1)
+        output = _map_parameters_search_gpu(
+            input_data,
+            self.look_up_table,
+            self.alpha_list,
+            self.power_list,
+            self.invalid_matrix,
+        )
+        # Split the last axis of the result into one array per relaxation-parameter key.
+        relaxation_param_dict = initialize_relaxation_param_dict(self.n_relaxation_mechanisms)
+        for i, key in enumerate(relaxation_param_dict.keys()):
+            relaxation_param_dict[key] = output[..., i]
+        return relaxation_param_dict
+
+    def _warn_out_of_range(self, alpha_coeff: NDArray, alpha_power: NDArray, *, xp: object) -> None:
+        """Log warnings if attenuation values are out of LUT range."""
+        # xp is the array module matching the inputs: np on the CPU path, cp on the GPU path.
+        # Values are only reported here; the callers clip them after this check.
+        if xp.any(alpha_coeff < self.alpha_min) or xp.any(alpha_power < self.power_min):
+            error_msg = (
+                "attenuation is out of range. "
+                "the out-of-range values will be clipped to the min value. "
+                f"alpha minimum: {self.alpha_min}, power minimum: {self.power_min}"
+            )
+            logger.warning(error_msg)
+        if xp.any(alpha_coeff > self.alpha_max) or xp.any(alpha_power > self.power_max):
+            error_msg = (
+                "attenuation is out of range. "
+                "the out-of-range values will be clipped to the max value. "
+                f"alpha maximum: {self.alpha_max}, power maximum: {self.power_max}"
+            )
+            logger.warning(error_msg)
diff --git a/tests/test_cupy_equivalence.py b/tests/test_cupy_equivalence.py
index 7a02b68..ec41bd2 100644
--- a/tests/test_cupy_equivalence.py
+++ b/tests/test_cupy_equivalence.py
@@ -12,6 +12,7 @@
 from fullwave.solver.input_file_writer import InputFileWriter
 from fullwave.solver.pml_builder import PMLBuilder, PMLBuilderExponentialAttenuation
 from fullwave.solver.utils import initialize_relaxation_param_dict
+from fullwave.utils.relaxation_parameters import RelaxationParametersGenerator
 
 # ---------------------------------------------------------------------------
 # Skip the entire module when CuPy / CUDA is unavailable
@@ -808,3 +809,78 @@ def test_run_identical(self, setup_2d):
                 rtol=1e-10,
                 err_msg=f"run() relaxation_param_dict_for_fw2[{key}] mismatch",
             )
+
+
+class TestRelaxationParametersGeneratorCupyEquivalence:
+    """Compare CPU (Numba) vs GPU (CuPy) for RelaxationParametersGenerator."""
+
+    @pytest.fixture()
+    def generator(self):
+        from pathlib import Path
+
+        db_path = (
+            Path(__file__).parent.parent
+            / "fullwave"
+            / "solver"
+            / "bins"
+            / "database"
+            / "relaxation_params_database_num_relax=2_20260113_0957.mat"
+        )
+        if not db_path.exists():
+            pytest.skip(f"LUT database not found at {db_path}")
+        return RelaxationParametersGenerator(n_relaxation_mechanisms=2, path_database=db_path)
+
+    def test_generate_identical_2d(self, generator):
+        """GPU generate must produce identical results to CPU generate for 2D arrays."""
+        rng = np.random.default_rng(123)
+        shape = (50, 60)
+        alpha_coeff = rng.uniform(generator.alpha_min, generator.alpha_max, size=shape)
+        alpha_power = rng.uniform(generator.power_min, generator.power_max, size=shape)
+
+        cpu_result = generator._generate_cpu(alpha_coeff, alpha_power)
+        gpu_result = generator._generate_gpu(cp.asarray(alpha_coeff), cp.asarray(alpha_power))
+
+        for key in cpu_result:
+            np.testing.assert_allclose(
+                _to_np(gpu_result[key]),
+                cpu_result[key],
+                rtol=1e-12,
+                err_msg=f"key={key} mismatch (2D)",
+            )
+
+    def test_generate_identical_3d(self, generator):
+        """GPU generate must produce identical results to CPU generate for 3D arrays."""
+        rng = np.random.default_rng(456)
+        shape = (20, 25, 15)
+        alpha_coeff = rng.uniform(generator.alpha_min, generator.alpha_max, size=shape)
+        alpha_power = rng.uniform(generator.power_min, generator.power_max, size=shape)
+
+        cpu_result = generator._generate_cpu(alpha_coeff, alpha_power)
+        gpu_result = generator._generate_gpu(cp.asarray(alpha_coeff), cp.asarray(alpha_power))
+
+        for key in cpu_result:
+            np.testing.assert_allclose(
+                _to_np(gpu_result[key]),
+                cpu_result[key],
+                rtol=1e-12,
+                err_msg=f"key={key} mismatch (3D)",
+            )
+
+    def test_generate_dispatches_correctly(self, generator):
+        """generate() dispatches to CPU for numpy, GPU for CuPy."""
+        rng = np.random.default_rng(789)
+        alpha_coeff = rng.uniform(generator.alpha_min, generator.alpha_max, size=(10, 10))
+        alpha_power = rng.uniform(generator.power_min, generator.power_max, size=(10, 10))
+
+        cpu_result = generator.generate(alpha_coeff, alpha_power)
+        gpu_result = generator.generate(cp.asarray(alpha_coeff), cp.asarray(alpha_power))
+
+        for key in cpu_result:
+            assert isinstance(cpu_result[key], np.ndarray), f"CPU result {key} should be numpy"
+            assert isinstance(gpu_result[key], cp.ndarray), f"GPU result {key} should be cupy"
+            np.testing.assert_allclose(
+                _to_np(gpu_result[key]),
+                cpu_result[key],
+                rtol=1e-12,
+                err_msg=f"key={key} dispatch mismatch",
+            )

From 1ae055f74950c5c18b53102f7ab179091af30a89 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 13:39:48 -0500
Subject: [PATCH 23/31] Enhance GPU memory management in PMLBuilder and
 PMLBuilderExponentialAttenuation; free original arrays after extending to
 prevent OOM issues.

---
 fullwave/solver/pml_builder.py | 74 ++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 7 deletions(-)

diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index 0c44d7f..39e3732 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -285,14 +285,24 @@ def __init__(  # noqa: PLR0912
         logger.debug("building extended medium for pml...")
         if isinstance(self.medium_org, fullwave.MediumRelaxationMaps):
             if self.xp is not np:
-                # GPU path: run sequentially to avoid CuPy multi-thread issues
+                import cupy as cp  # noqa: PLC0415
+
+                # GPU path: free each original array after extending to avoid OOM.
+                pool = cp.get_default_memory_pool()
                 extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
+                self.medium_org.sound_speed = cp.asnumpy(self.medium_org.sound_speed)
+                pool.free_all_blocks()
                 extended_density = self._extend_map_for_pml(self.medium_org.density)
+                self.medium_org.density = cp.asnumpy(self.medium_org.density)
+                pool.free_all_blocks()
                 extended_beta = self._extend_map_for_pml(self.medium_org.beta)
-                extended_relaxation_param_dict = {
-                    key: self._extend_map_for_pml(value)
-                    for key, value in self.medium_org.relaxation_param_dict.items()
-                }
+                self.medium_org.beta = cp.asnumpy(self.medium_org.beta)
+                pool.free_all_blocks()
+                extended_relaxation_param_dict = {}
+                for key, value in self.medium_org.relaxation_param_dict.items():
+                    extended_relaxation_param_dict[key] = self._extend_map_for_pml(value)
+                    self.medium_org.relaxation_param_dict[key] = cp.asnumpy(value)
+                    pool.free_all_blocks()
             else:
                 with concurrent.futures.ThreadPoolExecutor() as executor:
                     future_sound_speed = executor.submit(
@@ -333,12 +343,25 @@ def __init__(  # noqa: PLR0912
             )
         else:
             if self.xp is not np:
-                # GPU path: run sequentially to avoid CuPy multi-thread issues
+                import cupy as cp  # noqa: PLC0415
+
+                # GPU path: free each original array after extending to avoid OOM.
+                pool = cp.get_default_memory_pool()
                 extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
+                self.medium_org.sound_speed = cp.asnumpy(self.medium_org.sound_speed)
+                pool.free_all_blocks()
                 extended_density = self._extend_map_for_pml(self.medium_org.density)
+                self.medium_org.density = cp.asnumpy(self.medium_org.density)
+                pool.free_all_blocks()
                 extended_beta = self._extend_map_for_pml(self.medium_org.beta)
+                self.medium_org.beta = cp.asnumpy(self.medium_org.beta)
+                pool.free_all_blocks()
                 extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
+                self.medium_org.alpha_coeff = cp.asnumpy(self.medium_org.alpha_coeff)
+                pool.free_all_blocks()
                 extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
+                self.medium_org.alpha_power = cp.asnumpy(self.medium_org.alpha_power)
+                pool.free_all_blocks()
             else:
                 with concurrent.futures.ThreadPoolExecutor() as executor:
                     future_sound_speed = executor.submit(
@@ -1728,12 +1751,28 @@ def __init__(
 
         logger.debug("building extended medium for pml...")
         if self.xp is not np:
-            # GPU path: run sequentially to avoid CuPy multi-thread issues
+            import cupy as cp  # noqa: PLC0415
+
+            # GPU path: run sequentially to avoid CuPy multi-thread issues.
+            # Move each original array back to CPU after extending to free GPU
+            # memory and avoid OOM on large 3D grids where both the original
+            # (~N^3) and extended (~(N+2*pml)^3) arrays cannot fit simultaneously.
+            pool = cp.get_default_memory_pool()
             extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
+            self.medium_org.sound_speed = cp.asnumpy(self.medium_org.sound_speed)
+            pool.free_all_blocks()
             extended_density = self._extend_map_for_pml(self.medium_org.density)
+            self.medium_org.density = cp.asnumpy(self.medium_org.density)
+            pool.free_all_blocks()
             extended_beta = self._extend_map_for_pml(self.medium_org.beta)
+            self.medium_org.beta = cp.asnumpy(self.medium_org.beta)
+            pool.free_all_blocks()
             extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
+            self.medium_org.alpha_coeff = cp.asnumpy(self.medium_org.alpha_coeff)
+            pool.free_all_blocks()
             extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
+            self.medium_org.alpha_power = cp.asnumpy(self.medium_org.alpha_power)
+            pool.free_all_blocks()
         else:
             # CPU path: run in parallel for all medium properties since it is a bottleneck
             with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -1875,6 +1914,8 @@ def run(self, *, use_pml: bool = True) -> fullwave.MediumExponentialAttenuation:
             extended_medium: fullwave.MediumExponentialAttenuation = (
                 self.extended_medium.build_exponential()
             )
+            # Free the intermediate Medium's GPU arrays — only alpha_exp is needed
+            self._free_extended_medium_gpu()
             logger.debug("Extended medium for PML built successfully.")
             if self.is_3d:
                 logger.debug("Applying 3D PML to the extended medium...")
@@ -1891,9 +1932,28 @@ def run(self, *, use_pml: bool = True) -> fullwave.MediumExponentialAttenuation:
         extended_medium: fullwave.MediumExponentialAttenuation = (
             self.extended_medium.build_exponential()
         )
+        self._free_extended_medium_gpu()
         logger.debug("Extended medium built successfully without applying PML.")
         return extended_medium
 
+    def _free_extended_medium_gpu(self) -> None:
+        """Move extended_medium's arrays to host memory to release GPU memory.
+
+        Intended to run after build_exponential(): sound_speed, density, beta,
+        alpha_coeff and alpha_power are each replaced by a NumPy copy, then the
+        CuPy memory pool's unused blocks are freed. No-op when ``self.xp`` is NumPy.
+        """
+        if self.xp is np:
+            return
+        import cupy as cp  # noqa: PLC0415
+
+        medium = self.extended_medium
+        for attr in ("sound_speed", "density", "beta", "alpha_coeff", "alpha_power"):
+            val = getattr(medium, attr, None)
+            if val is not None and not isinstance(val, np.ndarray):
+                setattr(medium, attr, cp.asnumpy(val))
+        cp.get_default_memory_pool().free_all_blocks()
+
     def _mask_body_2d(self, nx: int, ny: int, n_body: int) -> NDArray[np.float32]:
         """Create a mask for the PML region.
 

From d0f866b05e3c904d130430849907a3ce2fc0b6f4 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 14:31:42 -0500
Subject: [PATCH 24/31] Refactor GPU array handling in PMLBuilder and Medium
 classes; implement cleanup for OOM scenarios and streamline array extension
 across CPU/GPU.

---
 fullwave/medium.py             | 102 ++++++++--
 fullwave/solver/pml_builder.py | 330 ++++++++++++++++++---------------
 2 files changed, 263 insertions(+), 169 deletions(-)

diff --git a/fullwave/medium.py b/fullwave/medium.py
index 878b3df..3985976 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -51,6 +51,19 @@ def _get_array_module(*, use_gpu: bool) -> ModuleType:
     return np
 
 
+def _cleanup_gpu_arrays(obj: object, attr_names: list[str]) -> None:
+    """Delete the named attributes from ``obj`` (when present) and free the CuPy memory pool."""
+    for attr in attr_names:
+        if hasattr(obj, attr):
+            delattr(obj, attr)
+    try:
+        import cupy as cp  # noqa: PLC0415
+
+        cp.get_default_memory_pool().free_all_blocks()
+    except ImportError:  # CuPy not importable: there is no pool to release
+        pass
+
+
 @dataclass
 class MediumRelaxationMaps:
     """Medium class for Fullwave."""
@@ -145,16 +158,38 @@ def __init__(
         self.n_relaxation_mechanisms = n_relaxation_mechanisms
         self.dtype = np.dtype(dtype)
         xp = self.xp
-        self.relaxation_param_dict = initialize_relaxation_param_dict(
-            n_relaxation_mechanisms=n_relaxation_mechanisms,
-            value=xp.zeros_like(xp.asarray(sound_speed), dtype=self.dtype),
-        )
-        self.grid = grid
-        self.is_3d = grid.is_3d
-
-        self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
-        self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
-        self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        try:
+            self.relaxation_param_dict = initialize_relaxation_param_dict(
+                n_relaxation_mechanisms=n_relaxation_mechanisms,
+                value=xp.zeros_like(xp.asarray(sound_speed), dtype=self.dtype),
+            )
+            self.grid = grid
+            self.is_3d = grid.is_3d
+
+            self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+            self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        except Exception:
+            if xp is np:
+                raise
+            logger.warning("GPU OOM in MediumRelaxationMaps.__init__. Falling back to CPU (numpy).")
+            _cleanup_gpu_arrays(self, ["sound_speed", "density", "beta"])
+            if hasattr(self, "relaxation_param_dict"):
+                del self.relaxation_param_dict
+            import cupy as cp  # noqa: PLC0415
+
+            cp.get_default_memory_pool().free_all_blocks()
+            self.xp = np
+            xp = np
+            self.relaxation_param_dict = initialize_relaxation_param_dict(
+                n_relaxation_mechanisms=n_relaxation_mechanisms,
+                value=np.zeros_like(np.asarray(sound_speed), dtype=self.dtype),
+            )
+            self.grid = grid
+            self.is_3d = grid.is_3d
+            self.sound_speed = np.atleast_2d(np.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = np.atleast_2d(np.asarray(density)).astype(self.dtype, copy=False)
+            self.beta = np.atleast_2d(np.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
@@ -758,10 +793,24 @@ def __init__(
         self.dtype = np.dtype(dtype)
 
         xp = self.xp
-        self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
-        self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
-        self.alpha_exp = xp.atleast_2d(xp.asarray(alpha_exp)).astype(self.dtype, copy=False)
-        self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        try:
+            self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+            self.alpha_exp = xp.atleast_2d(xp.asarray(alpha_exp)).astype(self.dtype, copy=False)
+            self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        except Exception:
+            if xp is np:
+                raise
+            logger.warning(
+                "GPU OOM in MediumExponentialAttenuation.__init__. Falling back to CPU (numpy)."
+            )
+            _cleanup_gpu_arrays(self, ["sound_speed", "density", "alpha_exp", "beta"])
+            self.xp = np
+            xp = np
+            self.sound_speed = np.atleast_2d(np.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = np.atleast_2d(np.asarray(density)).astype(self.dtype, copy=False)
+            self.alpha_exp = np.atleast_2d(np.asarray(alpha_exp)).astype(self.dtype, copy=False)
+            self.beta = np.atleast_2d(np.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
@@ -1036,11 +1085,26 @@ def __init__(
         self.dtype = np.dtype(dtype)
 
         xp = self.xp
-        self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
-        self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
-        self.alpha_coeff = xp.atleast_2d(xp.asarray(alpha_coeff)).astype(self.dtype, copy=False)
-        self.alpha_power = xp.atleast_2d(xp.asarray(alpha_power)).astype(self.dtype, copy=False)
-        self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        try:
+            self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
+            self.alpha_coeff = xp.atleast_2d(xp.asarray(alpha_coeff)).astype(self.dtype, copy=False)
+            self.alpha_power = xp.atleast_2d(xp.asarray(alpha_power)).astype(self.dtype, copy=False)
+            self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+        except Exception:
+            if xp is np:
+                raise
+            logger.warning("GPU OOM in Medium.__init__. Falling back to CPU (numpy).")
+            _cleanup_gpu_arrays(
+                self, ["sound_speed", "density", "alpha_coeff", "alpha_power", "beta"]
+            )
+            self.xp = np
+            xp = np
+            self.sound_speed = np.atleast_2d(np.asarray(sound_speed)).astype(self.dtype, copy=False)
+            self.density = np.atleast_2d(np.asarray(density)).astype(self.dtype, copy=False)
+            self.alpha_coeff = np.atleast_2d(np.asarray(alpha_coeff)).astype(self.dtype, copy=False)
+            self.alpha_power = np.atleast_2d(np.asarray(alpha_power)).astype(self.dtype, copy=False)
+            self.beta = np.atleast_2d(np.asarray(beta)).astype(self.dtype, copy=False)
 
         if air_coords is not None:
             if air_map is not None:
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index 39e3732..b799843 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -284,56 +284,36 @@ def __init__(  # noqa: PLR0912
 
         logger.debug("building extended medium for pml...")
         if isinstance(self.medium_org, fullwave.MediumRelaxationMaps):
+            base_attrs = ["sound_speed", "density", "beta"]
+            relax_attrs = list(self.medium_org.relaxation_param_dict.keys())
+
             if self.xp is not np:
-                import cupy as cp  # noqa: PLC0415
-
-                # GPU path: free each original array after extending to avoid OOM.
-                pool = cp.get_default_memory_pool()
-                extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
-                self.medium_org.sound_speed = cp.asnumpy(self.medium_org.sound_speed)
-                pool.free_all_blocks()
-                extended_density = self._extend_map_for_pml(self.medium_org.density)
-                self.medium_org.density = cp.asnumpy(self.medium_org.density)
-                pool.free_all_blocks()
-                extended_beta = self._extend_map_for_pml(self.medium_org.beta)
-                self.medium_org.beta = cp.asnumpy(self.medium_org.beta)
-                pool.free_all_blocks()
-                extended_relaxation_param_dict = {}
-                for key, value in self.medium_org.relaxation_param_dict.items():
-                    extended_relaxation_param_dict[key] = self._extend_map_for_pml(value)
-                    self.medium_org.relaxation_param_dict[key] = cp.asnumpy(value)
-                    pool.free_all_blocks()
+                # Import once, before the loop: `cp` is also used after it and must be
+                # bound even when relaxation_param_dict has no keys.
+                import cupy as cp  # noqa: PLC0415
+
+                # Move medium + relaxation arrays to CPU, then extend via _extend_arrays_gpu
+                named_arrays = self._ensure_numpy_medium_arrays(base_attrs)
+                for key in relax_attrs:
+                    val = self.medium_org.relaxation_param_dict[key]
+                    if not isinstance(val, np.ndarray):
+                        val = cp.asnumpy(val)
+                        self.medium_org.relaxation_param_dict[key] = val
+                    named_arrays.append((key, val))
+                cp.get_default_memory_pool().free_all_blocks()
+                extended = self._extend_arrays_gpu(named_arrays)
             else:
-                with concurrent.futures.ThreadPoolExecutor() as executor:
-                    future_sound_speed = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.sound_speed,
-                    )
-                    future_density = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.density,
-                    )
-                    future_beta = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.beta,
-                    )
-                    future_relaxation_param_dict = {
-                        key: executor.submit(self._extend_map_for_pml, value)
-                        for key, value in self.medium_org.relaxation_param_dict.items()
-                    }
-
-                    extended_sound_speed = future_sound_speed.result()
-                    extended_density = future_density.result()
-                    extended_beta = future_beta.result()
-                    extended_relaxation_param_dict = {
-                        key: future.result() for key, future in future_relaxation_param_dict.items()
-                    }
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in base_attrs] + [
+                    (key, self.medium_org.relaxation_param_dict[key]) for key in relax_attrs
+                ]
+                extended = self._extend_arrays_cpu(named_arrays)
 
+            extended_relaxation_param_dict = {key: extended[key] for key in relax_attrs}
             self.extended_medium = fullwave.MediumRelaxationMaps(
                 grid=self.extended_grid,
-                sound_speed=extended_sound_speed,
-                density=extended_density,
-                beta=extended_beta,
+                sound_speed=extended["sound_speed"],
+                density=extended["density"],
+                beta=extended["beta"],
                 relaxation_param_dict=extended_relaxation_param_dict,
                 air_coords=self.medium_org.air_coords + self.num_boundary_points,
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
@@ -342,61 +322,21 @@ def __init__(  # noqa: PLR0912
                 use_gpu=self.use_gpu,
             )
         else:
+            attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
             if self.xp is not np:
-                import cupy as cp  # noqa: PLC0415
-
-                # GPU path: free each original array after extending to avoid OOM.
-                pool = cp.get_default_memory_pool()
-                extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
-                self.medium_org.sound_speed = cp.asnumpy(self.medium_org.sound_speed)
-                pool.free_all_blocks()
-                extended_density = self._extend_map_for_pml(self.medium_org.density)
-                self.medium_org.density = cp.asnumpy(self.medium_org.density)
-                pool.free_all_blocks()
-                extended_beta = self._extend_map_for_pml(self.medium_org.beta)
-                self.medium_org.beta = cp.asnumpy(self.medium_org.beta)
-                pool.free_all_blocks()
-                extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
-                self.medium_org.alpha_coeff = cp.asnumpy(self.medium_org.alpha_coeff)
-                pool.free_all_blocks()
-                extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
-                self.medium_org.alpha_power = cp.asnumpy(self.medium_org.alpha_power)
-                pool.free_all_blocks()
+                named_arrays = self._ensure_numpy_medium_arrays(attr_names)
+                extended = self._extend_arrays_gpu(named_arrays)
             else:
-                with concurrent.futures.ThreadPoolExecutor() as executor:
-                    future_sound_speed = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.sound_speed,
-                    )
-                    future_density = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.density,
-                    )
-                    future_beta = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.beta,
-                    )
-                    future_alpha_coeff = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.alpha_coeff,
-                    )
-                    future_alpha_power = executor.submit(
-                        self._extend_map_for_pml,
-                        self.medium_org.alpha_power,
-                    )
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
+                extended = self._extend_arrays_cpu(named_arrays)
 
-                    extended_sound_speed = future_sound_speed.result()
-                    extended_density = future_density.result()
-                    extended_beta = future_beta.result()
-                    extended_alpha_coeff = future_alpha_coeff.result()
-                    extended_alpha_power = future_alpha_power.result()
             self.extended_medium = fullwave.Medium(
                 grid=self.extended_grid,
-                sound_speed=extended_sound_speed,
-                density=extended_density,
-                beta=extended_beta,
-                alpha_coeff=extended_alpha_coeff,
-                alpha_power=extended_alpha_power,
+                sound_speed=extended["sound_speed"],
+                density=extended["density"],
+                beta=extended["beta"],
+                alpha_coeff=extended["alpha_coeff"],
+                alpha_power=extended["alpha_power"],
                 air_coords=self.medium_org.air_coords + self.num_boundary_points,
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
                 path_relaxation_parameters_database=self.medium_org.path_relaxation_parameters_database,
@@ -617,6 +557,141 @@ def _extend_map_for_pml(
 
         return output
 
+    def _extend_arrays_gpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+    ) -> dict[str, NDArray]:
+        """Extend medium arrays using multi-GPU, single-GPU, or CPU fallback.
+
+        Parameters
+        ----------
+        named_arrays : list of (name, numpy_array) pairs
+            Arrays to extend. Must be numpy arrays (CPU).
+
+        Returns
+        -------
+        dict[str, NDArray]
+            Extended arrays as numpy arrays, keyed by name.
+
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        n_gpus = cp.cuda.runtime.getDeviceCount()
+        logger.info("CUDA devices available: %d", n_gpus)
+
+        # Strategy 1: Multi-GPU parallel
+        if n_gpus > 1:
+            try:
+                logger.info(
+                    "Extending %d medium arrays using %d GPUs (of %d available).",
+                    len(named_arrays),
+                    min(n_gpus, len(named_arrays)),
+                    n_gpus,
+                )
+                return self._extend_arrays_multi_gpu(named_arrays, n_gpus)
+            except Exception:
+                msg = "Multi-GPU extension failed. Falling back to sequential single-GPU."
+                logger.warning(msg, exc_info=True)
+
+        # Strategy 2: Sequential single-GPU (extend one, free, repeat)
+        try:
+            logger.info("Extending %d medium arrays sequentially on GPU 0.", len(named_arrays))
+            return self._extend_arrays_sequential_gpu(named_arrays)
+        except Exception:
+            logger.warning("Single-GPU extension failed. Falling back to CPU.", exc_info=True)
+
+        # Strategy 3: CPU
+        logger.info("Extending %d medium arrays on CPU.", len(named_arrays))
+        return self._extend_arrays_cpu(named_arrays)
+
+    def _extend_arrays_multi_gpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+        n_gpus: int,
+    ) -> dict[str, NDArray]:
+        """Extend arrays in parallel, array i on CUDA device ``i % n_gpus``.
+
+        Each worker thread enters its device context, runs ``_extend_map_for_pml``,
+        copies the result to host memory and returns it together with the array name.
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        def extend_on_device(
+            args: tuple[str, NDArray, int],
+        ) -> tuple[str, NDArray]:
+            name, arr_np, device_id = args
+            with cp.cuda.Device(device_id):
+                result_gpu = self._extend_map_for_pml(arr_np)
+                result_np = cp.asnumpy(result_gpu)
+                del result_gpu
+                cp.get_default_memory_pool().free_all_blocks()
+                return name, result_np
+
+        items = [(name, arr, i % n_gpus) for i, (name, arr) in enumerate(named_arrays)]
+        # At most one worker thread per device; results are gathered in completion order.
+        results = {}
+        with concurrent.futures.ThreadPoolExecutor(max_workers=min(n_gpus, len(items))) as executor:
+            futures = [executor.submit(extend_on_device, item) for item in items]
+            for future in concurrent.futures.as_completed(futures):
+                name, result = future.result()
+                results[name] = result
+        return results
+
+    def _extend_arrays_sequential_gpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+    ) -> dict[str, NDArray]:
+        """Extend arrays one by one, copying each result to host and freeing pooled GPU memory."""
+        import cupy as cp  # noqa: PLC0415
+
+        results = {}
+        pool = cp.get_default_memory_pool()
+        for name, arr_np in named_arrays:
+            result_gpu = self._extend_map_for_pml(arr_np)
+            results[name] = cp.asnumpy(result_gpu)  # keep only the host copy
+            del result_gpu  # drop our reference before asking the pool to release blocks
+            pool.free_all_blocks()
+        return results
+
+    def _extend_arrays_cpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+    ) -> dict[str, NDArray]:
+        """Extend arrays concurrently in a ThreadPoolExecutor with ``self.xp`` set to numpy."""
+        orig_xp = self.xp
+        self.xp = np  # temporarily switch the backend; restored in ``finally``
+        try:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                futures = {
+                    name: executor.submit(self._extend_map_for_pml, arr)
+                    for name, arr in named_arrays
+                }
+                return {name: future.result() for name, future in futures.items()}
+        finally:
+            self.xp = orig_xp
+
+    def _ensure_numpy_medium_arrays(
+        self,
+        attr_names: list[str],
+    ) -> list[tuple[str, NDArray]]:
+        """Replace non-numpy ``medium_org`` attributes with host copies and free the CuPy pool.
+
+        Returns list of (name, numpy_array) pairs in ``attr_names`` order.
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        named_arrays = []
+        for name in attr_names:
+            arr = getattr(self.medium_org, name)
+            if not isinstance(arr, np.ndarray):
+                arr_np = cp.asnumpy(arr)
+                setattr(self.medium_org, name, arr_np)  # rebind to the host copy
+            else:
+                arr_np = arr
+            named_arrays.append((name, arr_np))
+        cp.get_default_memory_pool().free_all_blocks()
+        return named_arrays
+
     def _localize_pml_region(self) -> tuple[NDArray[np.float64], ...]:
         if self.is_3d:
             n_x_extended, n_y_extended, n_z_extended = self.extended_medium.sound_speed.shape
@@ -1750,66 +1825,21 @@ def __init__(
         )
 
         logger.debug("building extended medium for pml...")
+        attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
         if self.xp is not np:
-            import cupy as cp  # noqa: PLC0415
-
-            # GPU path: run sequentially to avoid CuPy multi-thread issues.
-            # Move each original array back to CPU after extending to free GPU
-            # memory and avoid OOM on large 3D grids where both the original
-            # (~N^3) and extended (~(N+2*pml)^3) arrays cannot fit simultaneously.
-            pool = cp.get_default_memory_pool()
-            extended_sound_speed = self._extend_map_for_pml(self.medium_org.sound_speed)
-            self.medium_org.sound_speed = cp.asnumpy(self.medium_org.sound_speed)
-            pool.free_all_blocks()
-            extended_density = self._extend_map_for_pml(self.medium_org.density)
-            self.medium_org.density = cp.asnumpy(self.medium_org.density)
-            pool.free_all_blocks()
-            extended_beta = self._extend_map_for_pml(self.medium_org.beta)
-            self.medium_org.beta = cp.asnumpy(self.medium_org.beta)
-            pool.free_all_blocks()
-            extended_alpha_coeff = self._extend_map_for_pml(self.medium_org.alpha_coeff)
-            self.medium_org.alpha_coeff = cp.asnumpy(self.medium_org.alpha_coeff)
-            pool.free_all_blocks()
-            extended_alpha_power = self._extend_map_for_pml(self.medium_org.alpha_power)
-            self.medium_org.alpha_power = cp.asnumpy(self.medium_org.alpha_power)
-            pool.free_all_blocks()
+            named_arrays = self._ensure_numpy_medium_arrays(attr_names)
+            extended = self._extend_arrays_gpu(named_arrays)
         else:
-            # CPU path: run in parallel for all medium properties since it is a bottleneck
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                future_sound_speed = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.sound_speed,
-                )
-                future_density = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.density,
-                )
-                future_beta = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.beta,
-                )
-                future_alpha_coeff = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_coeff,
-                )
-                future_alpha_power = executor.submit(
-                    self._extend_map_for_pml,
-                    self.medium_org.alpha_power,
-                )
-
-                extended_sound_speed = future_sound_speed.result()
-                extended_density = future_density.result()
-                extended_beta = future_beta.result()
-                extended_alpha_coeff = future_alpha_coeff.result()
-                extended_alpha_power = future_alpha_power.result()
+            named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
+            extended = self._extend_arrays_cpu(named_arrays)
 
         self.extended_medium = fullwave.Medium(
             grid=self.extended_grid,
-            sound_speed=extended_sound_speed,
-            density=extended_density,
-            beta=extended_beta,
-            alpha_coeff=extended_alpha_coeff,
-            alpha_power=extended_alpha_power,
+            sound_speed=extended["sound_speed"],
+            density=extended["density"],
+            beta=extended["beta"],
+            alpha_coeff=extended["alpha_coeff"],
+            alpha_power=extended["alpha_power"],
             air_coords=self.medium_org.air_coords + self.num_boundary_points,
             n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
             path_relaxation_parameters_database=self.medium_org.path_relaxation_parameters_database,

From 7eabcb65638053b704db8ae49f68feac64a24ca4 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 15:22:49 -0500
Subject: [PATCH 25/31] Implement GPU support for bulk modulus computation in
 InputFileWriter; add tests for precomputed bulk modulus equivalence between
 CPU and GPU.

---
 fullwave/solver/input_file_writer.py | 160 ++++++++++++++++++++++++---
 fullwave/solver/pml_builder.py       |  20 +++-
 tests/test_cupy_equivalence.py       |  40 +++++++
 3 files changed, 200 insertions(+), 20 deletions(-)

diff --git a/fullwave/solver/input_file_writer.py b/fullwave/solver/input_file_writer.py
index ac82b7a..67a4034 100644
--- a/fullwave/solver/input_file_writer.py
+++ b/fullwave/solver/input_file_writer.py
@@ -108,25 +108,20 @@ def __init__(
         self.release_after_write = release_after_write
         self.pml_thickness = pml_thickness
 
+        self._precomputed_bulk_modulus: NDArray[np.float32] | None = None
+
         if self.use_gpu:
             try:
                 import cupy as cp  # noqa: PLC0415
 
-                # sound_speed may already be CuPy — asarray is a no-op in that case
-                c_gpu = cp.asarray(self.medium.sound_speed, dtype=cp.float64)
-                c_min_val = float(c_gpu.min())
-                c_max_val = float(c_gpu.max())
-                self._dim = int(cp.rint(cp.float64(c_max_val)) - cp.rint(cp.float64(c_min_val)))
-
-                # Compute dc_map on GPU (use a copy to avoid mutating medium data)
-                c_tmp = c_gpu.copy()
-                c_min_rounded = float(matlab_round(c_min_val))
-                offset = -c_min_rounded + 1
-                c_tmp += 1e-9
-                cp.rint(c_tmp, out=c_tmp)
-                c_tmp += offset
-                self._dc_map = cp.asnumpy(c_tmp.astype(cp.int32))
-                logger.debug("dc map for stencil coefficients set (GPU, fused).")
+                n_gpus = cp.cuda.runtime.getDeviceCount()
+                if n_gpus > 1 and self.medium.sound_speed.ndim == 3:
+                    c_min_val, c_max_val = self._compute_dc_map_and_bulk_modulus_multi_gpu(
+                        n_gpus,
+                    )
+                else:
+                    c_min_val, c_max_val = self._compute_dc_map_and_bulk_modulus_single_gpu()
+                self._dim = int(round(c_max_val) - round(c_min_val))
                 self._dc_map_ready = True
             except ImportError:
                 self._dim = int(
@@ -768,6 +763,127 @@ def _process_chunk(start: int) -> None:
 
     # --- batch write utils ---
 
+    def _compute_dc_map_and_bulk_modulus_single_gpu(self) -> tuple[float, float]:
+        """Compute dc_map and bulk_modulus on a single GPU and store them on ``self``.
+
+        Returns
+        -------
+        tuple[float, float]
+            (c_min_val, c_max_val) of the sound speed.
+
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        c_gpu = cp.asarray(self.medium.sound_speed, dtype=cp.float64)
+        c_min_val = float(c_gpu.min())
+        c_max_val = float(c_gpu.max())
+
+        # dc_map: rint(sound_speed + 1e-9) - matlab_round(c_min) + 1, stored as int32
+        c_tmp = c_gpu.copy()  # work on a copy so the in-place ops below leave c_gpu untouched
+        c_min_rounded = float(matlab_round(c_min_val))
+        offset = -c_min_rounded + 1
+        c_tmp += 1e-9
+        cp.rint(c_tmp, out=c_tmp)
+        c_tmp += offset
+        self._dc_map = cp.asnumpy(c_tmp.astype(cp.int32))
+        del c_tmp, c_gpu
+
+        # bulk_modulus: sound_speed^2 * density, evaluated in float32 and copied to host
+        c_f32 = cp.asarray(self.medium.sound_speed, dtype=cp.float32)
+        rho_f32 = cp.asarray(self.medium.density, dtype=cp.float32)
+        bulk = cp.multiply(c_f32 * c_f32, rho_f32)
+        self._precomputed_bulk_modulus = cp.asnumpy(bulk)
+        del c_f32, rho_f32, bulk
+        cp.get_default_memory_pool().free_all_blocks()
+
+        logger.debug("dc map and bulk modulus set (single GPU).")
+        return c_min_val, c_max_val
+
+    def _compute_dc_map_and_bulk_modulus_multi_gpu(
+        self,
+        n_gpus: int,
+    ) -> tuple[float, float]:
+        """Compute dc_map and bulk_modulus in parallel across multiple GPUs.
+
+        Each worker handles one contiguous, non-empty range of axis-0 slabs on its own GPU.
+
+        Returns
+        -------
+        tuple[float, float]
+            (c_min_val, c_max_val) of the sound speed.
+
+        """
+        from concurrent.futures import ThreadPoolExecutor, as_completed  # noqa: PLC0415
+
+        import cupy as cp  # noqa: PLC0415
+
+        sound_speed_np = self.medium.sound_speed
+        density_np = self.medium.density
+
+        # Ensure numpy for slicing
+        if not isinstance(sound_speed_np, np.ndarray):
+            sound_speed_np = cp.asnumpy(sound_speed_np)
+        if not isinstance(density_np, np.ndarray):
+            density_np = cp.asnumpy(density_np)
+
+        n_slabs = sound_speed_np.shape[0]
+        chunk_size = -(-n_slabs // min(n_gpus, n_slabs))  # ceil division
+        n_workers = -(-n_slabs // chunk_size)  # skip workers whose chunk would be empty
+
+        logger.info(
+            "Computing dc_map and bulk_modulus using %d GPUs (of %d available).",
+            n_workers,
+            n_gpus,
+        )
+
+        # Phase 1+2: Each GPU computes local min/max, dc_map chunk, bulk_modulus chunk
+        dc_map_result = np.empty(sound_speed_np.shape, dtype=np.int32)
+        bulk_result = np.empty(sound_speed_np.shape, dtype=np.float32)
+        local_mins = np.empty(n_workers, dtype=np.float64)
+        local_maxs = np.empty(n_workers, dtype=np.float64)
+
+        def _process_on_device(worker_id: int, device_id: int) -> None:
+            start = worker_id * chunk_size
+            end = min(start + chunk_size, n_slabs)
+            with cp.cuda.Device(device_id):
+                c_chunk = cp.asarray(sound_speed_np[start:end], dtype=cp.float64)
+                local_mins[worker_id] = float(c_chunk.min())
+                local_maxs[worker_id] = float(c_chunk.max())
+
+                # dc_map for this chunk (offset applied after global min is known)
+                c_chunk += 1e-9
+                cp.rint(c_chunk, out=c_chunk)
+                dc_chunk_f64 = c_chunk  # reuse buffer
+                dc_map_result[start:end] = cp.asnumpy(dc_chunk_f64.astype(cp.int32))
+                del dc_chunk_f64
+
+                # bulk_modulus for this chunk
+                c_f32 = cp.asarray(sound_speed_np[start:end], dtype=cp.float32)
+                rho_f32 = cp.asarray(density_np[start:end], dtype=cp.float32)
+                bulk_chunk = cp.multiply(c_f32 * c_f32, rho_f32)
+                bulk_result[start:end] = cp.asnumpy(bulk_chunk)
+                del c_f32, rho_f32, bulk_chunk
+                cp.get_default_memory_pool().free_all_blocks()
+
+        with ThreadPoolExecutor(max_workers=n_workers) as executor:
+            futures = [executor.submit(_process_on_device, i, i % n_gpus) for i in range(n_workers)]
+            for future in as_completed(futures):
+                future.result()
+
+        c_min_val = float(local_mins.min())
+        c_max_val = float(local_maxs.max())
+
+        # Apply global offset to dc_map
+        c_min_rounded = int(matlab_round(c_min_val))
+        offset = np.int32(-c_min_rounded + 1)
+        dc_map_result += offset
+
+        self._dc_map = dc_map_result
+        self._precomputed_bulk_modulus = bulk_result
+
+        logger.debug("dc map and bulk modulus set (multi-GPU, %d devices).", n_workers)
+        return c_min_val, c_max_val
+
     def _init_pending_writes(self) -> None:
         """Initialize the pending writes list for batch I/O."""
         self._pending_writes: list[tuple] = []
@@ -872,10 +988,15 @@ def _save_variables_into_dat_file(
         relaxation_param_map_dict_for_fw2: dict[str, NDArray[np.float64]],
         dim: int,
     ) -> None:
+        k_map = (
+            self._precomputed_bulk_modulus
+            if self._precomputed_bulk_modulus is not None
+            else self.medium.bulk_modulus
+        )
         self._save_maps(
             simulation_dir,
             c_map=self.medium.sound_speed,
-            k_map=self.medium.bulk_modulus,
+            k_map=k_map,
             rho_map=self.medium.density,
             beta_map=self.medium.beta,
         )
@@ -938,10 +1059,15 @@ def _save_variables_into_dat_file_exponential_attenuation(
         simulation_dir: Path,
         dim: int,
     ) -> None:
+        k_map = (
+            self._precomputed_bulk_modulus
+            if self._precomputed_bulk_modulus is not None
+            else self.medium.bulk_modulus
+        )
         self._save_maps(
             simulation_dir,
             c_map=self.medium.sound_speed,
-            k_map=self.medium.bulk_modulus,
+            k_map=k_map,
             rho_map=self.medium.density,
             beta_map=self.medium.beta,
             alpha_exp_map=self.medium.alpha_exp,
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index b799843..b2fbc9f 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -309,6 +309,8 @@ def __init__(  # noqa: PLR0912
                 extended = self._extend_arrays_cpu(named_arrays)
 
             extended_relaxation_param_dict = {key: extended[key] for key in relax_attrs}
+            # Extended arrays are numpy — skip re-upload to GPU to avoid
+            # wasting PCIe bandwidth. CPU numexpr handles subsequent computation.
             self.extended_medium = fullwave.MediumRelaxationMaps(
                 grid=self.extended_grid,
                 sound_speed=extended["sound_speed"],
@@ -319,7 +321,7 @@ def __init__(  # noqa: PLR0912
                 n_relaxation_mechanisms=self.medium_org.n_relaxation_mechanisms,
                 n_jobs=self.medium_org.n_jobs,
                 dtype=getattr(self.medium_org, "dtype", np.float64),
-                use_gpu=self.use_gpu,
+                use_gpu=False,
             )
         else:
             attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
@@ -330,6 +332,8 @@ def __init__(  # noqa: PLR0912
                 named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
                 extended = self._extend_arrays_cpu(named_arrays)
 
+            # Extended arrays are numpy — skip re-upload to GPU to avoid
+            # wasting PCIe bandwidth. CPU numexpr handles subsequent computation.
             self.extended_medium = fullwave.Medium(
                 grid=self.extended_grid,
                 sound_speed=extended["sound_speed"],
@@ -343,7 +347,7 @@ def __init__(  # noqa: PLR0912
                 attenuation_builder=self.medium_org.attenuation_builder,
                 n_jobs=self.medium_org.n_jobs,
                 dtype=getattr(self.medium_org, "dtype", np.float64),
-                use_gpu=self.use_gpu,
+                use_gpu=False,
             )
         logger.debug("building extended medium for pml...done")
 
@@ -1833,6 +1837,8 @@ def __init__(
             named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
             extended = self._extend_arrays_cpu(named_arrays)
 
+        # Extended arrays are numpy — skip re-upload to GPU to avoid
+        # wasting PCIe bandwidth. CPU numexpr handles subsequent computation.
         self.extended_medium = fullwave.Medium(
             grid=self.extended_grid,
             sound_speed=extended["sound_speed"],
@@ -1845,7 +1851,7 @@ def __init__(
             path_relaxation_parameters_database=self.medium_org.path_relaxation_parameters_database,
             attenuation_builder=self.medium_org.attenuation_builder,
             dtype=getattr(self.medium_org, "dtype", np.float64),
-            use_gpu=self.use_gpu,
+            use_gpu=False,
         )
         logger.debug("Extended medium for PML built successfully.")
 
@@ -2101,6 +2107,10 @@ def _apply_pml_3d(
             nz=extended_medium.alpha_exp.shape[2],
             n_body=self.num_boundary_points,
         )
+        if isinstance(extended_medium.alpha_exp, np.ndarray) and not isinstance(a_mask, np.ndarray):
+            import cupy as cp  # noqa: PLC0415
+
+            a_mask = cp.asnumpy(a_mask)
         extended_medium.alpha_exp *= a_mask
         return extended_medium
 
@@ -2113,5 +2123,9 @@ def _apply_pml_2d(
             ny=extended_medium.alpha_exp.shape[1],
             n_body=self.num_boundary_points,
         )
+        if isinstance(extended_medium.alpha_exp, np.ndarray) and not isinstance(a_mask, np.ndarray):
+            import cupy as cp  # noqa: PLC0415
+
+            a_mask = cp.asnumpy(a_mask)
         extended_medium.alpha_exp *= a_mask
         return extended_medium
diff --git a/tests/test_cupy_equivalence.py b/tests/test_cupy_equivalence.py
index ec41bd2..ac64f54 100644
--- a/tests/test_cupy_equivalence.py
+++ b/tests/test_cupy_equivalence.py
@@ -712,6 +712,46 @@ def test_dc_map(self, setup_2d, tmp_path):
         )
         np.testing.assert_array_equal(_to_np(gpu_writer._dc_map), cpu_writer._dc_map)
 
+    def test_precomputed_bulk_modulus(self, setup_2d, tmp_path):
+        import fullwave
+
+        grid, medium = setup_2d
+
+        src_coords = np.array([[grid.nx // 2, grid.ny // 2]])
+        source = fullwave.Source(
+            p0=np.ones((1, 10)),
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+        sensor = fullwave.Sensor(
+            coords=src_coords,
+            grid_shape=(grid.nx, grid.ny),
+        )
+
+        gpu_writer = InputFileWriter(
+            work_dir=tmp_path / "gpu",
+            grid=grid,
+            medium=medium,
+            source=source,
+            sensor=sensor,
+            validate_input=False,
+            use_exponential_attenuation=True,
+            use_gpu=True,
+        )
+        # GPU path should precompute bulk_modulus
+        assert gpu_writer._precomputed_bulk_modulus is not None
+
+        # CPU reference recomputed with NumPy in float32: sound_speed**2 * density
+        expected = np.multiply(
+            medium.sound_speed.astype(np.float32) ** 2,
+            medium.density.astype(np.float32),
+        )
+        np.testing.assert_allclose(
+            gpu_writer._precomputed_bulk_modulus,
+            expected,
+            rtol=1e-5,
+        )
+
 
 class TestPMLBuilderRelaxationCupyEquivalence:
     """Compare CPU vs GPU for PMLBuilder (multiple relaxation path)."""

From cb699a2af2737ee1b9e5dbd9a538ed87b129b30d Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 17:35:21 -0500
Subject: [PATCH 26/31] Implement multi-GPU support for array uploads in
 PMLBuilder and Medium classes; enhance memory management and streamline data
 handling.

---
 fullwave/medium.py             | 145 +++++++++++++++++++++++++++------
 fullwave/solver/pml_builder.py |  35 +++++---
 2 files changed, 143 insertions(+), 37 deletions(-)

diff --git a/fullwave/medium.py b/fullwave/medium.py
index 3985976..d1a0b43 100644
--- a/fullwave/medium.py
+++ b/fullwave/medium.py
@@ -64,6 +64,98 @@ def _cleanup_gpu_arrays(obj: object, attr_names: list[str]) -> None:
         pass
 
 
+def _upload_arrays_multi_gpu(
+    named_arrays: list[tuple[str, np.ndarray]],
+    dtype: np.dtype,
+    target_device: int = 0,
+) -> dict[str, object]:
+    """Upload numpy arrays to *target_device*, staging them on multiple GPUs in parallel.
+
+    Array *i* is first transferred to GPU ``i % n_gpus`` in its own worker thread,
+    then copied to *target_device* with ``cp.array`` if it was staged elsewhere.
+
+    Parameters
+    ----------
+    named_arrays
+        List of (name, numpy_array) pairs to upload.
+    dtype
+        Target CuPy dtype for the arrays.
+    target_device
+        CUDA device ID where all arrays should end up (default 0).
+
+    Returns
+    -------
+    dict mapping name -> CuPy array on *target_device*.
+
+    """
+    from concurrent.futures import as_completed  # noqa: PLC0415
+
+    import cupy as cp  # noqa: PLC0415
+
+    n_gpus = cp.cuda.runtime.getDeviceCount()
+    n_arrays = len(named_arrays)
+    n_workers = min(n_gpus, n_arrays)
+    results: dict[str, object] = {}
+
+    logger.info(
+        "Uploading %d arrays to GPU using %d devices (of %d available).",
+        n_arrays,
+        n_workers,
+        n_gpus,
+    )
+
+    def _upload_one(name: str, arr_np: np.ndarray, device_id: int) -> tuple[str, object]:
+        with cp.cuda.Device(device_id):
+            gpu_arr = cp.asarray(arr_np, dtype=dtype)  # host -> staging device
+            if device_id != target_device:
+                with cp.cuda.Device(target_device):
+                    result = cp.array(gpu_arr)  # copy allocated on the target device
+                del gpu_arr
+                cp.get_default_memory_pool().free_all_blocks()
+                return name, result
+            return name, gpu_arr
+
+    with ThreadPoolExecutor(max_workers=n_workers) as executor:  # NOTE(review): module-level import
+        futures = [
+            executor.submit(_upload_one, name, arr, i % n_gpus)
+            for i, (name, arr) in enumerate(named_arrays)
+        ]
+        for future in as_completed(futures):
+            name, result = future.result()
+            results[name] = result
+
+    return results
+
+
+def _upload_or_convert_arrays(
+    xp: object,
+    dtype: np.dtype,
+    named_arrays: list[tuple[str, np.ndarray]],
+) -> dict[str, object]:
+    """Upload arrays to GPU (multi-GPU if available) or convert with numpy/cupy.
+
+    Returns dict mapping name -> array on *xp*. GPU-resident inputs bypass the multi-GPU upload.
+    """
+    if xp is np:
+        return {
+            name: np.atleast_2d(np.asarray(arr)).astype(dtype, copy=False)
+            for name, arr in named_arrays
+        }
+
+    # np.asarray() rejects CuPy arrays, so inspect ndim / device residency without converting.
+    on_host = not any(hasattr(arr, "__cuda_array_interface__") for _, arr in named_arrays)
+    if on_host and named_arrays and np.ndim(named_arrays[0][1]) >= 3 and _check_cupy():
+        import cupy as cp  # noqa: PLC0415
+
+        if cp.cuda.runtime.getDeviceCount() > 1:
+            np_arrays = [(name, np.atleast_2d(np.asarray(arr))) for name, arr in named_arrays]
+            return _upload_arrays_multi_gpu(np_arrays, dtype)
+
+    return {
+        name: xp.atleast_2d(xp.asarray(arr)).astype(dtype, copy=False) for name, arr in named_arrays
+    }
+
+
 @dataclass
 class MediumRelaxationMaps:
     """Medium class for Fullwave."""
@@ -793,24 +885,28 @@ def __init__(
         self.dtype = np.dtype(dtype)
 
         xp = self.xp
+        attr_names = ["sound_speed", "density", "alpha_exp", "beta"]
+        named_inputs = [
+            ("sound_speed", sound_speed),
+            ("density", density),
+            ("alpha_exp", alpha_exp),
+            ("beta", beta),
+        ]
         try:
-            self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
-            self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
-            self.alpha_exp = xp.atleast_2d(xp.asarray(alpha_exp)).astype(self.dtype, copy=False)
-            self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+            gpu_arrays = _upload_or_convert_arrays(xp, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
         except Exception:
             if xp is np:
                 raise
             logger.warning(
                 "GPU OOM in MediumExponentialAttenuation.__init__. Falling back to CPU (numpy)."
             )
-            _cleanup_gpu_arrays(self, ["sound_speed", "density", "alpha_exp", "beta"])
+            _cleanup_gpu_arrays(self, attr_names)
             self.xp = np
-            xp = np
-            self.sound_speed = np.atleast_2d(np.asarray(sound_speed)).astype(self.dtype, copy=False)
-            self.density = np.atleast_2d(np.asarray(density)).astype(self.dtype, copy=False)
-            self.alpha_exp = np.atleast_2d(np.asarray(alpha_exp)).astype(self.dtype, copy=False)
-            self.beta = np.atleast_2d(np.asarray(beta)).astype(self.dtype, copy=False)
+            gpu_arrays = _upload_or_convert_arrays(np, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
 
         if air_coords is not None:
             if air_map is not None:
@@ -1085,26 +1181,27 @@ def __init__(
         self.dtype = np.dtype(dtype)
 
         xp = self.xp
+        attr_names = ["sound_speed", "density", "alpha_coeff", "alpha_power", "beta"]
+        named_inputs = [
+            ("sound_speed", sound_speed),
+            ("density", density),
+            ("alpha_coeff", alpha_coeff),
+            ("alpha_power", alpha_power),
+            ("beta", beta),
+        ]
         try:
-            self.sound_speed = xp.atleast_2d(xp.asarray(sound_speed)).astype(self.dtype, copy=False)
-            self.density = xp.atleast_2d(xp.asarray(density)).astype(self.dtype, copy=False)
-            self.alpha_coeff = xp.atleast_2d(xp.asarray(alpha_coeff)).astype(self.dtype, copy=False)
-            self.alpha_power = xp.atleast_2d(xp.asarray(alpha_power)).astype(self.dtype, copy=False)
-            self.beta = xp.atleast_2d(xp.asarray(beta)).astype(self.dtype, copy=False)
+            gpu_arrays = _upload_or_convert_arrays(xp, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
         except Exception:
             if xp is np:
                 raise
             logger.warning("GPU OOM in Medium.__init__. Falling back to CPU (numpy).")
-            _cleanup_gpu_arrays(
-                self, ["sound_speed", "density", "alpha_coeff", "alpha_power", "beta"]
-            )
+            _cleanup_gpu_arrays(self, attr_names)
             self.xp = np
-            xp = np
-            self.sound_speed = np.atleast_2d(np.asarray(sound_speed)).astype(self.dtype, copy=False)
-            self.density = np.atleast_2d(np.asarray(density)).astype(self.dtype, copy=False)
-            self.alpha_coeff = np.atleast_2d(np.asarray(alpha_coeff)).astype(self.dtype, copy=False)
-            self.alpha_power = np.atleast_2d(np.asarray(alpha_power)).astype(self.dtype, copy=False)
-            self.beta = np.atleast_2d(np.asarray(beta)).astype(self.dtype, copy=False)
+            gpu_arrays = _upload_or_convert_arrays(np, self.dtype, named_inputs)
+            for name in attr_names:
+                setattr(self, name, gpu_arrays[name])
 
         if air_coords is not None:
             if air_map is not None:
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
index b2fbc9f..4466a24 100644
--- a/fullwave/solver/pml_builder.py
+++ b/fullwave/solver/pml_builder.py
@@ -288,20 +288,21 @@ def __init__(  # noqa: PLR0912
             relax_attrs = list(self.medium_org.relaxation_param_dict.keys())
 
             if self.xp is not np:
-                # Move medium + relaxation arrays to CPU, then extend via multi-GPU
-                named_arrays = self._ensure_numpy_medium_arrays(base_attrs)
+                # Pass CuPy arrays directly — multi-GPU extension uses D2D copy (NVLink)
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in base_attrs]
+                named_arrays += [
+                    (key, self.medium_org.relaxation_param_dict[key]) for key in relax_attrs
+                ]
+                extended = self._extend_arrays_gpu(named_arrays)
+                # Free original GPU arrays and replace with numpy to reclaim GPU memory
+                self._ensure_numpy_medium_arrays(base_attrs)
                 for key in relax_attrs:
                     import cupy as cp  # noqa: PLC0415
 
                     val = self.medium_org.relaxation_param_dict[key]
                     if not isinstance(val, np.ndarray):
-                        val_np = cp.asnumpy(val)
-                        self.medium_org.relaxation_param_dict[key] = val_np
-                    else:
-                        val_np = val
-                    named_arrays.append((key, val_np))
+                        self.medium_org.relaxation_param_dict[key] = cp.asnumpy(val)
                 cp.get_default_memory_pool().free_all_blocks()
-                extended = self._extend_arrays_gpu(named_arrays)
             else:
                 named_arrays = [(name, getattr(self.medium_org, name)) for name in base_attrs] + [
                     (key, self.medium_org.relaxation_param_dict[key]) for key in relax_attrs
@@ -326,8 +327,9 @@ def __init__(  # noqa: PLR0912
         else:
             attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
             if self.xp is not np:
-                named_arrays = self._ensure_numpy_medium_arrays(attr_names)
+                named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
                 extended = self._extend_arrays_gpu(named_arrays)
+                self._ensure_numpy_medium_arrays(attr_names)
             else:
                 named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
                 extended = self._extend_arrays_cpu(named_arrays)
@@ -617,17 +619,22 @@ def _extend_arrays_multi_gpu(
 
         Each thread sets its own CUDA device, transfers data, extends,
         and returns the result as a numpy array.
+        Accepts both numpy and CuPy input arrays. CuPy arrays are copied
+        device-to-device via NVLink when available (faster than PCIe).
         """
         import cupy as cp  # noqa: PLC0415
 
         def extend_on_device(
             args: tuple[str, NDArray, int],
         ) -> tuple[str, NDArray]:
-            name, arr_np, device_id = args
+            name, arr, device_id = args
             with cp.cuda.Device(device_id):
-                result_gpu = self._extend_map_for_pml(arr_np)
+                # cp.asarray handles both numpy (H2D) and CuPy from
+                # another device (D2D via NVLink when available)
+                arr_local = cp.asarray(arr)
+                result_gpu = self._extend_map_for_pml(arr_local)
                 result_np = cp.asnumpy(result_gpu)
-                del result_gpu
+                del arr_local, result_gpu
                 cp.get_default_memory_pool().free_all_blocks()
                 return name, result_np
 
@@ -1831,8 +1838,10 @@ def __init__(
         logger.debug("building extended medium for pml...")
         attr_names = ["sound_speed", "density", "beta", "alpha_coeff", "alpha_power"]
         if self.xp is not np:
-            named_arrays = self._ensure_numpy_medium_arrays(attr_names)
+            named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
             extended = self._extend_arrays_gpu(named_arrays)
+            # Free original GPU arrays and replace with numpy to reclaim GPU memory
+            self._ensure_numpy_medium_arrays(attr_names)
         else:
             named_arrays = [(name, getattr(self.medium_org, name)) for name in attr_names]
             extended = self._extend_arrays_cpu(named_arrays)

From 9ca9b7fecdfa16a4da4c231fe94834156cadcce3 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Thu, 5 Mar 2026 21:29:22 -0500
Subject: [PATCH 27/31] =?UTF-8?q?Bump=20version:=201.2.6-dev5=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 9d28346..570ee58 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev5"
+current_version = "1.2.6-dev6"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index fdc07e7..b770144 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev5"
+    __version__ = "1.2.6-dev6"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index d43c440..22eb163 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev5" # Update via bump-my-version, not manually
+version = "1.2.6-dev6" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index 759c91c..2ff9103 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev5"
+version = "1.2.6.dev6"
 source = { editable = "." }
 dependencies = [
     { name = "matplotlib" },

From c256cc9fead4c02af0a8da3e742ba3f9b2c9c1c7 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Fri, 6 Mar 2026 09:52:13 -0500
Subject: [PATCH 28/31] Add GPU memory management in Solver class; implement
 method to release CuPy memory pools

---
 fullwave/solver/solver.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index a6f32f4..0a0f0d5 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -1,5 +1,6 @@
 """solver module."""
 
+import gc
 import logging
 import time
 from pathlib import Path
@@ -594,6 +595,24 @@ def __init__(  # noqa: PLR0912
                 use_gpu=use_gpu_pml,
             )
 
+    @staticmethod
+    def _release_gpu_memory_pools() -> None:
+        """Release unused CuPy memory pool blocks back to CUDA.
+
+        Runs ``gc.collect()`` so unreachable CuPy arrays are dropped, then
+        drains the default device and pinned memory pools, so stale cached
+        blocks do not add memory pressure to later large GPU allocations.
+        Does nothing when CuPy cannot be imported.
+        """
+        gc.collect()
+        try:
+            import cupy as cp  # noqa: PLC0415
+
+            cp.get_default_memory_pool().free_all_blocks()  # NOTE(review): current device only?
+            cp.get_default_pinned_memory_pool().free_all_blocks()
+        except ImportError:
+            pass  # CuPy unavailable: there are no pools to release
+
     @staticmethod
     def _check_input(
         grid: fullwave.Grid,
@@ -966,6 +985,7 @@ def run(
                 "Input data generation completed in %s. Skipping simulation execution.",
                 simulation_dir,
             )
+            self._release_gpu_memory_pools()
             return simulation_dir
 
         sim_result = self.fullwave_launcher.run(

From 8ba7f193bfa928d785578cab0f3988a6f8d15251 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Fri, 6 Mar 2026 09:55:51 -0500
Subject: [PATCH 29/31] =?UTF-8?q?Bump=20version:=201.2.6-dev6=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 570ee58..20301ea 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev6"
+current_version = "1.2.6-dev7"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index b770144..ba114ad 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev6"
+    __version__ = "1.2.6-dev7"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 22eb163..481b14e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev6" # Update via bump-my-version, not manually
+version = "1.2.6-dev7" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index 2ff9103..7c83b41 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev6"
+version = "1.2.6.dev7"
 source = { editable = "." }
 dependencies = [
     { name = "matplotlib" },

From 3172375447a1b7438c875002b787a83b02202466 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Fri, 6 Mar 2026 10:00:00 -0500
Subject: [PATCH 30/31] Release CuPy memory pools in Solver.run before
 launching the simulation

---
 fullwave/solver/solver.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fullwave/solver/solver.py b/fullwave/solver/solver.py
index 0a0f0d5..fdc548f 100644
--- a/fullwave/solver/solver.py
+++ b/fullwave/solver/solver.py
@@ -988,6 +988,8 @@ def run(
             self._release_gpu_memory_pools()
             return simulation_dir
 
+        self._release_gpu_memory_pools()
+
         sim_result = self.fullwave_launcher.run(
             simulation_dir,
             load_results=load_results,

From 5f598ace8b86db4e20b74b1cb0e022678f2d7895 Mon Sep 17 00:00:00 2001
From: Masashi Sode <39261814+MasashiSode@users.noreply.github.com>
Date: Fri, 6 Mar 2026 10:00:11 -0500
Subject: [PATCH 31/31] =?UTF-8?q?Bump=20version:=201.2.6-dev7=20=E2=86=92?=
 =?UTF-8?q?=201.2.6-dev8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .bumpversion.toml    | 2 +-
 fullwave/__init__.py | 2 +-
 pyproject.toml       | 2 +-
 uv.lock              | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.bumpversion.toml b/.bumpversion.toml
index 20301ea..2905593 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev7"
+current_version = "1.2.6-dev8"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.
diff --git a/fullwave/__init__.py b/fullwave/__init__.py
index ba114ad..03caafd 100644
--- a/fullwave/__init__.py
+++ b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev7"
+    __version__ = "1.2.6-dev8"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/pyproject.toml b/pyproject.toml
index 481b14e..9365b50 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev7" # Update via bump-my-version, not manually
+version = "1.2.6-dev8" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/uv.lock b/uv.lock
index 7c83b41..fca99ac 100644
--- a/uv.lock
+++ b/uv.lock
@@ -735,7 +735,7 @@ wheels = [
 
 [[package]]
 name = "fullwave25"
-version = "1.2.6.dev7"
+version = "1.2.6.dev8"
 source = { editable = "." }
 dependencies = [
     { name = "matplotlib" },