Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/notebooks
254 changes: 226 additions & 28 deletions src/squidpy/tl/_sliding_window.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from itertools import product
from typing import Literal

import numpy as np
import pandas as pd
Expand All @@ -23,7 +24,8 @@ def sliding_window(
coord_columns: tuple[str, str] = ("globalX", "globalY"),
sliding_window_key: str = "sliding_window_assignment",
spatial_key: str = "spatial",
drop_partial_windows: bool = False,
partial_windows: Literal["adaptive", "drop", "split"] | None = None,
max_nr_cells: int | None = None,
copy: bool = False,
*,
table_key: str | None = None,
Expand All @@ -45,8 +47,14 @@ def sliding_window(
overlap: int

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

overlap and window_size should be optional if they are not required by partial_windows=split . It doesn't make sense to expect window_size from the user if aren't going to even use it right?

so it should be

window_size: int | None = None,

also the same for overlap.

Overlap size between consecutive windows. (0 = no overlap)
%(spatial_key)s
drop_partial_windows: bool
If True, drop windows that are smaller than the window size at the borders.
partial_windows: Literal["adaptive", "drop", "split"] | None
If None, possibly small windows at the edges are kept.
If `adaptive`, all windows might be shrunken a bit to avoid small windows at the edges.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

About adaptive mode. This mode still yields incomplete windows right? See this example:

Example (window_size=40, extent 0–100): number_x_windows = ceil(100/40) = 3, x_window_size = ceil(100/3) = 34 → starts [0, 34, 68], ends [34, 68, 100]. The last window is 32 wide, the others 34.

If all window sizes aren't guaranteed to be the same this should be noted here. We also need better explanation how the new window size is going to be calculated. Also we need to point that the main goal here is to make all window sizes closer to each other. This can be useful in your workflow but I'd like to see how this can be useful for researchers in general. Could you explain the use case with a concrete example to me?

If `drop`, possibly small windows at the edges are removed.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

drop argument shouldn't be so easily removed. We should first deprecate it and warn users if it's being used. Then in another release we can remove it. Because this is a breaking change.

If `split`, windows are split into subwindows until not exceeding `max_nr_cells`
max_nr_cells: int | None
The maximum number of cells allowed after merging two windows.
Required if `partial_windows = split`
copy: bool
If True, return the result, otherwise save it to the adata object.

Expand All @@ -55,8 +63,18 @@ def sliding_window(
If ``copy = True``, returns the sliding window annotation(s) as pandas dataframe
Otherwise, stores the sliding window annotation(s) in .obs.
"""
if overlap < 0:
raise ValueError("Overlap must be non-negative.")
if partial_windows == "split":
if max_nr_cells is None:
raise ValueError("`max_nr_cells` must be set when `partial_windows == split`.")
if window_size is not None:
logg.warning(f"Ingoring `window_size` when using `{partial_windows}`")
if overlap != 0:
logg.warning("Ignoring `overlap` as it cannot be used with `split`")
else:
if max_nr_cells is not None:
logg.warning("Ignoring `max_nr_cells` as `partial_windows != split`")
if overlap < 0:
raise ValueError("Overlap must be non-negative.")

adata = extract_adata_if_sdata(adata, table_key=table_key)

Expand Down Expand Up @@ -88,8 +106,13 @@ def sliding_window(
# mostly arbitrary choice, except that full integers usually generate windows with 1-2 cells at the borders
window_size = max(int(np.floor(coord_range // 3.95)), 1)

if window_size <= 0:
raise ValueError("Window size must be larger than 0.")
if partial_windows != "split":
if window_size <= 0:
raise ValueError("Window size must be larger than 0.")
if overlap >= window_size:
raise ValueError("Overlap must be less than the window size.")
if overlap >= window_size // 2 and partial_windows == "adaptive":
raise ValueError("Overlap must be less than `window_size` // 2 when using `adaptive`.")

if library_key is not None and library_key not in adata.obs:
raise ValueError(f"Library key '{library_key}' not found in adata.obs")
Expand Down Expand Up @@ -121,7 +144,10 @@ def sliding_window(
max_y=max_y,
window_size=window_size,
overlap=overlap,
drop_partial_windows=drop_partial_windows,
partial_windows=partial_windows,
lib_coords=lib_coords,
coord_columns=(x_col, y_col),
max_nr_cells=max_nr_cells,
)

lib_key = f"{lib}_" if lib is not None else ""
Expand All @@ -133,15 +159,17 @@ def sliding_window(
y_start = window["y_start"]
y_end = window["y_end"]

mask = (
(lib_coords[x_col] >= x_start)
& (lib_coords[x_col] <= x_end)
& (lib_coords[y_col] >= y_start)
& (lib_coords[y_col] <= y_end)
mask = _get_window_mask(
coord_columns=(x_col, y_col),
lib_coords=lib_coords,
x_start=x_start,
x_end=x_end,
y_start=y_start,
y_end=y_end,
)
obs_indices = lib_coords.index[mask]

if overlap == 0:
if overlap == 0 or partial_windows == "split":
mask = (
(lib_coords[x_col] >= x_start)
& (lib_coords[x_col] <= x_end)
Expand Down Expand Up @@ -177,14 +205,61 @@ def sliding_window(
_save_data(adata, attr="obs", key=col_name, data=col_data)


def _get_window_mask(
coord_columns: tuple[str, str],
lib_coords: pd.DataFrame,
x_start: int,
x_end: int,
y_start: int,
y_end: int,
) -> pd.Series:
"""
Compute a boolean mask selecting coordinates that fall within a given window.

Parameters
----------
coord_columns: Tuple[str, str]
Tuple of column names in `adata.obs` that specify the coordinates (x, y), i.e. ('globalX', 'globalY')
lib_coords: pd.DataFrame
DataFrame containing spatial coordinates (e.g. `adata.obs` subset for one library).
Coordinate values are expected to be integers.
x_start: int
Lower bound of the window in x-direction (inclusive).
x_end: int
Upper bound of the window in x-direction (inclusive).
y_start: int
Lower bound of the window in y-direction (inclusive).
y_end: int
Upper bound of the window in y-direction (inclusive).

Returns
-------
pd.Series
Boolean mask indicating which rows in `lib_coords` fall inside the specified window.
"""
x_col, y_col = coord_columns

mask = (

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inclusive boundaries here can cause an infinite recursion. I made claude create a counterexample

The counterexample
Trigger: any window where more than max_nr_cells cells share the same coordinate on the split axis — i.e. a dense cluster of tied/duplicate coordinates. With quantized or integer-binned spatial data this is realistic, not just pathological.

Minimal case: 10 cells all at (0,0), max_nr_cells=5 → RecursionError.
Realistic case: a normal spread of points plus 8 cells piled at integer (7,7), max_nr_cells=5 → RecursionError.
Why it never terminates
Walk the recursion on a window that contains the tied pile (say all cells share x = m, width > height so it's a vertical split):

middle_pos = len // 2, x_middle = ...iloc[middle_pos] → x_middle = m (line 423).
Children are (x_start, m, …) and (m, x_end, …) (line 425).
The mask is inclusive on both ends (>= and <=, _get_window_mask:243-248), so every cell at x = m falls into the first child (x_start, m).
That child has the same cell count as the parent → step 1 again, with no progress. → infinite recursion until the stack limit.
The base case at line 407 (n_cells <= max_cells) can never be reached because the split fails to reduce the count. The same happens on the horizontal branch (line 428).

Root cause
The median value is used as a shared, inclusive boundary, so cells equal to the median can't be partitioned. A robust fix splits by rank/index rather than value (and makes the boundary half-open), or bails out when a window can't be reduced. Rough options:

Half-open boundaries: make sub-windows [start, mid) and [mid, end) so each cell lands in exactly one side. (Needs care so the overall coverage still includes max_x/max_y.)
Terminate on no-progress: if a child ends up with the same n_cells as its parent (all cells tied on both axes), stop and emit it as a single window even though it exceeds max_cells — and ideally log() a warning that the cap couldn't be honored.
Guard up front: detect that the split made no progress (child count == parent count) and break.

(lib_coords[x_col] >= x_start)
& (lib_coords[x_col] <= x_end)
& (lib_coords[y_col] >= y_start)
& (lib_coords[y_col] <= y_end)
)

return mask


def _calculate_window_corners(
min_x: int,
max_x: int,
min_y: int,
max_y: int,
window_size: int,
overlap: int = 0,
drop_partial_windows: bool = False,
partial_windows: Literal["adaptive", "drop", "split"] | None = None,
lib_coords: pd.DataFrame | None = None,
coord_columns: tuple[str, str] | None = None,
max_nr_cells: int | None = None,
) -> pd.DataFrame:
"""
Calculate the corner points of all windows covering the area from min_x to max_x and min_y to max_y,
Expand All @@ -202,23 +277,45 @@ def _calculate_window_corners(
maximum Y coordinate
window_size: float
size of each window
lib_coords: pd.DataFrame | None
coordinates of all samples for one library
coord_columns: Tuple[str, str]
Tuple of column names in `adata.obs` that specify the coordinates (x, y), i.e. ('globalX', 'globalY')
overlap: float
overlap between consecutive windows (must be less than window_size)
drop_partial_windows: bool
if True, drop border windows that are smaller than window_size;
if False, create smaller windows at the borders to cover the remaining space.
partial_windows: Literal["adaptive", "drop", "split"] | None
If None, possibly small windows at the edges are kept.
If 'adaptive', all windows might be shrunken a bit to avoid small windows at the edges.
If 'drop', possibly small windows at the edges are removed.
If 'split', windows are split into subwindows until not exceeding `max_nr_cells`

Returns
-------
windows: pandas DataFrame with columns ['x_start', 'x_end', 'y_start', 'y_end']
"""
if overlap < 0:
raise ValueError("Overlap must be non-negative.")
if overlap >= window_size:
raise ValueError("Overlap must be less than the window size.")
# adjust x and y window size if 'adaptive'
if partial_windows == "adaptive":
total_width = max_x - min_x
total_height = max_y - min_y

# number of windows in x and y direction
number_x_windows = np.ceil((total_width - overlap) / (window_size - overlap))
number_y_windows = np.ceil((total_height - overlap) / (window_size - overlap))

# window size in x and y direction
x_window_size = (total_width + (number_x_windows - 1) * overlap) / number_x_windows
y_window_size = (total_height + (number_y_windows - 1) * overlap) / number_y_windows

# avoid float errors
x_window_size = np.ceil(x_window_size)
y_window_size = np.ceil(y_window_size)
else:
x_window_size = window_size
y_window_size = window_size

x_step = window_size - overlap
y_step = window_size - overlap
# create the step sizes for each window
x_step = x_window_size - overlap
y_step = y_window_size - overlap

# Generate starting points
x_starts = np.arange(min_x, max_x, x_step)
Expand All @@ -227,16 +324,117 @@ def _calculate_window_corners(
# Create all combinations of x and y starting points
starts = list(product(x_starts, y_starts))
windows = pd.DataFrame(starts, columns=["x_start", "y_start"])
windows["x_end"] = windows["x_start"] + window_size
windows["y_end"] = windows["y_start"] + window_size
windows["x_end"] = windows["x_start"] + x_window_size
windows["y_end"] = windows["y_start"] + y_window_size

# Adjust windows that extend beyond the bounds
if not drop_partial_windows:
if partial_windows is None:
windows["x_end"] = windows["x_end"].clip(upper=max_x)
windows["y_end"] = windows["y_end"].clip(upper=max_y)
else:
elif partial_windows == "adaptive":
# as window_size is an integer to avoid float errors, it can exceed max_x and max_y -> clip
windows["x_end"] = windows["x_end"].clip(upper=max_x)
windows["y_end"] = windows["y_end"].clip(upper=max_y)

# remove redundant windows in the corners
redundant_windows = ((windows["x_end"] - windows["x_start"]) <= overlap) | (
(windows["y_end"] - windows["y_start"]) <= overlap
)
windows = windows[~redundant_windows]
elif partial_windows == "drop":
valid_windows = (windows["x_end"] <= max_x) & (windows["y_end"] <= max_y)
windows = windows[valid_windows]
elif partial_windows == "split":
# split the slide recursively into windows with at most max_nr_cells
x_col, y_col = coord_columns

coord_x_sorted = lib_coords.sort_values(by=[x_col])
coord_y_sorted = lib_coords.sort_values(by=[y_col])

windows = _split_window(
max_nr_cells, (x_col, y_col), coord_x_sorted, coord_y_sorted, min_x, max_x, min_y, max_y
).sort_values(["x_start", "x_end", "y_start", "y_end"])
else:
raise ValueError(f"{partial_windows} is not a valid partial_windows argument.")

windows = windows.reset_index(drop=True)
return windows[["x_start", "x_end", "y_start", "y_end"]]


def _split_window(
max_cells: int,
coord_columns: tuple[str, str],
coord_x_sorted: pd.DataFrame,
coord_y_sorted: pd.DataFrame,
x_start: int,
x_end: int,
y_start: int,
y_end: int,
) -> pd.DataFrame:
"""
Recursively split a rectangular window into subwindows such that each subwindow
contains at most `max_cells` cells and at least `max_cells` // 2 cells.

Parameters
----------
max_cells : int
Maximum number of cells allowed per window.
coord_columns: Tuple[str, str]
Tuple of column names in `adata.obs` that specify the coordinates (x, y), i.e. ('globalX', 'globalY')
coord_x_sorted : pandas.DataFrame
DataFrame containing cell coordinates, sorted by `x_col`.
coord_y_sorted : pandas.DataFrame
DataFrame containing cell coordinates, sorted by `y_col`.
x_start : int
Left (minimum) x coordinate of the current window.
x_end : int
Right (maximum) x coordinate of the current window.
y_start : int
Bottom (minimum) y coordinate of the current window.
y_end : int
Top (maximum) y coordinate of the current window.

Returns
-------
windows: pandas DataFrame with columns ['x_start', 'x_end', 'y_start', 'y_end']
"""
x_col, y_col = coord_columns

# return current window if it contains less cells than max_cells
n_cells = _get_window_mask(coord_columns, coord_x_sorted, x_start, x_end, y_start, y_end).sum()

if n_cells <= max_cells:
return pd.DataFrame({"x_start": [x_start], "x_end": [x_end], "y_start": [y_start], "y_end": [y_end]})

# define start and stop indices of subsetted windows
sub_coord_x_sorted = coord_x_sorted[
_get_window_mask(coord_columns, coord_x_sorted, x_start, x_end, y_start, y_end)
].reset_index(drop=True)

sub_coord_y_sorted = coord_y_sorted[
_get_window_mask(coord_columns, coord_y_sorted, x_start, x_end, y_start, y_end)
].reset_index(drop=True)

middle_pos = len(sub_coord_x_sorted) // 2

if (x_end - x_start) > (y_end - y_start):
# vertical split
x_middle = sub_coord_x_sorted[x_col].iloc[middle_pos]

indices = ((x_start, x_middle, y_start, y_end), (x_middle, x_end, y_start, y_end))
else:
# horizontal split
y_middle = sub_coord_y_sorted.loc[middle_pos, y_col]

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please be consistent with these and choose one of them

x_middle = sub_coord_x_sorted[x_col].iloc[middle_pos]
y_middle = sub_coord_y_sorted.loc[middle_pos, y_col]


indices = ((x_start, x_end, y_start, y_middle), (x_start, x_end, y_middle, y_end))

# recursively continue with either left&right or upper&lower windows pairs
windows = []
for x_start, x_end, y_start, y_end in indices:
windows.append(
_split_window(
max_cells, (x_col, y_col), sub_coord_x_sorted, sub_coord_y_sorted, x_start, x_end, y_start, y_end
)
)

return pd.concat(windows)
Loading
Loading