Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed
- Fraud rings no longer share accounts. Previously, each ring picked a contiguous block of accounts without excluding accounts already used by earlier rings, so ring ranges could overlap: two overlapping rings merged into a single non-cycle component, and their `involved_accounts` labels shared accounts. Rings are now assigned disjoint sets of accounts drawn from a single pool of distinct ids, and generation raises `ValueError` when the rings need more distinct accounts than exist.

## [0.1.0] - 2026-05-26

### Added
Expand Down
28 changes: 19 additions & 9 deletions src/gen_fraud_graph/typologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,18 +85,28 @@ def generate(
tx_rows: list[list] = []
case_rows: list[list] = []
current_tx_id = start_tx_id
# Allocate every ring's accounts up front from one pool of distinct
# ids, then give each ring its own slice. Overlapping ranges would
# merge two rings into a single non-cycle component and make the
# per-ring involved_accounts labels ambiguous.
min_d, max_d = self.depth_range
depths = [random.randint(min_d, max_d) for _ in range(self.num_rings)]
total_needed = sum(depths)
if total_needed > max_account_id:
raise ValueError(
f"{self.num_rings} fraud rings need {total_needed} distinct "
f"accounts but only {max_account_id} exist; lower the ring "
f"count or raise the account scale"
)
account_pool = random.sample(range(max_account_id), total_needed)
pool_offset = 0

for pattern_id in tqdm(range(self.num_rings), desc="Generating fraud rings"):
min_d, max_d = self.depth_range
depth = random.randint(min_d, max_d)

# Pick a contiguous range of accounts for the ring
if max_account_id < depth + 1:
start_node = 0
else:
start_node = random.randint(0, max_account_id - depth - 1)
depth = depths[pattern_id]
ring_ids = account_pool[pool_offset : pool_offset + depth]
pool_offset += depth

accounts = [f"acc_{start_node + d}" for d in range(depth)]
accounts = [f"acc_{i}" for i in ring_ids]
involved = "|".join(accounts)

batch_texts: list[str] = []
Expand Down
22 changes: 12 additions & 10 deletions tests/test_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,18 +314,20 @@ def test_neptune_format(self, tmp_dir):
header = next(csv.reader(fh))
assert "~from" in header

def test_small_max_account_id(self, tmp_dir):
"""max_account_id < depth+1 must take the start_node=0 fallback."""
def test_oversubscribed_rings_raise(self, tmp_dir):
"""When the rings need more distinct accounts than exist they can't be
packed disjointly, so generate() must raise rather than emit rings that
reference nonexistent accounts."""
emb = EmbeddingGenerator("fake", dim=16)
gen = FraudRingGenerator(num_rings=2, depth_range=(4, 4))
n_tx, _ = gen.generate(
max_account_id=3,
start_tx_id=0,
embedder=emb,
output_dir=tmp_dir,
fmt="csv",
)
assert n_tx > 0
with pytest.raises(ValueError, match="distinct"):
gen.generate(
max_account_id=3,
start_tx_id=0,
embedder=emb,
output_dir=tmp_dir,
fmt="csv",
)

def test_compress(self, tmp_dir):
emb = EmbeddingGenerator("fake", dim=16)
Expand Down
24 changes: 24 additions & 0 deletions tests/test_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import csv
import os
import random
import shutil
import tempfile

Expand Down Expand Up @@ -176,6 +177,29 @@ def test_fraud_cases_have_correct_columns(self, tmp_dir):
assert "pattern_id" in rows[0]
assert "involved_accounts" in rows[0]

def test_rings_use_disjoint_accounts(self, tmp_dir):
    """No account may appear in more than one ring's ``involved_accounts``.

    Rings that share accounts merge into a single non-cycle component and
    make the per-ring ``involved_accounts`` labels ambiguous, so every row
    of ``fraud/fraud_cases.csv`` is checked against the accounts claimed by
    the rows before it.
    """
    # Fixed seed so the ring placement under test is reproducible.
    random.seed(0)
    embedder = EmbeddingGenerator("fake", dim=8)
    ring_generator = FraudRingGenerator(num_rings=15, depth_range=(4, 4))
    ring_generator.generate(
        max_account_id=80,
        start_tx_id=0,
        embedder=embedder,
        output_dir=tmp_dir,
    )

    cases_path = os.path.join(tmp_dir, "fraud", "fraud_cases.csv")
    with open(cases_path) as cases_file:
        ring_rows = list(csv.DictReader(cases_file))

    # Accounts claimed by the rings examined so far.
    claimed: set[str] = set()
    for ring_row in ring_rows:
        ring_accounts = ring_row["involved_accounts"].split("|")
        overlaps_earlier_ring = not claimed.isdisjoint(ring_accounts)
        assert (
            not overlaps_earlier_ring
        ), f"{ring_row['pattern_id']} reuses accounts from an earlier ring"
        claimed.update(ring_accounts)


# ---------------------------------------------------------------------------
# End-to-end generator tests
Expand Down
Loading