Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,59 @@ def test_get_sqlalchemy_type_unsupported_raises(db):
db._get_sqlalchemy_type("GEOMETRY")


def test_get_sqlalchemy_type_typo_suggests_correction(db):
    """A close-typo unknown type must surface a 'Did you mean X?' hint.

    A new user typing ``BIGINTEGER`` (mixing up MySQL's ``BIGINT`` with
    Python's ``int`` keyword) used to get a bare ``Unsupported MySQL
    type: BIGINTEGER`` — correct but unhelpful. With the suggestion
    layer the error points at a related supported type and lists the
    full vocabulary alongside. Surfaced by adversarial new-user testing
    N3 (parent #261).

    ``BIGINTEGER``'s nearest by Levenshtein distance is ``INTEGER``
    (d=3 — drop the BIG prefix); ``BIGINT`` is d=4 (drop the EGER
    suffix), so INTEGER wins by a single edit. The supported-types
    listing is what lets the user who actually wanted the 64-bit range
    discover ``BIGINT``.
    """
    with pytest.raises(ValueError, match="Did you mean 'INTEGER'") as excinfo:
        db._get_sqlalchemy_type("BIGINTEGER")

    message = str(excinfo.value)
    assert "Supported types" in message

    # The message echoes the offending input, and "BIGINT" is a substring
    # of "BIGINTEGER" — a bare substring check would pass even if BIGINT
    # were missing from the listing. Look only at the text after the
    # "Supported types" label and require the quoted list entry.
    _, _, listing = message.partition("Supported types")
    assert "'BIGINT'" in listing


def test_get_sqlalchemy_type_typo_no_suggestion_for_distant_input(db):
    """An input far from every supported name gets no 'Did you mean' hint.

    ``GEOMETRY`` is a real MySQL type, but nothing in the mapping is a
    plausible misspelling of it; pointing the user at ``DATETIME`` would
    be worse than offering no suggestion at all. The plain "Unsupported
    MySQL type" error must still be raised.
    """
    with pytest.raises(ValueError, match="Unsupported MySQL type") as raised:
        db._get_sqlalchemy_type("GEOMETRY")

    error_text = str(raised.value)
    assert "Did you mean" not in error_text


@pytest.mark.parametrize(
    "typo,suggestion",
    [
        ("INTGER", "INTEGER"),
        ("NUMRIC", "NUMERIC"),
        ("BOLEAN", "BOOLEAN"),
        ("VARCAHR", "VARCHAR"),
    ],
)
def test_get_sqlalchemy_type_typo_suggestions_cover_common_mistakes(
    db, typo, suggestion
):
    """Plausible new-user misspellings each yield the expected hint.

    Every case is at most two edits from its correction (a dropped
    letter, or a two-letter transposition for ``VARCAHR``), so the
    expected suggestion should be the unambiguous nearest match.
    """
    expected_hint = f"Did you mean '{suggestion}'"
    with pytest.raises(ValueError, match=expected_hint):
        db._get_sqlalchemy_type(typo)


def test_get_sqlalchemy_type_decimal_precision_scale(db):
# Regression (#190 bugbot): DECIMAL(10,2) used to fail int("10,2") and
# fall back to a bare Numeric() — declared precision and scale silently
Expand Down
73 changes: 71 additions & 2 deletions tracebloc_ingestor/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from sqlalchemy.exc import OperationalError, InterfaceError, DBAPIError
import logging
from urllib.parse import quote
from typing import List, Dict, Any, Optional
from typing import Iterable, List, Dict, Any, Optional
from datetime import datetime
from tenacity import (
retry,
Expand Down Expand Up @@ -99,6 +99,58 @@ def _execute_with_retry(connection, stmt):
raise


def _suggest_type(unknown: str, known: Iterable[str]) -> Optional[str]:
"""Return the closest match from ``known`` to ``unknown`` if any
candidate is within edit distance 3 (Levenshtein), else None.

Used by ``_get_sqlalchemy_type`` to surface a "Did you mean BIGINT?"
hint when a customer types BIGINTEGER, BOOLEAN→BOOL, NUMRIC→NUMERIC,
etc. Distance 3 is the empirical sweet spot — close enough to catch
every realistic typo we've seen in the wild (single-letter swaps,
common prefix/suffix confusion like INT-vs-INTEGER, missing or
duplicated letters), wide enough to fail-silently on entries that
are genuinely different vocabulary (no false "Did you mean DATE?"
for a GEOMETRY).

Returns the FIRST best match at the minimum distance — type_mapping
has stable insertion order so the deterministic result is fine for
tests.
"""
if not unknown:
return None

def _levenshtein(a: str, b: str) -> int:
if a == b:
return 0
if not a:
return len(b)
if not b:
return len(a)
# Two-row DP — O(len(a)*len(b)) time, O(min(a,b)) space.
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, start=1):
curr = [i] + [0] * len(b)
for j, cb in enumerate(b, start=1):
cost = 0 if ca == cb else 1
curr[j] = min(
prev[j] + 1,
curr[j - 1] + 1,
prev[j - 1] + cost,
)
prev = curr
return prev[-1]

target = unknown.upper()
best: Optional[str] = None
best_d = 99
for candidate in known:
d = _levenshtein(target, candidate)
if d < best_d:
best = candidate
best_d = d
return best if best_d <= 3 else None


class Database:
def __init__(self, config: Config):
self.config = config
Expand Down Expand Up @@ -181,7 +233,24 @@ def _get_sqlalchemy_type(self, mysql_type: str):
return alchemy_type(parts[0])
return alchemy_type

raise ValueError(f"Unsupported MySQL type: {mysql_type}")
# Surface a "did you mean" hint when the unknown name is a close typo
# of a supported type — INTGER → INTEGER, NUMRIC → NUMERIC, VARCAHR →
# VARCHAR, etc. A user-facing schema error that just says
# "Unsupported" leaves the customer guessing; a single short hint
# turns a 5-minute "what's the right spelling?" round trip into a
# zero-thought fix. The hint is only offered within Levenshtein
# distance 3; beyond that we print the supported-types list alone.
suggestion = _suggest_type(base_type, type_mapping.keys())
if suggestion:
raise ValueError(
f"Unsupported MySQL type: {mysql_type}. Did you mean "
f"'{suggestion}'? Supported types: "
f"{sorted(type_mapping.keys())}"
)
raise ValueError(
f"Unsupported MySQL type: {mysql_type}. Supported types: "
f"{sorted(type_mapping.keys())}"
)

def create_table(self, table_name: str, schema: Dict[str, str]):
"""
Expand Down
Loading