From 26090eb1cf65eba4dab64ceb4c6e404e97d0ef94 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 11 Jun 2026 17:05:49 +0200 Subject: [PATCH 1/4] fix: code hardening to prevent bugs --- .github/workflows/tests.yml | 8 +-- docs/contributing.md | 2 +- pyproject.toml | 9 ++- simplemma/language_detector.py | 5 +- simplemma/lemmatizer.py | 16 +++-- simplemma/strategies/affix_decomposition.py | 69 +++++++++---------- .../dictionaries/dictionary_factory.py | 35 +++++++--- .../dictionaries/trie_dictionary_factory.py | 5 ++ simplemma/strategies/dictionary_lookup.py | 2 + simplemma/token_sampler.py | 2 +- simplemma/utils.py | 19 +++++ .../dictionaries/test_dictionary_factory.py | 9 ++- tests/strategies/test_strategies.py | 21 ++++++ tests/test_language_detector.py | 5 ++ tests/test_lemmatizer.py | 36 ++++++++-- tests/test_token_sampler.py | 6 ++ tests/test_utils.py | 34 +++++---- 17 files changed, 200 insertions(+), 83 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 59ab589..58d1d2c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -54,12 +54,8 @@ jobs: run: python -m pip install --ignore-installed ".[marisa-trie]" # tests - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Lint with ruff + run: ruff check . - name: Code format with black run: black --check --diff simplemma training tests diff --git a/docs/contributing.md b/docs/contributing.md index fbac532..0a31a60 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -22,7 +22,7 @@ checks that CI runs: # Code style black --check --diff simplemma training tests # Linting -flake8 simplemma training tests +ruff check . # Type checking mypy -p simplemma -p training -p tests # Tests diff --git a/pyproject.toml b/pyproject.toml index 0b7e0fa..136ca02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,8 +116,8 @@ test = [ dev = [ "simplemma[test]", "black == 26.5.1", - "flake8 == 7.3.0", "mypy == 2.1.0", + "ruff == 0.15.16", "types-requests == 2.33.0.20260518", ] docs = [ @@ -140,6 +140,13 @@ disallow_untyped_defs = false disallow_incomplete_defs = false check_untyped_defs = false +[tool.ruff] +target-version = "py310" +line-length = 88 +# Linting uses ruff's default rule set (E4, E7, E9, F): pyflakes plus the +# import, statement and runtime-error pycodestyle checks. Formatting is handled +# by black. + [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/simplemma/language_detector.py b/simplemma/language_detector.py index 24938da..102787b 100644 --- a/simplemma/language_detector.py +++ b/simplemma/language_detector.py @@ -15,7 +15,7 @@ RelaxedMostCommonTokenSampler, TokenSampler, ) -from .utils import validate_lang_input +from .utils import normalize_token, validate_lang_input def in_target_language( @@ -67,6 +67,7 @@ def langdetect( and their respective proportions. """ + list_results: list[tuple[str, float]] = [] for token_sampler in token_samplers: results = LanguageDetector( lang, token_sampler, DefaultStrategy(greedy) @@ -159,6 +160,7 @@ def proportion_in_each_language( known_tokens_count = dict.fromkeys(self._lang, 0) unknown_tokens_count = 0 for token in tokens: + token = normalize_token(token) token_found = False for lang_code in self._lang: candidate = self._lemmatization_strategy.get_lemma(token, lang_code) @@ -194,6 +196,7 @@ def proportion_in_target_languages( in_target = 0 for token in tokens: + token = normalize_token(token) for lang_code in self._lang: candidate = self._lemmatization_strategy.get_lemma(token, lang_code) if candidate is not None: diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py index 0678dfb..36b2404 100644 --- a/simplemma/lemmatizer.py +++ b/simplemma/lemmatizer.py @@ -22,7 +22,7 @@ ToLowercaseFallbackStrategy, ) from .tokenizer import RegexTokenizer, Tokenizer -from .utils import validate_lang_input +from .utils import normalize_token, validate_lang_input PUNCTUATION = {".", "?", "!", "…", "¿", "¡"} @@ -41,7 +41,7 @@ def _control_input_type(token: Any) -> None: if not isinstance(token, str): raise TypeError(f"Wrong input type, expected string, got {type(token)}") if token == "": - raise ValueError("Wrong input type: empty string") + raise ValueError("Wrong input value: empty string") class Lemmatizer: @@ -56,7 +56,7 @@ class Lemmatizer: def __init__( self, - cache_max_size: int = 1048576, + cache_max_size: int = 65536, tokenizer: Tokenizer = RegexTokenizer(), lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(), fallback_lemmatization_strategy: LemmatizationFallbackStrategy = ToLowercaseFallbackStrategy(), @@ -66,7 +66,7 @@ def __init__( Args: cache_max_size (int, optional): The maximum size of the cache for the lemmatization results. - Defaults to `1048576`. + Defaults to `65536`. tokenizer (Tokenizer, optional): The tokenizer to use for tokenization. Defaults to `RegexTokenizer()`. lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization strategy to use. @@ -94,7 +94,8 @@ def lemmatize( Returns: str: The lemmatized form of the token. """ - return self._cached_lemmatize(token, lang) + # NFC before caching: canonical key, matches the NFC dictionaries. + return self._cached_lemmatize(normalize_token(token), lang) def _lemmatize( self, @@ -103,6 +104,10 @@ def _lemmatize( ) -> str: """Internal method to lemmatize a token in the specified language(s). + The token arrives NFC-normalized by ``lemmatize``. Input validation + happens here so it only runs on cache misses, keeping hits cheap + (exceptions are never cached by ``lru_cache``). + Args: token: The token to lemmatize. lang: The language or languages for lemmatization. @@ -167,6 +172,7 @@ def is_known(token: str, lang: str | tuple[str, ...]) -> bool: """ _control_input_type(token) + token = normalize_token(token) lang = validate_lang_input(lang) dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory) diff --git a/simplemma/strategies/affix_decomposition.py b/simplemma/strategies/affix_decomposition.py index 163adfd..a19a2d0 100644 --- a/simplemma/strategies/affix_decomposition.py +++ b/simplemma/strategies/affix_decomposition.py @@ -29,6 +29,8 @@ AFFIXLEN = 2 LONGAFFIXLEN = 5 # better for some languages MINCOMPLEN = 4 +# Decomposition is ~O(len²); cap long tokens (longest real form is 86 chars). +MAXLEN = 100 class AffixDecompositionStrategy(LemmatizationStrategy): @@ -74,7 +76,11 @@ def get_lemma(self, token: str, lang: str) -> str | None: str | None: The lemma of the token if found, or None otherwise. """ limit = 6 if lang in SHORTER_GREEDY else 8 - if (not self._greedy and lang not in AFFIX_LANGS) or len(token) <= limit: + if ( + (not self._greedy and lang not in AFFIX_LANGS) + or len(token) <= limit + or len(token) > MAXLEN + ): return None # define parameters @@ -104,43 +110,30 @@ def _affix_decomposition( Returns: str | None: The lemma of the token if found, or None otherwise. """ - # this only makes sense for languages written from left to right - # AFFIXLEN or MINCOMPLEN can spare time for some languages - for affixlen in range(max_affix_len, 1, -1): - for count in range(1, len(token) - min_complem_len + 1): - part1 = token[:-count] - # part1_aff = candidate[:-(count + affixlen)]p - lempart1 = self._dictionary_lookup.get_lemma(part1, lang) - if lempart1 is None: - continue - # maybe an affix? discard it - if count <= affixlen: - return lempart1 - # account for case before looking for second part - part2 = token[-count:] - if token[0].isupper(): - part2 = part2.capitalize() - lempart2 = self._dictionary_lookup.get_lemma(part2, lang) - if lempart2 is None: - continue - # candidate must be shorter - # try other case - candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang) - # shorten the second known part of the token - if candidate is not None and len(candidate) < len(part2): - return part1 + candidate.lower() - # backup: equal length or further candidates accepted - # try without capitalizing - # even greedier - # with capital letter? - if len(lempart2) < len(part2) + affixlen: - return part1 + lempart2.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # elif newcandidate and len(newcandidate) < len(part2) + affixlen: - # plan_b = part1 + newcandidate.lower() - # print(part1, part2, affixlen, count, newcandidate, planb) - # else: - # print(part1, part2, affixlen, count, newcandidate) + # Left-to-right languages only. A single pass at the largest affix + # length is equivalent to looping over smaller ones (first match wins). + for count in range(1, len(token) - min_complem_len + 1): + part1 = token[:-count] + lempart1 = self._dictionary_lookup.get_lemma(part1, lang) + if lempart1 is None: + continue + # maybe an affix? discard it + if count <= max_affix_len: + return lempart1 + # account for case before looking for second part + part2 = token[-count:] + if token[0].isupper(): + part2 = part2.capitalize() + lempart2 = self._dictionary_lookup.get_lemma(part2, lang) + if lempart2 is None: + continue + # prefer a shorter greedy form of the second part + candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang) + if candidate is not None and len(candidate) < len(part2): + return part1 + candidate.lower() + # backup: accept the dictionary form if not longer than the affix bound + if len(lempart2) < len(part2) + max_affix_len: + return part1 + lempart2.lower() return None def _suffix_decomposition( diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py index 005cda6..ad9cb62 100644 --- a/simplemma/strategies/dictionaries/dictionary_factory.py +++ b/simplemma/strategies/dictionaries/dictionary_factory.py @@ -13,11 +13,14 @@ from abc import abstractmethod from functools import lru_cache from pathlib import Path -from typing import Protocol +from typing import Protocol, TypeVar, overload from collections.abc import Iterator, Mapping +_T = TypeVar("_T") + DATA_FOLDER = Path(__file__).parent / "data" -SUPPORTED_LANGUAGES = [f.stem for f in DATA_FOLDER.glob("*.plzma")] +# frozenset: O(1) membership, checked on every get_dictionary call. +SUPPORTED_LANGUAGES = frozenset(f.stem for f in DATA_FOLDER.glob("*.plzma")) def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]: @@ -86,6 +89,16 @@ def __init__(self, dictionary: dict[bytes, bytes]) -> None: def __getitem__(self, item: str) -> str: return self._dict[item.encode()].decode() + # The overloads mirror Mapping.get's signature for strict mypy. + @overload + def get(self, key: str) -> str | None: ... + @overload + def get(self, key: str, default: str | _T) -> str | _T: ... + def get(self, key: str, default: str | _T | None = None) -> str | _T | None: + # Avoids Mapping.get's EAFP path (a KeyError raised on every miss). + value = self._dict.get(key.encode()) + return value.decode() if value is not None else default + def __iter__(self) -> Iterator[str]: for key in self._dict: yield key.decode() @@ -102,7 +115,7 @@ class DefaultDictionaryFactory(DictionaryFactory): It provides functionality for loading and caching dictionaries from disk that are included in Simplemma. """ - __slots__ = ["_load_dictionary_from_disk"] + __slots__ = ["_get_dictionary"] def __init__(self, cache_max_size: int = 8) -> None: """ @@ -112,10 +125,18 @@ def __init__(self, cache_max_size: int = 8) -> None: cache_max_size (int): The maximum size of the cache for loaded dictionaries. Defaults to `8`. """ - self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)( - _load_dictionary_from_disk + # Cache the wrapper, not the raw dict, to avoid re-wrapping on every + # call; the lru evicts wrapper and dict together, bounding memory. + self._get_dictionary = lru_cache(maxsize=cache_max_size)( + self._get_dictionary_uncached ) + def _get_dictionary_uncached(self, lang: str) -> Mapping[str, str]: + """Build the dictionary for a language, without caching.""" + if lang not in SUPPORTED_LANGUAGES: + raise ValueError(f"Unsupported language: {lang}") + return MappingStrToByteString(_load_dictionary_from_disk(lang)) + def get_dictionary( self, lang: str, @@ -132,6 +153,4 @@ def get_dictionary( Raises: ValueError: If the specified language is not supported. """ - if lang not in SUPPORTED_LANGUAGES: - raise ValueError(f"Unsupported language: {lang}") - return MappingStrToByteString(self._load_dictionary_from_disk(lang)) + return self._get_dictionary(lang) diff --git a/simplemma/strategies/dictionaries/trie_dictionary_factory.py b/simplemma/strategies/dictionaries/trie_dictionary_factory.py index 742ac8d..7624f87 100644 --- a/simplemma/strategies/dictionaries/trie_dictionary_factory.py +++ b/simplemma/strategies/dictionaries/trie_dictionary_factory.py @@ -36,6 +36,11 @@ def __init__(self, trie: BytesTrie) -> None: def __getitem__(self, item: str) -> Any: return self._trie[item][0].decode() + def get(self, key: str, default: Any = None) -> Any: + # Avoids MutableMapping.get's EAFP path (a KeyError raised on every miss). + value = self._trie.get(key) + return value[0].decode() if value else default + def __setitem__(self, key: Any, value: Any) -> None: raise NotImplementedError diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py index 059ec64..e18c1b0 100644 --- a/simplemma/strategies/dictionary_lookup.py +++ b/simplemma/strategies/dictionary_lookup.py @@ -39,6 +39,8 @@ def get_lemma(self, token: str, lang: str) -> str | None: str | None: The lemma for the token, or `None` if not found in the dictionary. """ + if not token: + return None # Search the language data, reverse case to extend coverage. dictionary = self._dictionary_factory.get_dictionary(lang) if result := dictionary.get(token): diff --git a/simplemma/token_sampler.py b/simplemma/token_sampler.py index 5c130ba..0516a1b 100644 --- a/simplemma/token_sampler.py +++ b/simplemma/token_sampler.py @@ -150,7 +150,7 @@ def sample_tokens(self, tokens: Iterable[str]) -> list[str]: counter = Counter(tokens) if self._capitalized_threshold > 0: - deletions = [token for token in counter if token[0].isupper()] + deletions = [token for token in counter if token and token[0].isupper()] if len(deletions) < self._capitalized_threshold * len(counter): for token in deletions: del counter[token] diff --git a/simplemma/utils.py b/simplemma/utils.py index 98cd51e..a4297b0 100644 --- a/simplemma/utils.py +++ b/simplemma/utils.py @@ -4,8 +4,24 @@ - [levenshtein_dist][simplemma.utils.levenshtein_dist]: Calculates the Levenshtein distance between two strings. - [validate_lang_input][simplemma.utils.validate_lang_input]: Validates the language input and ensures it is a valid tuple. +- [normalize_token][simplemma.utils.normalize_token]: Normalizes a token to Unicode NFC form. """ +import unicodedata + + +def normalize_token(token: str) -> str: + """ + Normalize a token to Unicode NFC, matching the shipped dictionaries. + + Args: + token (str): The input token. + + Returns: + str: The token in NFC form. + """ + return unicodedata.normalize("NFC", token) + def validate_lang_input(lang: str | tuple[str, ...]) -> tuple[str, ...]: """ @@ -19,6 +35,7 @@ def validate_lang_input(lang: str | tuple[str, ...]) -> tuple[str, ...]: Raises: TypeError: If the lang argument is not a tuple or a string. + ValueError: If the lang argument is empty. """ # convert string @@ -26,6 +43,8 @@ def validate_lang_input(lang: str | tuple[str, ...]) -> tuple[str, ...]: lang = (lang,) if not isinstance(lang, tuple): raise TypeError("lang argument must be a two-letter language code") + if not lang: + raise ValueError("lang argument is empty: provide at least one language code") return lang diff --git a/tests/strategies/dictionaries/test_dictionary_factory.py b/tests/strategies/dictionaries/test_dictionary_factory.py index b6b7743..39f7ac0 100644 --- a/tests/strategies/dictionaries/test_dictionary_factory.py +++ b/tests/strategies/dictionaries/test_dictionary_factory.py @@ -36,8 +36,7 @@ def test_dictionary_cache() -> None: for _ in range(iterations): dictionaries.get_dictionary("en") dictionaries.get_dictionary("de") - assert dictionaries._load_dictionary_from_disk.cache_info().misses == 2 - assert ( - dictionaries._load_dictionary_from_disk.cache_info().hits - == (iterations - 1) * 2 - ) + assert dictionaries._get_dictionary.cache_info().misses == 2 + assert dictionaries._get_dictionary.cache_info().hits == (iterations - 1) * 2 + # the cached wrapper itself is reused, not just the underlying dict + assert dictionaries.get_dictionary("en") is dictionaries.get_dictionary("en") diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py index 2196522..5e63c4f 100644 --- a/tests/strategies/test_strategies.py +++ b/tests/strategies/test_strategies.py @@ -15,6 +15,8 @@ def test_search() -> None: assert DictionaryLookupStrategy().get_lemma("dritte", "de") == "dritt" assert DictionaryLookupStrategy().get_lemma("Dritte", "de") == "Dritter" + # empty token must not crash the case-flip retry + assert DictionaryLookupStrategy().get_lemma("", "en") is None assert HyphenRemovalStrategy().get_lemma("magni-ficent", "en") == "magnificent" assert HyphenRemovalStrategy().get_lemma("magni-ficents", "en") is None @@ -53,4 +55,23 @@ def test_search() -> None: assert AffixDecompositionStrategy(greedy=True).get_lemma("ccc", "de") is None + # tokens longer than the safety cap are rejected outright (guards against + # the quadratic blow-up that would otherwise hang on hostile/long input) + affix = AffixDecompositionStrategy(greedy=True) + assert affix.get_lemma("a" * 101, "fi") is None + assert affix.get_lemma("a" * 100000, "fi") is None + assert PrefixDecompositionStrategy().get_lemma("auf", "de") is None + + +def test_affix_decomposition() -> None: + """Regression anchors for the long-affix languages (max_affix_len=5). + + These guard the single-pass affix decomposition: each token is resolved by + stripping a multi-character affix, the path that the former decreasing-affixlen + loop exercised. + """ + affix = AffixDecompositionStrategy(greedy=True) + assert affix.get_lemma("kissammeko", "fi") == "kissa" # FI "and our cat?" -> cat + assert affix.get_lemma("könyveiteket", "hu") == "könyv" # HU "your books" -> book + assert affix.get_lemma("raamatutest", "et") == "raamat" # ET "from books" -> book diff --git a/tests/test_language_detector.py b/tests/test_language_detector.py index 21d78d8..4a24bad 100644 --- a/tests/test_language_detector.py +++ b/tests/test_language_detector.py @@ -6,6 +6,11 @@ from .test_token_sampler import CustomTokenSampler +def test_langdetect_no_samplers() -> None: + # no samplers means no results, not an UnboundLocalError + assert langdetect("Dies ist ein Test.", lang=("de", "en"), token_samplers=[]) == [] + + def test_proportion_in_each_language() -> None: # sanity checks assert LanguageDetector( diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py index ad3a06b..ea76d73 100644 --- a/tests/test_lemmatizer.py +++ b/tests/test_lemmatizer.py @@ -1,5 +1,6 @@ """Tests for `simplemma` package.""" +import unicodedata from collections.abc import Mapping import pytest @@ -477,11 +478,11 @@ def test_is_known() -> None: with pytest.raises(ValueError): assert is_known("", lang="en") is None - assert is_known("FanCY", lang="en") == True - assert is_known("Fancy-String", lang="en") == False + assert is_known("FanCY", lang="en") + assert not is_known("Fancy-String", lang="en") - assert is_known("espejos", lang=("es", "de")) == True - assert is_known("espejos", lang=("de", "es")) == True + assert is_known("espejos", lang=("es", "de")) + assert is_known("espejos", lang=("de", "es")) def test_get_lemmas_in_text() -> None: @@ -583,3 +584,30 @@ def test_get_lemmas_in_text() -> None: ".", ] ) + + +def test_nfc_normalization() -> None: + """Decomposed (NFD) input must lemmatize like its composed (NFC) form.""" + nfc = "Häuser" + nfd = unicodedata.normalize("NFD", nfc) + assert nfd != nfc # genuinely different byte sequences + # both forms resolve to the same lemma (previously NFD silently failed) + assert lemmatize(nfd, lang="de") == lemmatize(nfc, lang="de") == "Haus" + # is_known agrees on both forms + assert is_known(nfd, lang="de") == is_known(nfc, lang="de") is True + # output is always NFC, even when the input was decomposed + café_nfd = unicodedata.normalize("NFD", "café") + assert lemmatize(café_nfd, lang="fr") == unicodedata.normalize( + "NFC", lemmatize(café_nfd, lang="fr") + ) + + +def test_long_token_does_not_hang() -> None: + """A pathologically long token must return quickly, not trigger O(n²) decomposition.""" + # Before the affix-length cap this hung for ~minutes; it must now be instant + # and fall through to the fallback (the token is unknown), returned unchanged. + long_token = "a" * 50000 + assert lemmatize(long_token, lang="fi") == long_token + # a token just over the cap is also handled, while normal-length input still lemmatizes + assert lemmatize("a" * 101, lang="fi") == "a" * 101 + assert lemmatize("masks", lang="en") == "mask" diff --git a/tests/test_token_sampler.py b/tests/test_token_sampler.py index 9e1de3a..75a786b 100644 --- a/tests/test_token_sampler.py +++ b/tests/test_token_sampler.py @@ -34,6 +34,12 @@ def test_token_sampler(): assert custom.sample_text("ABCD Efgh ijkl mn") == [] +def test_sample_tokens_empty_token() -> None: + # an empty string in the iterable must not crash the capitalization filter + sampler = MostCommonTokenSampler() + assert sampler.sample_tokens(["hello", "", "World", "hello"]) == ["hello", ""] + + def test_capitalized_threshold() -> None: sampler = MostCommonTokenSampler() # default capitalized_threshold=0.8 # capitalized tokens are a minority (2 of 3 < 0.8 * 3) -> they are removed diff --git a/tests/test_utils.py b/tests/test_utils.py index bbc88f2..dfa30b9 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,29 +1,37 @@ """Tests for `simplemma.utils`.""" +import unicodedata + import pytest -from simplemma import is_known -from simplemma.utils import levenshtein_dist, validate_lang_input +from simplemma import is_known, lemmatize +from simplemma.utils import levenshtein_dist, normalize_token, validate_lang_input + + +def test_normalize_token() -> None: + # decomposed (NFD) input is recomposed to NFC + nfd = unicodedata.normalize("NFD", "Häuser") + assert nfd != "Häuser" # the two byte sequences genuinely differ + assert normalize_token(nfd) == "Häuser" + # already-NFC input is returned byte-identical (idempotent) + assert normalize_token("Häuser") == "Häuser" + assert normalize_token(normalize_token(nfd)) == normalize_token(nfd) + # NFC, not NFKC: compatibility characters are preserved, not folded + assert normalize_token("fi") == "fi" # ligature stays, would become "fi" under NFKC def test_validate_lang_input() -> None: - # a string is wrapped into a one-element tuple assert validate_lang_input("en") == ("en",) - # a tuple is passed through unchanged assert validate_lang_input(("de", "en")) == ("de", "en") - # an empty tuple is passed through unchanged (there is no guard against it; - # downstream this makes the lemmatizer fallback raise StopIteration on - # next(iter(lang))) - assert validate_lang_input(()) == () - # non-str / non-tuple inputs raise TypeError. This branch is otherwise - # never reached through lemmatize(), where an unhashable lang argument - # fails earlier in the lru_cache rather than in validate_lang_input. + # empty lang must fail cleanly, not as a downstream StopIteration + with pytest.raises(ValueError): + validate_lang_input(()) + with pytest.raises(ValueError): + lemmatize("test", ()) with pytest.raises(TypeError): validate_lang_input(123) # type: ignore[arg-type] - # a list is the realistic mistake (mutable, looks tuple-like) with pytest.raises(TypeError): validate_lang_input(["en"]) # type: ignore[arg-type] - # end-to-end through is_known(), which validates directly without caching with pytest.raises(TypeError): is_known("test", lang=123) # type: ignore[arg-type] From 73e555806fe19c262caa928d14de53a19abb01e9 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 11 Jun 2026 19:37:04 +0200 Subject: [PATCH 2/4] fix: simplify --- simplemma/strategies/dictionary_lookup.py | 6 ++---- simplemma/token_sampler.py | 2 +- tests/strategies/test_strategies.py | 10 ++-------- tests/test_lemmatizer.py | 22 ++++++---------------- tests/test_utils.py | 5 +---- 5 files changed, 12 insertions(+), 33 deletions(-) diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py index e18c1b0..9f98fd0 100644 --- a/simplemma/strategies/dictionary_lookup.py +++ b/simplemma/strategies/dictionary_lookup.py @@ -39,12 +39,10 @@ def get_lemma(self, token: str, lang: str) -> str | None: str | None: The lemma for the token, or `None` if not found in the dictionary. """ - if not token: - return None # Search the language data, reverse case to extend coverage. dictionary = self._dictionary_factory.get_dictionary(lang) if result := dictionary.get(token): return result - # Try upper or lowercase. - token = token.lower() if token[0].isupper() else token.capitalize() + # Try upper or lowercase (token[:1] stays empty-safe for empty input). + token = token.lower() if token[:1].isupper() else token.capitalize() return dictionary.get(token) diff --git a/simplemma/token_sampler.py b/simplemma/token_sampler.py index 0516a1b..f9a3353 100644 --- a/simplemma/token_sampler.py +++ b/simplemma/token_sampler.py @@ -150,7 +150,7 @@ def sample_tokens(self, tokens: Iterable[str]) -> list[str]: counter = Counter(tokens) if self._capitalized_threshold > 0: - deletions = [token for token in counter if token and token[0].isupper()] + deletions = [token for token in counter if token[:1].isupper()] if len(deletions) < self._capitalized_threshold * len(counter): for token in deletions: del counter[token] diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py index 5e63c4f..72e7562 100644 --- a/tests/strategies/test_strategies.py +++ b/tests/strategies/test_strategies.py @@ -55,8 +55,7 @@ def test_search() -> None: assert AffixDecompositionStrategy(greedy=True).get_lemma("ccc", "de") is None - # tokens longer than the safety cap are rejected outright (guards against - # the quadratic blow-up that would otherwise hang on hostile/long input) + # tokens over the safety cap are rejected outright (quadratic blow-up guard) affix = AffixDecompositionStrategy(greedy=True) assert affix.get_lemma("a" * 101, "fi") is None assert affix.get_lemma("a" * 100000, "fi") is None @@ -65,12 +64,7 @@ def test_search() -> None: def test_affix_decomposition() -> None: - """Regression anchors for the long-affix languages (max_affix_len=5). - - These guard the single-pass affix decomposition: each token is resolved by - stripping a multi-character affix, the path that the former decreasing-affixlen - loop exercised. - """ + """Single-pass affix decomposition resolves multi-character affixes (max_affix_len=5).""" affix = AffixDecompositionStrategy(greedy=True) assert affix.get_lemma("kissammeko", "fi") == "kissa" # FI "and our cat?" -> cat assert affix.get_lemma("könyveiteket", "hu") == "könyv" # HU "your books" -> book diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py index ea76d73..05cb3f6 100644 --- a/tests/test_lemmatizer.py +++ b/tests/test_lemmatizer.py @@ -588,26 +588,16 @@ def test_get_lemmas_in_text() -> None: def test_nfc_normalization() -> None: """Decomposed (NFD) input must lemmatize like its composed (NFC) form.""" - nfc = "Häuser" - nfd = unicodedata.normalize("NFD", nfc) - assert nfd != nfc # genuinely different byte sequences - # both forms resolve to the same lemma (previously NFD silently failed) - assert lemmatize(nfd, lang="de") == lemmatize(nfc, lang="de") == "Haus" - # is_known agrees on both forms - assert is_known(nfd, lang="de") == is_known(nfc, lang="de") is True + nfd = unicodedata.normalize("NFD", "Häuser") + assert lemmatize(nfd, lang="de") == lemmatize("Häuser", lang="de") == "Haus" + assert is_known(nfd, lang="de") == is_known("Häuser", lang="de") is True # output is always NFC, even when the input was decomposed - café_nfd = unicodedata.normalize("NFD", "café") - assert lemmatize(café_nfd, lang="fr") == unicodedata.normalize( - "NFC", lemmatize(café_nfd, lang="fr") - ) + out = lemmatize(unicodedata.normalize("NFD", "café"), lang="fr") + assert out == unicodedata.normalize("NFC", out) def test_long_token_does_not_hang() -> None: """A pathologically long token must return quickly, not trigger O(n²) decomposition.""" - # Before the affix-length cap this hung for ~minutes; it must now be instant - # and fall through to the fallback (the token is unknown), returned unchanged. - long_token = "a" * 50000 - assert lemmatize(long_token, lang="fi") == long_token - # a token just over the cap is also handled, while normal-length input still lemmatizes + assert lemmatize("a" * 50000, lang="fi") == "a" * 50000 # was minutes, now instant assert lemmatize("a" * 101, lang="fi") == "a" * 101 assert lemmatize("masks", lang="en") == "mask" diff --git a/tests/test_utils.py b/tests/test_utils.py index dfa30b9..56937f3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,13 +9,10 @@ def test_normalize_token() -> None: - # decomposed (NFD) input is recomposed to NFC nfd = unicodedata.normalize("NFD", "Häuser") - assert nfd != "Häuser" # the two byte sequences genuinely differ + assert nfd != "Häuser" assert normalize_token(nfd) == "Häuser" - # already-NFC input is returned byte-identical (idempotent) assert normalize_token("Häuser") == "Häuser" - assert normalize_token(normalize_token(nfd)) == normalize_token(nfd) # NFC, not NFKC: compatibility characters are preserved, not folded assert normalize_token("fi") == "fi" # ligature stays, would become "fi" under NFKC From 63b336f9e8952a6ac8f3d72d5eac2eef011b977b Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 11 Jun 2026 21:13:42 +0200 Subject: [PATCH 3/4] further hardening and fixes --- simplemma/lemmatizer.py | 5 +++-- simplemma/strategies/affix_decomposition.py | 7 +++---- simplemma/strategies/dictionaries/dictionary_factory.py | 3 ++- simplemma/strategies/dictionary_lookup.py | 2 +- simplemma/strategies/greedy_dictionary_lookup.py | 8 ++++++-- simplemma/strategies/hyphen_removal.py | 4 +++- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py index 36b2404..7eb37ec 100644 --- a/simplemma/lemmatizer.py +++ b/simplemma/lemmatizer.py @@ -158,6 +158,7 @@ def get_lemmas_in_text( greedy=True, dictionary_factory=_legacy_dictionary_factory ) ) +_legacy_dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory) def is_known(token: str, lang: str | tuple[str, ...]) -> bool: @@ -175,9 +176,9 @@ def is_known(token: str, lang: str | tuple[str, ...]) -> bool: token = normalize_token(token) lang = validate_lang_input(lang) - dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory) return any( - dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang + _legacy_dictionary_lookup.get_lemma(token, lang_code) is not None + for lang_code in lang ) diff --git a/simplemma/strategies/affix_decomposition.py b/simplemma/strategies/affix_decomposition.py index a19a2d0..c568bcd 100644 --- a/simplemma/strategies/affix_decomposition.py +++ b/simplemma/strategies/affix_decomposition.py @@ -3,7 +3,7 @@ """ from .dictionary_lookup import DictionaryLookupStrategy -from .greedy_dictionary_lookup import SHORTER_GREEDY, GreedyDictionaryLookupStrategy +from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy, greedy_min_length from .lemmatization_strategy import LemmatizationStrategy # TODO: This custom behavior has to be simplified before it becomes unmaintainable @@ -75,10 +75,9 @@ def get_lemma(self, token: str, lang: str) -> str | None: Returns: str | None: The lemma of the token if found, or None otherwise. """ - limit = 6 if lang in SHORTER_GREEDY else 8 if ( (not self._greedy and lang not in AFFIX_LANGS) - or len(token) <= limit + or len(token) <= greedy_min_length(lang) or len(token) > MAXLEN ): return None @@ -158,7 +157,7 @@ def _suffix_decomposition( suffix = self._dictionary_lookup.get_lemma( token[-count:].capitalize(), lang ) - if suffix is not None and len(suffix) <= len(token[-count:]): + if suffix is not None and len(suffix) <= count: return token[:-count] + suffix.lower() return None diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py index ad9cb62..a07ba14 100644 --- a/simplemma/strategies/dictionaries/dictionary_factory.py +++ b/simplemma/strategies/dictionaries/dictionary_factory.py @@ -43,7 +43,8 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]: filepath = DATA_FOLDER / f"{langcode}.plzma" with lzma.open(filepath, "rb") as filehandle: pickled_dict = pickle.load(filehandle) - assert isinstance(pickled_dict, dict) + if not isinstance(pickled_dict, dict): + raise TypeError(f"unexpected data in {filepath}: {type(pickled_dict)}") return pickled_dict diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py index 9f98fd0..25c6957 100644 --- a/simplemma/strategies/dictionary_lookup.py +++ b/simplemma/strategies/dictionary_lookup.py @@ -41,7 +41,7 @@ def get_lemma(self, token: str, lang: str) -> str | None: """ # Search the language data, reverse case to extend coverage. dictionary = self._dictionary_factory.get_dictionary(lang) - if result := dictionary.get(token): + if (result := dictionary.get(token)) is not None: return result # Try upper or lowercase (token[:1] stays empty-safe for empty input). token = token.lower() if token[:1].isupper() else token.capitalize() diff --git a/simplemma/strategies/greedy_dictionary_lookup.py b/simplemma/strategies/greedy_dictionary_lookup.py index fea33da..10a8ef2 100644 --- a/simplemma/strategies/greedy_dictionary_lookup.py +++ b/simplemma/strategies/greedy_dictionary_lookup.py @@ -10,6 +10,11 @@ SHORTER_GREEDY = {"bg", "et", "fi", "lv"} +def greedy_min_length(lang: str) -> int: + """Shortest token worth decomposing; shorter ones are returned/skipped as-is.""" + return 6 if lang in SHORTER_GREEDY else 8 + + class GreedyDictionaryLookupStrategy(LemmatizationStrategy): """ This class represents a lemmatization strategy that performs lemmatization using a greedy dictionary lookup strategy. @@ -53,8 +58,7 @@ def get_lemma(self, token: str, lang: str) -> str: str: The lemma for the token. """ - limit = 6 if lang in SHORTER_GREEDY else 8 - if len(token) <= limit: + if len(token) <= greedy_min_length(lang): return token dictionary = self._dictionary_factory.get_dictionary(lang) diff --git a/simplemma/strategies/hyphen_removal.py b/simplemma/strategies/hyphen_removal.py index c44a3d0..2e7d5ee 100644 --- a/simplemma/strategies/hyphen_removal.py +++ b/simplemma/strategies/hyphen_removal.py @@ -53,8 +53,10 @@ def get_lemma(self, token: str, lang: str) -> str | None: str | None: The lemma for the token, or None if no lemma is found. """ + if not any(hyphen in token for hyphen in HYPHENS): + return None token_parts = HYPHEN_REGEX.split(token) - if len(token_parts) <= 1 or not token_parts[-1]: + if not token_parts[-1]: return None # try to find a word form without hyphen From b07b5d07c2a2674f8b15fad36696980a610cb1f8 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 11 Jun 2026 21:27:48 +0200 Subject: [PATCH 4/4] restore coverage --- .../strategies/dictionaries/dictionary_factory.py | 2 +- .../dictionaries/test_dictionary_factory.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py index a07ba14..e9616af 100644 --- a/simplemma/strategies/dictionaries/dictionary_factory.py +++ b/simplemma/strategies/dictionaries/dictionary_factory.py @@ -34,7 +34,7 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]: dict[str, str]: The loaded dictionary. Raises: - AssertionError: If the loaded object is not a dictionary. + TypeError: If the loaded object is not a dictionary. Note: This function assumes that the dictionary file is stored in the 'data' folder relative to this module. diff --git a/tests/strategies/dictionaries/test_dictionary_factory.py b/tests/strategies/dictionaries/test_dictionary_factory.py index 39f7ac0..a6ac44c 100644 --- a/tests/strategies/dictionaries/test_dictionary_factory.py +++ b/tests/strategies/dictionaries/test_dictionary_factory.py @@ -1,7 +1,19 @@ import pytest from simplemma.strategies import DefaultDictionaryFactory -from simplemma.strategies.dictionaries.dictionary_factory import MappingStrToByteString +from simplemma.strategies.dictionaries.dictionary_factory import ( + MappingStrToByteString, + _load_dictionary_from_disk, +) + + +def test_load_dictionary_rejects_non_dict(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "simplemma.strategies.dictionaries.dictionary_factory.pickle.load", + lambda _filehandle: ["not", "a", "dict"], + ) + with pytest.raises(TypeError, match="unexpected data"): + _load_dictionary_from_disk("en") def test_mapping_str_to_bytestring() -> None: