From 26090eb1cf65eba4dab64ceb4c6e404e97d0ef94 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Thu, 11 Jun 2026 17:05:49 +0200
Subject: [PATCH 1/4] fix: code hardening to prevent bugs

---
 .github/workflows/tests.yml                   |  8 +--
 docs/contributing.md                          |  2 +-
 pyproject.toml                                |  9 ++-
 simplemma/language_detector.py                |  5 +-
 simplemma/lemmatizer.py                       | 16 +++--
 simplemma/strategies/affix_decomposition.py   | 69 +++++++++----------
 .../dictionaries/dictionary_factory.py        | 35 +++++++---
 .../dictionaries/trie_dictionary_factory.py   |  5 ++
 simplemma/strategies/dictionary_lookup.py     |  2 +
 simplemma/token_sampler.py                    |  2 +-
 simplemma/utils.py                            | 19 +++++
 .../dictionaries/test_dictionary_factory.py   |  9 ++-
 tests/strategies/test_strategies.py           | 21 ++++++
 tests/test_language_detector.py               |  5 ++
 tests/test_lemmatizer.py                      | 36 ++++++++--
 tests/test_token_sampler.py                   |  6 ++
 tests/test_utils.py                           | 34 +++++----
 17 files changed, 200 insertions(+), 83 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 59ab589..58d1d2c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -54,12 +54,8 @@ jobs:
       run: python -m pip install --ignore-installed ".[marisa-trie]"
 
     # tests
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Lint with ruff
+      run: ruff check .
 
     - name: Code format with black
       run: black --check --diff simplemma training tests
diff --git a/docs/contributing.md b/docs/contributing.md
index fbac532..0a31a60 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -22,7 +22,7 @@ checks that CI runs:
 # Code style
 black --check --diff simplemma training tests
 # Linting
-flake8 simplemma training tests
+ruff check .
 # Type checking
 mypy -p simplemma -p training -p tests
 # Tests
diff --git a/pyproject.toml b/pyproject.toml
index 0b7e0fa..136ca02 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,8 +116,8 @@ test = [
 dev = [
     "simplemma[test]",
     "black == 26.5.1",
-    "flake8 == 7.3.0",
     "mypy == 2.1.0",
+    "ruff == 0.15.16",
     "types-requests == 2.33.0.20260518",
 ]
 docs = [
@@ -140,6 +140,13 @@ disallow_untyped_defs = false
 disallow_incomplete_defs = false
 check_untyped_defs = false
 
+[tool.ruff]
+target-version = "py310"
+line-length = 88
+# Linting uses ruff's default rule set (E4, E7, E9, F): pyflakes plus the
+# import, statement and runtime-error pycodestyle checks. Formatting is handled
+# by black.
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 
diff --git a/simplemma/language_detector.py b/simplemma/language_detector.py
index 24938da..102787b 100644
--- a/simplemma/language_detector.py
+++ b/simplemma/language_detector.py
@@ -15,7 +15,7 @@
     RelaxedMostCommonTokenSampler,
     TokenSampler,
 )
-from .utils import validate_lang_input
+from .utils import normalize_token, validate_lang_input
 
 
 def in_target_language(
@@ -67,6 +67,7 @@ def langdetect(
             and their respective proportions.
     """
 
+    list_results: list[tuple[str, float]] = []
     for token_sampler in token_samplers:
         results = LanguageDetector(
             lang, token_sampler, DefaultStrategy(greedy)
@@ -159,6 +160,7 @@ def proportion_in_each_language(
         known_tokens_count = dict.fromkeys(self._lang, 0)
         unknown_tokens_count = 0
         for token in tokens:
+            token = normalize_token(token)
             token_found = False
             for lang_code in self._lang:
                 candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
@@ -194,6 +196,7 @@ def proportion_in_target_languages(
 
         in_target = 0
         for token in tokens:
+            token = normalize_token(token)
             for lang_code in self._lang:
                 candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
                 if candidate is not None:
diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py
index 0678dfb..36b2404 100644
--- a/simplemma/lemmatizer.py
+++ b/simplemma/lemmatizer.py
@@ -22,7 +22,7 @@
     ToLowercaseFallbackStrategy,
 )
 from .tokenizer import RegexTokenizer, Tokenizer
-from .utils import validate_lang_input
+from .utils import normalize_token, validate_lang_input
 
 PUNCTUATION = {".", "?", "!", "…", "¿", "¡"}
 
@@ -41,7 +41,7 @@ def _control_input_type(token: Any) -> None:
     if not isinstance(token, str):
         raise TypeError(f"Wrong input type, expected string, got {type(token)}")
     if token == "":
-        raise ValueError("Wrong input type: empty string")
+        raise ValueError("Wrong input value: empty string")
 
 
 class Lemmatizer:
@@ -56,7 +56,7 @@ class Lemmatizer:
 
     def __init__(
         self,
-        cache_max_size: int = 1048576,
+        cache_max_size: int = 65536,
         tokenizer: Tokenizer = RegexTokenizer(),
         lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
         fallback_lemmatization_strategy: LemmatizationFallbackStrategy = ToLowercaseFallbackStrategy(),
@@ -66,7 +66,7 @@ def __init__(
 
         Args:
             cache_max_size (int, optional): The maximum size of the cache for the lemmatization results.
-                Defaults to `1048576`.
+                Defaults to `65536`.
             tokenizer (Tokenizer, optional): The tokenizer to use for tokenization.
                 Defaults to `RegexTokenizer()`.
             lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization strategy to use.
@@ -94,7 +94,8 @@ def lemmatize(
         Returns:
             str: The lemmatized form of the token.
         """
-        return self._cached_lemmatize(token, lang)
+        # NFC before caching: canonical key, matches the NFC dictionaries.
+        return self._cached_lemmatize(normalize_token(token), lang)
 
     def _lemmatize(
         self,
@@ -103,6 +104,10 @@ def _lemmatize(
     ) -> str:
         """Internal method to lemmatize a token in the specified language(s).
 
+        The token arrives NFC-normalized by ``lemmatize``. Input validation
+        happens here so it only runs on cache misses, keeping hits cheap
+        (exceptions are never cached by ``lru_cache``).
+
         Args:
             token: The token to lemmatize.
             lang: The language or languages for lemmatization.
@@ -167,6 +172,7 @@ def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
     """
 
     _control_input_type(token)
+    token = normalize_token(token)
     lang = validate_lang_input(lang)
 
     dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
diff --git a/simplemma/strategies/affix_decomposition.py b/simplemma/strategies/affix_decomposition.py
index 163adfd..a19a2d0 100644
--- a/simplemma/strategies/affix_decomposition.py
+++ b/simplemma/strategies/affix_decomposition.py
@@ -29,6 +29,8 @@
 AFFIXLEN = 2
 LONGAFFIXLEN = 5  # better for some languages
 MINCOMPLEN = 4
+# Decomposition is ~O(len²); cap long tokens (longest real form is 86 chars).
+MAXLEN = 100
 
 
 class AffixDecompositionStrategy(LemmatizationStrategy):
@@ -74,7 +76,11 @@ def get_lemma(self, token: str, lang: str) -> str | None:
             str | None: The lemma of the token if found, or None otherwise.
         """
         limit = 6 if lang in SHORTER_GREEDY else 8
-        if (not self._greedy and lang not in AFFIX_LANGS) or len(token) <= limit:
+        if (
+            (not self._greedy and lang not in AFFIX_LANGS)
+            or len(token) <= limit
+            or len(token) > MAXLEN
+        ):
             return None
 
         # define parameters
@@ -104,43 +110,30 @@ def _affix_decomposition(
         Returns:
             str | None: The lemma of the token if found, or None otherwise.
         """
-        # this only makes sense for languages written from left to right
-        # AFFIXLEN or MINCOMPLEN can spare time for some languages
-        for affixlen in range(max_affix_len, 1, -1):
-            for count in range(1, len(token) - min_complem_len + 1):
-                part1 = token[:-count]
-                # part1_aff = candidate[:-(count + affixlen)]p
-                lempart1 = self._dictionary_lookup.get_lemma(part1, lang)
-                if lempart1 is None:
-                    continue
-                # maybe an affix? discard it
-                if count <= affixlen:
-                    return lempart1
-                # account for case before looking for second part
-                part2 = token[-count:]
-                if token[0].isupper():
-                    part2 = part2.capitalize()
-                lempart2 = self._dictionary_lookup.get_lemma(part2, lang)
-                if lempart2 is None:
-                    continue
-                # candidate must be shorter
-                # try other case
-                candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang)
-                # shorten the second known part of the token
-                if candidate is not None and len(candidate) < len(part2):
-                    return part1 + candidate.lower()
-                # backup: equal length or further candidates accepted
-                # try without capitalizing
-                # even greedier
-                # with capital letter?
-                if len(lempart2) < len(part2) + affixlen:
-                    return part1 + lempart2.lower()
-                    # print(part1, part2, affixlen, count, newcandidate, planb)
-                # elif newcandidate and len(newcandidate) < len(part2) + affixlen:
-                # plan_b = part1 + newcandidate.lower()
-                # print(part1, part2, affixlen, count, newcandidate, planb)
-                # else:
-                #    print(part1, part2, affixlen, count, newcandidate)
+        # Left-to-right languages only. A single pass at the largest affix
+        # length is equivalent to looping over smaller ones (first match wins).
+        for count in range(1, len(token) - min_complem_len + 1):
+            part1 = token[:-count]
+            lempart1 = self._dictionary_lookup.get_lemma(part1, lang)
+            if lempart1 is None:
+                continue
+            # maybe an affix? discard it
+            if count <= max_affix_len:
+                return lempart1
+            # account for case before looking for second part
+            part2 = token[-count:]
+            if token[0].isupper():
+                part2 = part2.capitalize()
+            lempart2 = self._dictionary_lookup.get_lemma(part2, lang)
+            if lempart2 is None:
+                continue
+            # prefer a shorter greedy form of the second part
+            candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang)
+            if candidate is not None and len(candidate) < len(part2):
+                return part1 + candidate.lower()
+            # backup: accept the dictionary form if not longer than the affix bound
+            if len(lempart2) < len(part2) + max_affix_len:
+                return part1 + lempart2.lower()
         return None
 
     def _suffix_decomposition(
diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py
index 005cda6..ad9cb62 100644
--- a/simplemma/strategies/dictionaries/dictionary_factory.py
+++ b/simplemma/strategies/dictionaries/dictionary_factory.py
@@ -13,11 +13,14 @@
 from abc import abstractmethod
 from functools import lru_cache
 from pathlib import Path
-from typing import Protocol
+from typing import Protocol, TypeVar, overload
 from collections.abc import Iterator, Mapping
 
+_T = TypeVar("_T")
+
 DATA_FOLDER = Path(__file__).parent / "data"
-SUPPORTED_LANGUAGES = [f.stem for f in DATA_FOLDER.glob("*.plzma")]
+# frozenset: O(1) membership, checked on every get_dictionary call.
+SUPPORTED_LANGUAGES = frozenset(f.stem for f in DATA_FOLDER.glob("*.plzma"))
 
 
 def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
@@ -86,6 +89,16 @@ def __init__(self, dictionary: dict[bytes, bytes]) -> None:
     def __getitem__(self, item: str) -> str:
         return self._dict[item.encode()].decode()
 
+    # The overloads mirror Mapping.get's signature for strict mypy.
+    @overload
+    def get(self, key: str) -> str | None: ...
+    @overload
+    def get(self, key: str, default: str | _T) -> str | _T: ...
+    def get(self, key: str, default: str | _T | None = None) -> str | _T | None:
+        # Avoids Mapping.get's EAFP path (a KeyError raised on every miss).
+        value = self._dict.get(key.encode())
+        return value.decode() if value is not None else default
+
     def __iter__(self) -> Iterator[str]:
         for key in self._dict:
             yield key.decode()
@@ -102,7 +115,7 @@ class DefaultDictionaryFactory(DictionaryFactory):
     It provides functionality for loading and caching dictionaries from disk that are included in Simplemma.
     """
 
-    __slots__ = ["_load_dictionary_from_disk"]
+    __slots__ = ["_get_dictionary"]
 
     def __init__(self, cache_max_size: int = 8) -> None:
         """
@@ -112,10 +125,18 @@ def __init__(self, cache_max_size: int = 8) -> None:
             cache_max_size (int): The maximum size of the cache for loaded dictionaries.
                 Defaults to `8`.
         """
-        self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
-            _load_dictionary_from_disk
+        # Cache the wrapper, not the raw dict, to avoid re-wrapping on every
+        # call; the lru evicts wrapper and dict together, bounding memory.
+        self._get_dictionary = lru_cache(maxsize=cache_max_size)(
+            self._get_dictionary_uncached
         )
 
+    def _get_dictionary_uncached(self, lang: str) -> Mapping[str, str]:
+        """Build the dictionary for a language, without caching."""
+        if lang not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {lang}")
+        return MappingStrToByteString(_load_dictionary_from_disk(lang))
+
     def get_dictionary(
         self,
         lang: str,
@@ -132,6 +153,4 @@ def get_dictionary(
         Raises:
             ValueError: If the specified language is not supported.
         """
-        if lang not in SUPPORTED_LANGUAGES:
-            raise ValueError(f"Unsupported language: {lang}")
-        return MappingStrToByteString(self._load_dictionary_from_disk(lang))
+        return self._get_dictionary(lang)
diff --git a/simplemma/strategies/dictionaries/trie_dictionary_factory.py b/simplemma/strategies/dictionaries/trie_dictionary_factory.py
index 742ac8d..7624f87 100644
--- a/simplemma/strategies/dictionaries/trie_dictionary_factory.py
+++ b/simplemma/strategies/dictionaries/trie_dictionary_factory.py
@@ -36,6 +36,11 @@ def __init__(self, trie: BytesTrie) -> None:
     def __getitem__(self, item: str) -> Any:
         return self._trie[item][0].decode()
 
+    def get(self, key: str, default: Any = None) -> Any:
+        # Avoids MutableMapping.get's EAFP path (a KeyError raised on every miss).
+        value = self._trie.get(key)
+        return value[0].decode() if value else default
+
     def __setitem__(self, key: Any, value: Any) -> None:
         raise NotImplementedError
 
diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py
index 059ec64..e18c1b0 100644
--- a/simplemma/strategies/dictionary_lookup.py
+++ b/simplemma/strategies/dictionary_lookup.py
@@ -39,6 +39,8 @@ def get_lemma(self, token: str, lang: str) -> str | None:
             str | None: The lemma for the token, or `None` if not found in the dictionary.
 
         """
+        if not token:
+            return None
         # Search the language data, reverse case to extend coverage.
         dictionary = self._dictionary_factory.get_dictionary(lang)
         if result := dictionary.get(token):
diff --git a/simplemma/token_sampler.py b/simplemma/token_sampler.py
index 5c130ba..0516a1b 100644
--- a/simplemma/token_sampler.py
+++ b/simplemma/token_sampler.py
@@ -150,7 +150,7 @@ def sample_tokens(self, tokens: Iterable[str]) -> list[str]:
         counter = Counter(tokens)
 
         if self._capitalized_threshold > 0:
-            deletions = [token for token in counter if token[0].isupper()]
+            deletions = [token for token in counter if token and token[0].isupper()]
             if len(deletions) < self._capitalized_threshold * len(counter):
                 for token in deletions:
                     del counter[token]
diff --git a/simplemma/utils.py b/simplemma/utils.py
index 98cd51e..a4297b0 100644
--- a/simplemma/utils.py
+++ b/simplemma/utils.py
@@ -4,8 +4,24 @@
 
 - [levenshtein_dist][simplemma.utils.levenshtein_dist]: Calculates the Levenshtein distance between two strings.
 - [validate_lang_input][simplemma.utils.validate_lang_input]: Validates the language input and ensures it is a valid tuple.
+- [normalize_token][simplemma.utils.normalize_token]: Normalizes a token to Unicode NFC form.
 """
 
+import unicodedata
+
+
+def normalize_token(token: str) -> str:
+    """
+    Normalize a token to Unicode NFC, matching the shipped dictionaries.
+
+    Args:
+        token (str): The input token.
+
+    Returns:
+        str: The token in NFC form.
+    """
+    return unicodedata.normalize("NFC", token)
+
 
 def validate_lang_input(lang: str | tuple[str, ...]) -> tuple[str, ...]:
     """
@@ -19,6 +35,7 @@ def validate_lang_input(lang: str | tuple[str, ...]) -> tuple[str, ...]:
 
     Raises:
         TypeError: If the lang argument is not a tuple or a string.
+        ValueError: If the lang argument is empty.
 
     """
     # convert string
@@ -26,6 +43,8 @@ def validate_lang_input(lang: str | tuple[str, ...]) -> tuple[str, ...]:
         lang = (lang,)
     if not isinstance(lang, tuple):
         raise TypeError("lang argument must be a two-letter language code")
+    if not lang:
+        raise ValueError("lang argument is empty: provide at least one language code")
     return lang
 
 
diff --git a/tests/strategies/dictionaries/test_dictionary_factory.py b/tests/strategies/dictionaries/test_dictionary_factory.py
index b6b7743..39f7ac0 100644
--- a/tests/strategies/dictionaries/test_dictionary_factory.py
+++ b/tests/strategies/dictionaries/test_dictionary_factory.py
@@ -36,8 +36,7 @@ def test_dictionary_cache() -> None:
     for _ in range(iterations):
         dictionaries.get_dictionary("en")
         dictionaries.get_dictionary("de")
-    assert dictionaries._load_dictionary_from_disk.cache_info().misses == 2
-    assert (
-        dictionaries._load_dictionary_from_disk.cache_info().hits
-        == (iterations - 1) * 2
-    )
+    assert dictionaries._get_dictionary.cache_info().misses == 2
+    assert dictionaries._get_dictionary.cache_info().hits == (iterations - 1) * 2
+    # the cached wrapper itself is reused, not just the underlying dict
+    assert dictionaries.get_dictionary("en") is dictionaries.get_dictionary("en")
diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py
index 2196522..5e63c4f 100644
--- a/tests/strategies/test_strategies.py
+++ b/tests/strategies/test_strategies.py
@@ -15,6 +15,8 @@ def test_search() -> None:
 
     assert DictionaryLookupStrategy().get_lemma("dritte", "de") == "dritt"
     assert DictionaryLookupStrategy().get_lemma("Dritte", "de") == "Dritter"
+    # empty token must not crash the case-flip retry
+    assert DictionaryLookupStrategy().get_lemma("", "en") is None
 
     assert HyphenRemovalStrategy().get_lemma("magni-ficent", "en") == "magnificent"
     assert HyphenRemovalStrategy().get_lemma("magni-ficents", "en") is None
@@ -53,4 +55,23 @@ def test_search() -> None:
 
     assert AffixDecompositionStrategy(greedy=True).get_lemma("ccc", "de") is None
 
+    # tokens longer than the safety cap are rejected outright (guards against
+    # the quadratic blow-up that would otherwise hang on hostile/long input)
+    affix = AffixDecompositionStrategy(greedy=True)
+    assert affix.get_lemma("a" * 101, "fi") is None
+    assert affix.get_lemma("a" * 100000, "fi") is None
+
     assert PrefixDecompositionStrategy().get_lemma("auf", "de") is None
+
+
+def test_affix_decomposition() -> None:
+    """Regression anchors for the long-affix languages (max_affix_len=5).
+
+    These guard the single-pass affix decomposition: each token is resolved by
+    stripping a multi-character affix, the path that the former decreasing-affixlen
+    loop exercised.
+    """
+    affix = AffixDecompositionStrategy(greedy=True)
+    assert affix.get_lemma("kissammeko", "fi") == "kissa"  # FI "and our cat?" -> cat
+    assert affix.get_lemma("könyveiteket", "hu") == "könyv"  # HU "your books" -> book
+    assert affix.get_lemma("raamatutest", "et") == "raamat"  # ET "from books" -> book
diff --git a/tests/test_language_detector.py b/tests/test_language_detector.py
index 21d78d8..4a24bad 100644
--- a/tests/test_language_detector.py
+++ b/tests/test_language_detector.py
@@ -6,6 +6,11 @@
 from .test_token_sampler import CustomTokenSampler
 
 
+def test_langdetect_no_samplers() -> None:
+    # no samplers means no results, not an UnboundLocalError
+    assert langdetect("Dies ist ein Test.", lang=("de", "en"), token_samplers=[]) == []
+
+
 def test_proportion_in_each_language() -> None:
     # sanity checks
     assert LanguageDetector(
diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
index ad3a06b..ea76d73 100644
--- a/tests/test_lemmatizer.py
+++ b/tests/test_lemmatizer.py
@@ -1,5 +1,6 @@
 """Tests for `simplemma` package."""
 
+import unicodedata
 from collections.abc import Mapping
 
 import pytest
@@ -477,11 +478,11 @@ def test_is_known() -> None:
     with pytest.raises(ValueError):
         assert is_known("", lang="en") is None
 
-    assert is_known("FanCY", lang="en") == True
-    assert is_known("Fancy-String", lang="en") == False
+    assert is_known("FanCY", lang="en")
+    assert not is_known("Fancy-String", lang="en")
 
-    assert is_known("espejos", lang=("es", "de")) == True
-    assert is_known("espejos", lang=("de", "es")) == True
+    assert is_known("espejos", lang=("es", "de"))
+    assert is_known("espejos", lang=("de", "es"))
 
 
 def test_get_lemmas_in_text() -> None:
@@ -583,3 +584,30 @@ def test_get_lemmas_in_text() -> None:
             ".",
         ]
     )
+
+
+def test_nfc_normalization() -> None:
+    """Decomposed (NFD) input must lemmatize like its composed (NFC) form."""
+    nfc = "Häuser"
+    nfd = unicodedata.normalize("NFD", nfc)
+    assert nfd != nfc  # genuinely different byte sequences
+    # both forms resolve to the same lemma (previously NFD silently failed)
+    assert lemmatize(nfd, lang="de") == lemmatize(nfc, lang="de") == "Haus"
+    # is_known agrees on both forms
+    assert is_known(nfd, lang="de") == is_known(nfc, lang="de") is True
+    # output is always NFC, even when the input was decomposed
+    café_nfd = unicodedata.normalize("NFD", "café")
+    assert lemmatize(café_nfd, lang="fr") == unicodedata.normalize(
+        "NFC", lemmatize(café_nfd, lang="fr")
+    )
+
+
+def test_long_token_does_not_hang() -> None:
+    """A pathologically long token must return quickly, not trigger O(n²) decomposition."""
+    # Before the affix-length cap this hung for ~minutes; it must now be instant
+    # and fall through to the fallback (the token is unknown), returned unchanged.
+    long_token = "a" * 50000
+    assert lemmatize(long_token, lang="fi") == long_token
+    # a token just over the cap is also handled, while normal-length input still lemmatizes
+    assert lemmatize("a" * 101, lang="fi") == "a" * 101
+    assert lemmatize("masks", lang="en") == "mask"
diff --git a/tests/test_token_sampler.py b/tests/test_token_sampler.py
index 9e1de3a..75a786b 100644
--- a/tests/test_token_sampler.py
+++ b/tests/test_token_sampler.py
@@ -34,6 +34,12 @@ def test_token_sampler():
     assert custom.sample_text("ABCD Efgh ijkl mn") == []
 
 
+def test_sample_tokens_empty_token() -> None:
+    # an empty string in the iterable must not crash the capitalization filter
+    sampler = MostCommonTokenSampler()
+    assert sampler.sample_tokens(["hello", "", "World", "hello"]) == ["hello", ""]
+
+
 def test_capitalized_threshold() -> None:
     sampler = MostCommonTokenSampler()  # default capitalized_threshold=0.8
     # capitalized tokens are a minority (2 of 3 < 0.8 * 3) -> they are removed
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bbc88f2..dfa30b9 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,29 +1,37 @@
 """Tests for `simplemma.utils`."""
 
+import unicodedata
+
 import pytest
 
-from simplemma import is_known
-from simplemma.utils import levenshtein_dist, validate_lang_input
+from simplemma import is_known, lemmatize
+from simplemma.utils import levenshtein_dist, normalize_token, validate_lang_input
+
+
+def test_normalize_token() -> None:
+    # decomposed (NFD) input is recomposed to NFC
+    nfd = unicodedata.normalize("NFD", "Häuser")
+    assert nfd != "Häuser"  # the two byte sequences genuinely differ
+    assert normalize_token(nfd) == "Häuser"
+    # already-NFC input is returned byte-identical (idempotent)
+    assert normalize_token("Häuser") == "Häuser"
+    assert normalize_token(normalize_token(nfd)) == normalize_token(nfd)
+    # NFC, not NFKC: compatibility characters are preserved, not folded
+    assert normalize_token("ﬁ") == "ﬁ"  # ligature stays, would become "fi" under NFKC
 
 
 def test_validate_lang_input() -> None:
-    # a string is wrapped into a one-element tuple
     assert validate_lang_input("en") == ("en",)
-    # a tuple is passed through unchanged
     assert validate_lang_input(("de", "en")) == ("de", "en")
-    # an empty tuple is passed through unchanged (there is no guard against it;
-    # downstream this makes the lemmatizer fallback raise StopIteration on
-    # next(iter(lang)))
-    assert validate_lang_input(()) == ()
-    # non-str / non-tuple inputs raise TypeError. This branch is otherwise
-    # never reached through lemmatize(), where an unhashable lang argument
-    # fails earlier in the lru_cache rather than in validate_lang_input.
+    # empty lang must fail cleanly, not as a downstream StopIteration
+    with pytest.raises(ValueError):
+        validate_lang_input(())
+    with pytest.raises(ValueError):
+        lemmatize("test", ())
     with pytest.raises(TypeError):
         validate_lang_input(123)  # type: ignore[arg-type]
-    # a list is the realistic mistake (mutable, looks tuple-like)
     with pytest.raises(TypeError):
         validate_lang_input(["en"])  # type: ignore[arg-type]
-    # end-to-end through is_known(), which validates directly without caching
     with pytest.raises(TypeError):
         is_known("test", lang=123)  # type: ignore[arg-type]
 

From 73e555806fe19c262caa928d14de53a19abb01e9 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Thu, 11 Jun 2026 19:37:04 +0200
Subject: [PATCH 2/4] fix: simplify

---
 simplemma/strategies/dictionary_lookup.py |  6 ++----
 simplemma/token_sampler.py                |  2 +-
 tests/strategies/test_strategies.py       | 10 ++--------
 tests/test_lemmatizer.py                  | 22 ++++++----------------
 tests/test_utils.py                       |  5 +----
 5 files changed, 12 insertions(+), 33 deletions(-)

diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py
index e18c1b0..9f98fd0 100644
--- a/simplemma/strategies/dictionary_lookup.py
+++ b/simplemma/strategies/dictionary_lookup.py
@@ -39,12 +39,10 @@ def get_lemma(self, token: str, lang: str) -> str | None:
             str | None: The lemma for the token, or `None` if not found in the dictionary.
 
         """
-        if not token:
-            return None
         # Search the language data, reverse case to extend coverage.
         dictionary = self._dictionary_factory.get_dictionary(lang)
         if result := dictionary.get(token):
             return result
-        # Try upper or lowercase.
-        token = token.lower() if token[0].isupper() else token.capitalize()
+        # Try upper or lowercase (token[:1] stays empty-safe for empty input).
+        token = token.lower() if token[:1].isupper() else token.capitalize()
         return dictionary.get(token)
diff --git a/simplemma/token_sampler.py b/simplemma/token_sampler.py
index 0516a1b..f9a3353 100644
--- a/simplemma/token_sampler.py
+++ b/simplemma/token_sampler.py
@@ -150,7 +150,7 @@ def sample_tokens(self, tokens: Iterable[str]) -> list[str]:
         counter = Counter(tokens)
 
         if self._capitalized_threshold > 0:
-            deletions = [token for token in counter if token and token[0].isupper()]
+            deletions = [token for token in counter if token[:1].isupper()]
             if len(deletions) < self._capitalized_threshold * len(counter):
                 for token in deletions:
                     del counter[token]
diff --git a/tests/strategies/test_strategies.py b/tests/strategies/test_strategies.py
index 5e63c4f..72e7562 100644
--- a/tests/strategies/test_strategies.py
+++ b/tests/strategies/test_strategies.py
@@ -55,8 +55,7 @@ def test_search() -> None:
 
     assert AffixDecompositionStrategy(greedy=True).get_lemma("ccc", "de") is None
 
-    # tokens longer than the safety cap are rejected outright (guards against
-    # the quadratic blow-up that would otherwise hang on hostile/long input)
+    # tokens over the safety cap are rejected outright (quadratic blow-up guard)
     affix = AffixDecompositionStrategy(greedy=True)
     assert affix.get_lemma("a" * 101, "fi") is None
     assert affix.get_lemma("a" * 100000, "fi") is None
@@ -65,12 +64,7 @@ def test_search() -> None:
 
 
 def test_affix_decomposition() -> None:
-    """Regression anchors for the long-affix languages (max_affix_len=5).
-
-    These guard the single-pass affix decomposition: each token is resolved by
-    stripping a multi-character affix, the path that the former decreasing-affixlen
-    loop exercised.
-    """
+    """Single-pass affix decomposition resolves multi-character affixes (max_affix_len=5)."""
     affix = AffixDecompositionStrategy(greedy=True)
     assert affix.get_lemma("kissammeko", "fi") == "kissa"  # FI "and our cat?" -> cat
     assert affix.get_lemma("könyveiteket", "hu") == "könyv"  # HU "your books" -> book
diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
index ea76d73..05cb3f6 100644
--- a/tests/test_lemmatizer.py
+++ b/tests/test_lemmatizer.py
@@ -588,26 +588,16 @@ def test_get_lemmas_in_text() -> None:
 
 def test_nfc_normalization() -> None:
     """Decomposed (NFD) input must lemmatize like its composed (NFC) form."""
-    nfc = "Häuser"
-    nfd = unicodedata.normalize("NFD", nfc)
-    assert nfd != nfc  # genuinely different byte sequences
-    # both forms resolve to the same lemma (previously NFD silently failed)
-    assert lemmatize(nfd, lang="de") == lemmatize(nfc, lang="de") == "Haus"
-    # is_known agrees on both forms
-    assert is_known(nfd, lang="de") == is_known(nfc, lang="de") is True
+    nfd = unicodedata.normalize("NFD", "Häuser")
+    assert lemmatize(nfd, lang="de") == lemmatize("Häuser", lang="de") == "Haus"
+    assert is_known(nfd, lang="de") == is_known("Häuser", lang="de") is True
     # output is always NFC, even when the input was decomposed
-    café_nfd = unicodedata.normalize("NFD", "café")
-    assert lemmatize(café_nfd, lang="fr") == unicodedata.normalize(
-        "NFC", lemmatize(café_nfd, lang="fr")
-    )
+    out = lemmatize(unicodedata.normalize("NFD", "café"), lang="fr")
+    assert out == unicodedata.normalize("NFC", out)
 
 
 def test_long_token_does_not_hang() -> None:
     """A pathologically long token must return quickly, not trigger O(n²) decomposition."""
-    # Before the affix-length cap this hung for ~minutes; it must now be instant
-    # and fall through to the fallback (the token is unknown), returned unchanged.
-    long_token = "a" * 50000
-    assert lemmatize(long_token, lang="fi") == long_token
-    # a token just over the cap is also handled, while normal-length input still lemmatizes
+    assert lemmatize("a" * 50000, lang="fi") == "a" * 50000  # was minutes, now instant
     assert lemmatize("a" * 101, lang="fi") == "a" * 101
     assert lemmatize("masks", lang="en") == "mask"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index dfa30b9..56937f3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -9,13 +9,10 @@
 
 
 def test_normalize_token() -> None:
-    # decomposed (NFD) input is recomposed to NFC
     nfd = unicodedata.normalize("NFD", "Häuser")
-    assert nfd != "Häuser"  # the two byte sequences genuinely differ
+    assert nfd != "Häuser"
     assert normalize_token(nfd) == "Häuser"
-    # already-NFC input is returned byte-identical (idempotent)
     assert normalize_token("Häuser") == "Häuser"
-    assert normalize_token(normalize_token(nfd)) == normalize_token(nfd)
     # NFC, not NFKC: compatibility characters are preserved, not folded
     assert normalize_token("ﬁ") == "ﬁ"  # ligature stays, would become "fi" under NFKC
 

From 63b336f9e8952a6ac8f3d72d5eac2eef011b977b Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Thu, 11 Jun 2026 21:13:42 +0200
Subject: [PATCH 3/4] further hardening and fixes

---
 simplemma/lemmatizer.py                                 | 5 +++--
 simplemma/strategies/affix_decomposition.py             | 7 +++----
 simplemma/strategies/dictionaries/dictionary_factory.py | 3 ++-
 simplemma/strategies/dictionary_lookup.py               | 2 +-
 simplemma/strategies/greedy_dictionary_lookup.py        | 8 ++++++--
 simplemma/strategies/hyphen_removal.py                  | 4 +++-
 6 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py
index 36b2404..7eb37ec 100644
--- a/simplemma/lemmatizer.py
+++ b/simplemma/lemmatizer.py
@@ -158,6 +158,7 @@ def get_lemmas_in_text(
         greedy=True, dictionary_factory=_legacy_dictionary_factory
     )
 )
+_legacy_dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
 
 
 def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
@@ -175,9 +176,9 @@ def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
     token = normalize_token(token)
     lang = validate_lang_input(lang)
 
-    dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
     return any(
-        dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang
+        _legacy_dictionary_lookup.get_lemma(token, lang_code) is not None
+        for lang_code in lang
     )
 
 
diff --git a/simplemma/strategies/affix_decomposition.py b/simplemma/strategies/affix_decomposition.py
index a19a2d0..c568bcd 100644
--- a/simplemma/strategies/affix_decomposition.py
+++ b/simplemma/strategies/affix_decomposition.py
@@ -3,7 +3,7 @@
 """
 
 from .dictionary_lookup import DictionaryLookupStrategy
-from .greedy_dictionary_lookup import SHORTER_GREEDY, GreedyDictionaryLookupStrategy
+from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy, greedy_min_length
 from .lemmatization_strategy import LemmatizationStrategy
 
 # TODO: This custom behavior has to be simplified before it becomes unmaintainable
@@ -75,10 +75,9 @@ def get_lemma(self, token: str, lang: str) -> str | None:
         Returns:
             str | None: The lemma of the token if found, or None otherwise.
         """
-        limit = 6 if lang in SHORTER_GREEDY else 8
         if (
             (not self._greedy and lang not in AFFIX_LANGS)
-            or len(token) <= limit
+            or len(token) <= greedy_min_length(lang)
             or len(token) > MAXLEN
         ):
             return None
@@ -158,7 +157,7 @@ def _suffix_decomposition(
             suffix = self._dictionary_lookup.get_lemma(
                 token[-count:].capitalize(), lang
             )
-            if suffix is not None and len(suffix) <= len(token[-count:]):
+            if suffix is not None and len(suffix) <= count:
                 return token[:-count] + suffix.lower()
 
         return None
diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py
index ad9cb62..a07ba14 100644
--- a/simplemma/strategies/dictionaries/dictionary_factory.py
+++ b/simplemma/strategies/dictionaries/dictionary_factory.py
@@ -43,7 +43,8 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
     filepath = DATA_FOLDER / f"{langcode}.plzma"
     with lzma.open(filepath, "rb") as filehandle:
         pickled_dict = pickle.load(filehandle)
-        assert isinstance(pickled_dict, dict)
+        if not isinstance(pickled_dict, dict):
+            raise TypeError(f"unexpected data in {filepath}: {type(pickled_dict)}")
         return pickled_dict
 
 
diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py
index 9f98fd0..25c6957 100644
--- a/simplemma/strategies/dictionary_lookup.py
+++ b/simplemma/strategies/dictionary_lookup.py
@@ -41,7 +41,7 @@ def get_lemma(self, token: str, lang: str) -> str | None:
         """
         # Search the language data, reverse case to extend coverage.
         dictionary = self._dictionary_factory.get_dictionary(lang)
-        if result := dictionary.get(token):
+        if (result := dictionary.get(token)) is not None:
             return result
         # Try upper or lowercase (token[:1] stays empty-safe for empty input).
         token = token.lower() if token[:1].isupper() else token.capitalize()
diff --git a/simplemma/strategies/greedy_dictionary_lookup.py b/simplemma/strategies/greedy_dictionary_lookup.py
index fea33da..10a8ef2 100644
--- a/simplemma/strategies/greedy_dictionary_lookup.py
+++ b/simplemma/strategies/greedy_dictionary_lookup.py
@@ -10,6 +10,11 @@
 SHORTER_GREEDY = {"bg", "et", "fi", "lv"}
 
 
+def greedy_min_length(lang: str) -> int:
+    """Shortest token worth decomposing; shorter ones are returned/skipped as-is."""
+    return 6 if lang in SHORTER_GREEDY else 8
+
+
 class GreedyDictionaryLookupStrategy(LemmatizationStrategy):
     """
     This class represents a lemmatization strategy that performs lemmatization using a greedy dictionary lookup strategy.
@@ -53,8 +58,7 @@ def get_lemma(self, token: str, lang: str) -> str:
             str: The lemma for the token.
 
         """
-        limit = 6 if lang in SHORTER_GREEDY else 8
-        if len(token) <= limit:
+        if len(token) <= greedy_min_length(lang):
             return token
 
         dictionary = self._dictionary_factory.get_dictionary(lang)
diff --git a/simplemma/strategies/hyphen_removal.py b/simplemma/strategies/hyphen_removal.py
index c44a3d0..2e7d5ee 100644
--- a/simplemma/strategies/hyphen_removal.py
+++ b/simplemma/strategies/hyphen_removal.py
@@ -53,8 +53,10 @@ def get_lemma(self, token: str, lang: str) -> str | None:
             str | None: The lemma for the token, or None if no lemma is found.
 
         """
+        if not any(hyphen in token for hyphen in HYPHENS):
+            return None
         token_parts = HYPHEN_REGEX.split(token)
-        if len(token_parts) <= 1 or not token_parts[-1]:
+        if not token_parts[-1]:
             return None
 
         # try to find a word form without hyphen

From b07b5d07c2a2674f8b15fad36696980a610cb1f8 Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi <adbar@users.noreply.github.com>
Date: Thu, 11 Jun 2026 21:27:48 +0200
Subject: [PATCH 4/4] restore coverage

---
 .../strategies/dictionaries/dictionary_factory.py  |  2 +-
 .../dictionaries/test_dictionary_factory.py        | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py
index a07ba14..e9616af 100644
--- a/simplemma/strategies/dictionaries/dictionary_factory.py
+++ b/simplemma/strategies/dictionaries/dictionary_factory.py
@@ -34,7 +34,7 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
         dict[str, str]: The loaded dictionary.
 
     Raises:
-        AssertionError: If the loaded object is not a dictionary.
+        TypeError: If the loaded object is not a dictionary.
 
     Note:
         This function assumes that the dictionary file is stored in the 'data' folder relative to this module.
diff --git a/tests/strategies/dictionaries/test_dictionary_factory.py b/tests/strategies/dictionaries/test_dictionary_factory.py
index 39f7ac0..a6ac44c 100644
--- a/tests/strategies/dictionaries/test_dictionary_factory.py
+++ b/tests/strategies/dictionaries/test_dictionary_factory.py
@@ -1,7 +1,19 @@
 import pytest
 
 from simplemma.strategies import DefaultDictionaryFactory
-from simplemma.strategies.dictionaries.dictionary_factory import MappingStrToByteString
+from simplemma.strategies.dictionaries.dictionary_factory import (
+    MappingStrToByteString,
+    _load_dictionary_from_disk,
+)
+
+
+def test_load_dictionary_rejects_non_dict(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setattr(
+        "simplemma.strategies.dictionaries.dictionary_factory.pickle.load",
+        lambda _filehandle: ["not", "a", "dict"],
+    )
+    with pytest.raises(TypeError, match="unexpected data"):
+        _load_dictionary_from_disk("en")
 
 
 def test_mapping_str_to_bytestring() -> None: