adbar · adbar · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -54,12 +54,8 @@ jobs:
       run: python -m pip install --ignore-installed ".[marisa-trie]"
 
     # tests
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Lint with ruff
+      run: ruff check .
 
     - name: Code format with black
       run: black --check --diff simplemma training tests

diff --git a/docs/contributing.md b/docs/contributing.md
@@ -22,7 +22,7 @@ checks that CI runs:
 # Code style
 black --check --diff simplemma training tests
 # Linting
-flake8 simplemma training tests
+ruff check .
 # Type checking
 mypy -p simplemma -p training -p tests
 # Tests

diff --git a/pyproject.toml b/pyproject.toml
@@ -116,8 +116,8 @@ test = [
 dev = [
     "simplemma[test]",
     "black == 26.5.1",
-    "flake8 == 7.3.0",
     "mypy == 2.1.0",
+    "ruff == 0.15.16",
     "types-requests == 2.33.0.20260518",
 ]
 docs = [
@@ -140,6 +140,13 @@ disallow_untyped_defs = false
 disallow_incomplete_defs = false
 check_untyped_defs = false
 
+[tool.ruff]
+target-version = "py310"
+line-length = 88
+# Linting uses ruff's default rule set (E4, E7, E9, F): pyflakes plus the
+# import, statement and runtime-error pycodestyle checks. Formatting is handled
+# by black.
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 

diff --git a/simplemma/language_detector.py b/simplemma/language_detector.py
@@ -15,7 +15,7 @@
     RelaxedMostCommonTokenSampler,
     TokenSampler,
 )
-from .utils import validate_lang_input
+from .utils import normalize_token, validate_lang_input
 
 
 def in_target_language(
@@ -67,6 +67,7 @@ def langdetect(
             and their respective proportions.
     """
 
+    list_results: list[tuple[str, float]] = []
     for token_sampler in token_samplers:
         results = LanguageDetector(
             lang, token_sampler, DefaultStrategy(greedy)
@@ -159,6 +160,7 @@ def proportion_in_each_language(
         known_tokens_count = dict.fromkeys(self._lang, 0)
         unknown_tokens_count = 0
         for token in tokens:
+            token = normalize_token(token)
             token_found = False
             for lang_code in self._lang:
                 candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
@@ -194,6 +196,7 @@ def proportion_in_target_languages(
 
         in_target = 0
         for token in tokens:
+            token = normalize_token(token)
             for lang_code in self._lang:
                 candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
                 if candidate is not None:

diff --git a/simplemma/lemmatizer.py b/simplemma/lemmatizer.py
@@ -22,7 +22,7 @@
     ToLowercaseFallbackStrategy,
 )
 from .tokenizer import RegexTokenizer, Tokenizer
-from .utils import validate_lang_input
+from .utils import normalize_token, validate_lang_input
 
 PUNCTUATION = {".", "?", "!", "…", "¿", "¡"}
 
@@ -41,7 +41,7 @@ def _control_input_type(token: Any) -> None:
     if not isinstance(token, str):
         raise TypeError(f"Wrong input type, expected string, got {type(token)}")
     if token == "":
-        raise ValueError("Wrong input type: empty string")
+        raise ValueError("Wrong input value: empty string")
 
 
 class Lemmatizer:
@@ -56,7 +56,7 @@ class Lemmatizer:
 
     def __init__(
         self,
-        cache_max_size: int = 1048576,
+        cache_max_size: int = 65536,
         tokenizer: Tokenizer = RegexTokenizer(),
         lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
         fallback_lemmatization_strategy: LemmatizationFallbackStrategy = ToLowercaseFallbackStrategy(),
@@ -66,7 +66,7 @@ def __init__(
 
         Args:
             cache_max_size (int, optional): The maximum size of the cache for the lemmatization results.
-                Defaults to `1048576`.
+                Defaults to `65536`.
             tokenizer (Tokenizer, optional): The tokenizer to use for tokenization.
                 Defaults to `RegexTokenizer()`.
             lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization strategy to use.
@@ -94,7 +94,8 @@ def lemmatize(
         Returns:
             str: The lemmatized form of the token.
         """
-        return self._cached_lemmatize(token, lang)
+        # NFC before caching: canonical key, matches the NFC dictionaries.
+        return self._cached_lemmatize(normalize_token(token), lang)
 
     def _lemmatize(
         self,
@@ -103,6 +104,10 @@ def _lemmatize(
     ) -> str:
         """Internal method to lemmatize a token in the specified language(s).
 
+        The token arrives NFC-normalized by ``lemmatize``. Input validation
+        happens here so it only runs on cache misses, keeping hits cheap
+        (exceptions are never cached by ``lru_cache``).
+
         Args:
             token: The token to lemmatize.
             lang: The language or languages for lemmatization.
@@ -153,6 +158,7 @@ def get_lemmas_in_text(
         greedy=True, dictionary_factory=_legacy_dictionary_factory
     )
 )
+_legacy_dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
 
 
 def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
@@ -167,11 +173,12 @@ def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
     """
 
     _control_input_type(token)
+    token = normalize_token(token)
     lang = validate_lang_input(lang)
 
-    dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
     return any(
-        dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang
+        _legacy_dictionary_lookup.get_lemma(token, lang_code) is not None
+        for lang_code in lang
     )
 
 

diff --git a/simplemma/strategies/affix_decomposition.py b/simplemma/strategies/affix_decomposition.py
@@ -3,7 +3,7 @@
 """
 
 from .dictionary_lookup import DictionaryLookupStrategy
-from .greedy_dictionary_lookup import SHORTER_GREEDY, GreedyDictionaryLookupStrategy
+from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy, greedy_min_length
 from .lemmatization_strategy import LemmatizationStrategy
 
 # TODO: This custom behavior has to be simplified before it becomes unmaintainable
@@ -29,6 +29,8 @@
 AFFIXLEN = 2
 LONGAFFIXLEN = 5  # better for some languages
 MINCOMPLEN = 4
+# Decomposition is ~O(len²); cap long tokens (longest real form is 86 chars).
+MAXLEN = 100
 
 
 class AffixDecompositionStrategy(LemmatizationStrategy):
@@ -73,8 +75,11 @@ def get_lemma(self, token: str, lang: str) -> str | None:
         Returns:
             str | None: The lemma of the token if found, or None otherwise.
         """
-        limit = 6 if lang in SHORTER_GREEDY else 8
-        if (not self._greedy and lang not in AFFIX_LANGS) or len(token) <= limit:
+        if (
+            (not self._greedy and lang not in AFFIX_LANGS)
+            or len(token) <= greedy_min_length(lang)
+            or len(token) > MAXLEN
+        ):
             return None
 
         # define parameters
@@ -104,43 +109,30 @@ def _affix_decomposition(
         Returns:
             str | None: The lemma of the token if found, or None otherwise.
         """
-        # this only makes sense for languages written from left to right
-        # AFFIXLEN or MINCOMPLEN can spare time for some languages
-        for affixlen in range(max_affix_len, 1, -1):
-            for count in range(1, len(token) - min_complem_len + 1):
-                part1 = token[:-count]
-                # part1_aff = candidate[:-(count + affixlen)]p
-                lempart1 = self._dictionary_lookup.get_lemma(part1, lang)
-                if lempart1 is None:
-                    continue
-                # maybe an affix? discard it
-                if count <= affixlen:
-                    return lempart1
-                # account for case before looking for second part
-                part2 = token[-count:]
-                if token[0].isupper():
-                    part2 = part2.capitalize()
-                lempart2 = self._dictionary_lookup.get_lemma(part2, lang)
-                if lempart2 is None:
-                    continue
-                # candidate must be shorter
-                # try other case
-                candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang)
-                # shorten the second known part of the token
-                if candidate is not None and len(candidate) < len(part2):
-                    return part1 + candidate.lower()
-                # backup: equal length or further candidates accepted
-                # try without capitalizing
-                # even greedier
-                # with capital letter?
-                if len(lempart2) < len(part2) + affixlen:
-                    return part1 + lempart2.lower()
-                    # print(part1, part2, affixlen, count, newcandidate, planb)
-                # elif newcandidate and len(newcandidate) < len(part2) + affixlen:
-                # plan_b = part1 + newcandidate.lower()
-                # print(part1, part2, affixlen, count, newcandidate, planb)
-                # else:
-                #    print(part1, part2, affixlen, count, newcandidate)
+        # Left-to-right languages only. One pass at the largest affix length
+        # equals looping down to 2 (first match wins); needs max_affix_len >= 2.
+        for count in range(1, len(token) - min_complem_len + 1):
+            part1 = token[:-count]
+            lempart1 = self._dictionary_lookup.get_lemma(part1, lang)
+            if lempart1 is None:
+                continue
+            # maybe an affix? discard it
+            if count <= max_affix_len:
+                return lempart1
+            # account for case before looking for second part
+            part2 = token[-count:]
+            if token[0].isupper():
+                part2 = part2.capitalize()
+            lempart2 = self._dictionary_lookup.get_lemma(part2, lang)
+            if lempart2 is None:
+                continue
+            # prefer a shorter greedy form of the second part
+            candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang)
+            if candidate is not None and len(candidate) < len(part2):
+                return part1 + candidate.lower()
+            # backup: accept lempart2 if shorter than part2 plus max_affix_len
+            if len(lempart2) < len(part2) + max_affix_len:
+                return part1 + lempart2.lower()
         return None
 
     def _suffix_decomposition(
@@ -165,7 +157,7 @@ def _suffix_decomposition(
             suffix = self._dictionary_lookup.get_lemma(
                 token[-count:].capitalize(), lang
             )
-            if suffix is not None and len(suffix) <= len(token[-count:]):
+            if suffix is not None and len(suffix) <= count:
                 return token[:-count] + suffix.lower()
 
         return None
diff --git a/simplemma/strategies/dictionaries/dictionary_factory.py b/simplemma/strategies/dictionaries/dictionary_factory.py
@@ -13,11 +13,14 @@
 from abc import abstractmethod
 from functools import lru_cache
 from pathlib import Path
-from typing import Protocol
+from typing import Protocol, TypeVar, overload
 from collections.abc import Iterator, Mapping
 
+_T = TypeVar("_T")
+
 DATA_FOLDER = Path(__file__).parent / "data"
-SUPPORTED_LANGUAGES = [f.stem for f in DATA_FOLDER.glob("*.plzma")]
+# frozenset gives O(1) membership tests when validating a language code.
+SUPPORTED_LANGUAGES = frozenset(f.stem for f in DATA_FOLDER.glob("*.plzma"))
 
 
 def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
@@ -31,7 +34,7 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
         dict[str, str]: The loaded dictionary.
 
     Raises:
-        AssertionError: If the loaded object is not a dictionary.
+        TypeError: If the loaded object is not a dictionary.
 
     Note:
         This function assumes that the dictionary file is stored in the 'data' folder relative to this module.
@@ -40,7 +43,8 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
     filepath = DATA_FOLDER / f"{langcode}.plzma"
     with lzma.open(filepath, "rb") as filehandle:
         pickled_dict = pickle.load(filehandle)
-        assert isinstance(pickled_dict, dict)
+        if not isinstance(pickled_dict, dict):
+            raise TypeError(f"unexpected data in {filepath}: {type(pickled_dict)}")
         return pickled_dict
 
 
@@ -86,6 +90,16 @@ def __init__(self, dictionary: dict[bytes, bytes]) -> None:
     def __getitem__(self, item: str) -> str:
         return self._dict[item.encode()].decode()
 
+    # The overloads mirror Mapping.get's signature for strict mypy.
+    @overload
+    def get(self, key: str) -> str | None: ...
+    @overload
+    def get(self, key: str, default: str | _T) -> str | _T: ...
+    def get(self, key: str, default: str | _T | None = None) -> str | _T | None:
+        # Avoids Mapping.get's EAFP path (a KeyError raised on every miss).
+        value = self._dict.get(key.encode())
+        return value.decode() if value is not None else default
+
     def __iter__(self) -> Iterator[str]:
         for key in self._dict:
             yield key.decode()
@@ -102,7 +116,7 @@ class DefaultDictionaryFactory(DictionaryFactory):
     It provides functionality for loading and caching dictionaries from disk that are included in Simplemma.
     """
 
-    __slots__ = ["_load_dictionary_from_disk"]
+    __slots__ = ["_get_dictionary"]
 
     def __init__(self, cache_max_size: int = 8) -> None:
         """
@@ -112,10 +126,18 @@ def __init__(self, cache_max_size: int = 8) -> None:
             cache_max_size (int): The maximum size of the cache for loaded dictionaries.
                 Defaults to `8`.
         """
-        self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
-            _load_dictionary_from_disk
+        # Cache the wrapper, not the raw dict, to avoid re-wrapping on every
+        # call; the lru evicts wrapper and dict together, bounding memory.
+        self._get_dictionary = lru_cache(maxsize=cache_max_size)(
+            self._get_dictionary_uncached
         )
 
+    def _get_dictionary_uncached(self, lang: str) -> Mapping[str, str]:
+        """Build the dictionary for a language, without caching."""
+        if lang not in SUPPORTED_LANGUAGES:
+            raise ValueError(f"Unsupported language: {lang}")
+        return MappingStrToByteString(_load_dictionary_from_disk(lang))
+
     def get_dictionary(
         self,
         lang: str,
@@ -132,6 +154,4 @@ def get_dictionary(
         Raises:
             ValueError: If the specified language is not supported.
         """
-        if lang not in SUPPORTED_LANGUAGES:
-            raise ValueError(f"Unsupported language: {lang}")
-        return MappingStrToByteString(self._load_dictionary_from_disk(lang))
+        return self._get_dictionary(lang)
diff --git a/simplemma/strategies/dictionaries/trie_dictionary_factory.py b/simplemma/strategies/dictionaries/trie_dictionary_factory.py
@@ -36,6 +36,11 @@ def __init__(self, trie: BytesTrie) -> None:
     def __getitem__(self, item: str) -> Any:
         return self._trie[item][0].decode()
 
+    def get(self, key: str, default: Any = None) -> Any:
+        # Avoids MutableMapping.get's EAFP path (a KeyError raised on every miss).
+        value = self._trie.get(key)
+        return value[0].decode() if value else default
+
     def __setitem__(self, key: Any, value: Any) -> None:
         raise NotImplementedError
 

diff --git a/simplemma/strategies/dictionary_lookup.py b/simplemma/strategies/dictionary_lookup.py
@@ -41,8 +41,8 @@ def get_lemma(self, token: str, lang: str) -> str | None:
         """
         # Search the language data, reverse case to extend coverage.
         dictionary = self._dictionary_factory.get_dictionary(lang)
-        if result := dictionary.get(token):
+        if (result := dictionary.get(token)) is not None:
             return result
-        # Try upper or lowercase.
-        token = token.lower() if token[0].isupper() else token.capitalize()
+        # Try the other case; token[:1] avoids an IndexError on an empty token.
+        token = token.lower() if token[:1].isupper() else token.capitalize()
         return dictionary.get(token)