Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,8 @@ jobs:
run: python -m pip install --ignore-installed ".[marisa-trie]"

# tests
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Lint with ruff
run: ruff check .

- name: Code format with black
run: black --check --diff simplemma training tests
Expand Down
2 changes: 1 addition & 1 deletion docs/contributing.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ checks that CI runs:
# Code style
black --check --diff simplemma training tests
# Linting
flake8 simplemma training tests
ruff check .
# Type checking
mypy -p simplemma -p training -p tests
# Tests
Expand Down
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ test = [
dev = [
"simplemma[test]",
"black == 26.5.1",
"flake8 == 7.3.0",
"mypy == 2.1.0",
"ruff == 0.15.16",
"types-requests == 2.33.0.20260518",
]
docs = [
Expand All @@ -140,6 +140,13 @@ disallow_untyped_defs = false
disallow_incomplete_defs = false
check_untyped_defs = false

[tool.ruff]
target-version = "py310"
line-length = 88
# Linting uses ruff's default rule set (E4, E7, E9, F): pyflakes plus the
# import, statement and runtime-error pycodestyle checks. Formatting is handled
# by black.

[tool.pytest.ini_options]
testpaths = ["tests"]

Expand Down
5 changes: 4 additions & 1 deletion simplemma/language_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
RelaxedMostCommonTokenSampler,
TokenSampler,
)
from .utils import validate_lang_input
from .utils import normalize_token, validate_lang_input


def in_target_language(
Expand Down Expand Up @@ -67,6 +67,7 @@ def langdetect(
and their respective proportions.
"""

list_results: list[tuple[str, float]] = []
for token_sampler in token_samplers:
results = LanguageDetector(
lang, token_sampler, DefaultStrategy(greedy)
Expand Down Expand Up @@ -159,6 +160,7 @@ def proportion_in_each_language(
known_tokens_count = dict.fromkeys(self._lang, 0)
unknown_tokens_count = 0
for token in tokens:
token = normalize_token(token)
token_found = False
for lang_code in self._lang:
candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
Expand Down Expand Up @@ -194,6 +196,7 @@ def proportion_in_target_languages(

in_target = 0
for token in tokens:
token = normalize_token(token)
for lang_code in self._lang:
candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
if candidate is not None:
Expand Down
21 changes: 14 additions & 7 deletions simplemma/lemmatizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
ToLowercaseFallbackStrategy,
)
from .tokenizer import RegexTokenizer, Tokenizer
from .utils import validate_lang_input
from .utils import normalize_token, validate_lang_input

PUNCTUATION = {".", "?", "!", "…", "¿", "¡"}

Expand All @@ -41,7 +41,7 @@ def _control_input_type(token: Any) -> None:
if not isinstance(token, str):
raise TypeError(f"Wrong input type, expected string, got {type(token)}")
if token == "":
raise ValueError("Wrong input type: empty string")
raise ValueError("Wrong input value: empty string")


class Lemmatizer:
Expand All @@ -56,7 +56,7 @@ class Lemmatizer:

def __init__(
self,
cache_max_size: int = 1048576,
cache_max_size: int = 65536,
tokenizer: Tokenizer = RegexTokenizer(),
lemmatization_strategy: LemmatizationStrategy = DefaultStrategy(),
fallback_lemmatization_strategy: LemmatizationFallbackStrategy = ToLowercaseFallbackStrategy(),
Expand All @@ -66,7 +66,7 @@ def __init__(

Args:
cache_max_size (int, optional): The maximum size of the cache for the lemmatization results.
Defaults to `1048576`.
Defaults to `65536`.
tokenizer (Tokenizer, optional): The tokenizer to use for tokenization.
Defaults to `RegexTokenizer()`.
lemmatization_strategy (LemmatizationStrategy, optional): The lemmatization strategy to use.
Expand Down Expand Up @@ -94,7 +94,8 @@ def lemmatize(
Returns:
str: The lemmatized form of the token.
"""
return self._cached_lemmatize(token, lang)
# NFC before caching: canonical key, matches the NFC dictionaries.
return self._cached_lemmatize(normalize_token(token), lang)

def _lemmatize(
self,
Expand All @@ -103,6 +104,10 @@ def _lemmatize(
) -> str:
"""Internal method to lemmatize a token in the specified language(s).

The token arrives NFC-normalized by ``lemmatize``. Input validation
happens here so it only runs on cache misses, keeping hits cheap
(exceptions are never cached by ``lru_cache``).

Args:
token: The token to lemmatize.
lang: The language or languages for lemmatization.
Expand Down Expand Up @@ -153,6 +158,7 @@ def get_lemmas_in_text(
greedy=True, dictionary_factory=_legacy_dictionary_factory
)
)
_legacy_dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)


def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
Expand All @@ -167,11 +173,12 @@ def is_known(token: str, lang: str | tuple[str, ...]) -> bool:
"""

_control_input_type(token)
token = normalize_token(token)
lang = validate_lang_input(lang)

dictionary_lookup = DictionaryLookupStrategy(_legacy_dictionary_factory)
return any(
dictionary_lookup.get_lemma(token, lang_code) is not None for lang_code in lang
_legacy_dictionary_lookup.get_lemma(token, lang_code) is not None
for lang_code in lang
)


Expand Down
74 changes: 33 additions & 41 deletions simplemma/strategies/affix_decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

from .dictionary_lookup import DictionaryLookupStrategy
from .greedy_dictionary_lookup import SHORTER_GREEDY, GreedyDictionaryLookupStrategy
from .greedy_dictionary_lookup import GreedyDictionaryLookupStrategy, greedy_min_length
from .lemmatization_strategy import LemmatizationStrategy

# TODO: This custom behavior has to be simplified before it becomes unmaintainable
Expand All @@ -29,6 +29,8 @@
AFFIXLEN = 2
LONGAFFIXLEN = 5 # better for some languages
MINCOMPLEN = 4
# Decomposition is ~O(len²); cap long tokens (longest real form is 86 chars).
MAXLEN = 100


class AffixDecompositionStrategy(LemmatizationStrategy):
Expand Down Expand Up @@ -73,8 +75,11 @@ def get_lemma(self, token: str, lang: str) -> str | None:
Returns:
str | None: The lemma of the token if found, or None otherwise.
"""
limit = 6 if lang in SHORTER_GREEDY else 8
if (not self._greedy and lang not in AFFIX_LANGS) or len(token) <= limit:
if (
(not self._greedy and lang not in AFFIX_LANGS)
or len(token) <= greedy_min_length(lang)
or len(token) > MAXLEN
):
return None

# define parameters
Expand Down Expand Up @@ -104,43 +109,30 @@ def _affix_decomposition(
Returns:
str | None: The lemma of the token if found, or None otherwise.
"""
# this only makes sense for languages written from left to right
# AFFIXLEN or MINCOMPLEN can spare time for some languages
for affixlen in range(max_affix_len, 1, -1):
for count in range(1, len(token) - min_complem_len + 1):
part1 = token[:-count]
# part1_aff = candidate[:-(count + affixlen)]p
lempart1 = self._dictionary_lookup.get_lemma(part1, lang)
if lempart1 is None:
continue
# maybe an affix? discard it
if count <= affixlen:
return lempart1
# account for case before looking for second part
part2 = token[-count:]
if token[0].isupper():
part2 = part2.capitalize()
lempart2 = self._dictionary_lookup.get_lemma(part2, lang)
if lempart2 is None:
continue
# candidate must be shorter
# try other case
candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang)
# shorten the second known part of the token
if candidate is not None and len(candidate) < len(part2):
return part1 + candidate.lower()
# backup: equal length or further candidates accepted
# try without capitalizing
# even greedier
# with capital letter?
if len(lempart2) < len(part2) + affixlen:
return part1 + lempart2.lower()
# print(part1, part2, affixlen, count, newcandidate, planb)
# elif newcandidate and len(newcandidate) < len(part2) + affixlen:
# plan_b = part1 + newcandidate.lower()
# print(part1, part2, affixlen, count, newcandidate, planb)
# else:
# print(part1, part2, affixlen, count, newcandidate)
# Left-to-right languages only. A single pass at the largest affix
# length is equivalent to looping over smaller ones (first match wins).
for count in range(1, len(token) - min_complem_len + 1):
part1 = token[:-count]
lempart1 = self._dictionary_lookup.get_lemma(part1, lang)
if lempart1 is None:
continue
# maybe an affix? discard it
if count <= max_affix_len:
return lempart1
# account for case before looking for second part
part2 = token[-count:]
if token[0].isupper():
part2 = part2.capitalize()
lempart2 = self._dictionary_lookup.get_lemma(part2, lang)
if lempart2 is None:
continue
# prefer a shorter greedy form of the second part
candidate = self._greedy_dictionary_lookup.get_lemma(part2, lang)
if candidate is not None and len(candidate) < len(part2):
return part1 + candidate.lower()
# backup: accept the dictionary form if it is shorter than part2 plus the maximum affix length
if len(lempart2) < len(part2) + max_affix_len:
return part1 + lempart2.lower()
return None

def _suffix_decomposition(
Expand All @@ -165,7 +157,7 @@ def _suffix_decomposition(
suffix = self._dictionary_lookup.get_lemma(
token[-count:].capitalize(), lang
)
if suffix is not None and len(suffix) <= len(token[-count:]):
if suffix is not None and len(suffix) <= count:
return token[:-count] + suffix.lower()

return None
40 changes: 30 additions & 10 deletions simplemma/strategies/dictionaries/dictionary_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@
from abc import abstractmethod
from functools import lru_cache
from pathlib import Path
from typing import Protocol
from typing import Protocol, TypeVar, overload
from collections.abc import Iterator, Mapping

_T = TypeVar("_T")

DATA_FOLDER = Path(__file__).parent / "data"
SUPPORTED_LANGUAGES = [f.stem for f in DATA_FOLDER.glob("*.plzma")]
# frozenset: O(1) membership, checked on every get_dictionary call.
SUPPORTED_LANGUAGES = frozenset(f.stem for f in DATA_FOLDER.glob("*.plzma"))


def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
Expand All @@ -31,7 +34,7 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
dict[bytes, bytes]: The loaded dictionary.

Raises:
AssertionError: If the loaded object is not a dictionary.
TypeError: If the loaded object is not a dictionary.

Note:
This function assumes that the dictionary file is stored in the 'data' folder relative to this module.
Expand All @@ -40,7 +43,8 @@ def _load_dictionary_from_disk(langcode: str) -> dict[bytes, bytes]:
filepath = DATA_FOLDER / f"{langcode}.plzma"
with lzma.open(filepath, "rb") as filehandle:
pickled_dict = pickle.load(filehandle)
assert isinstance(pickled_dict, dict)
if not isinstance(pickled_dict, dict):
raise TypeError(f"unexpected data in {filepath}: {type(pickled_dict)}")
return pickled_dict


Expand Down Expand Up @@ -86,6 +90,16 @@ def __init__(self, dictionary: dict[bytes, bytes]) -> None:
def __getitem__(self, item: str) -> str:
    """Return the value stored for ``item``, decoded to ``str``.

    The key is encoded to bytes before indexing ``self._dict`` and the
    stored bytes value is decoded before being returned. Any error raised
    by the underlying lookup (e.g. for a missing key) propagates unchanged.

    Args:
        item: The string key to look up.

    Returns:
        str: The decoded value stored under the encoded key.
    """
    raw_value = self._dict[item.encode()]
    return raw_value.decode()

# The overloads mirror Mapping.get's signature for strict mypy.
@overload
def get(self, key: str) -> str | None: ...

@overload
def get(self, key: str, default: str | _T) -> str | _T: ...

def get(self, key: str, default: str | _T | None = None) -> str | _T | None:
    """Return the decoded value for ``key``, or ``default`` if it is absent.

    Args:
        key: The string key to look up; it is encoded to bytes first.
        default: The value returned when the encoded key is not present.
            Defaults to `None`.

    Returns:
        The stored bytes value decoded to ``str``, or ``default`` when the
        lookup yields ``None``.
    """
    # Avoids Mapping.get's EAFP path (a KeyError raised on every miss).
    value = self._dict.get(key.encode())
    return value.decode() if value is not None else default

def __iter__(self) -> Iterator[str]:
    """Iterate lazily over the keys of ``self._dict``, decoded to ``str``.

    Yields:
        str: Each key of the wrapped dictionary, in its iteration order.
    """
    yield from (raw_key.decode() for raw_key in self._dict)
Expand All @@ -102,7 +116,7 @@ class DefaultDictionaryFactory(DictionaryFactory):
It provides functionality for loading and caching dictionaries from disk that are included in Simplemma.
"""

__slots__ = ["_load_dictionary_from_disk"]
__slots__ = ["_get_dictionary"]

def __init__(self, cache_max_size: int = 8) -> None:
"""
Expand All @@ -112,10 +126,18 @@ def __init__(self, cache_max_size: int = 8) -> None:
cache_max_size (int): The maximum size of the cache for loaded dictionaries.
Defaults to `8`.
"""
self._load_dictionary_from_disk = lru_cache(maxsize=cache_max_size)(
_load_dictionary_from_disk
# Cache the wrapper, not the raw dict, to avoid re-wrapping on every
# call; the lru evicts wrapper and dict together, bounding memory.
self._get_dictionary = lru_cache(maxsize=cache_max_size)(
self._get_dictionary_uncached
)

def _get_dictionary_uncached(self, lang: str) -> Mapping[str, str]:
    """Build the dictionary for a language, without caching.

    Args:
        lang: The language code of the dictionary to load.

    Returns:
        Mapping[str, str]: The dictionary loaded from disk, wrapped in a
            `MappingStrToByteString`.

    Raises:
        ValueError: If `lang` is not in `SUPPORTED_LANGUAGES`.
    """
    if lang in SUPPORTED_LANGUAGES:
        raw_dictionary = _load_dictionary_from_disk(lang)
        return MappingStrToByteString(raw_dictionary)
    raise ValueError(f"Unsupported language: {lang}")

def get_dictionary(
self,
lang: str,
Expand All @@ -132,6 +154,4 @@ def get_dictionary(
Raises:
ValueError: If the specified language is not supported.
"""
if lang not in SUPPORTED_LANGUAGES:
raise ValueError(f"Unsupported language: {lang}")
return MappingStrToByteString(self._load_dictionary_from_disk(lang))
return self._get_dictionary(lang)
5 changes: 5 additions & 0 deletions simplemma/strategies/dictionaries/trie_dictionary_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ def __init__(self, trie: BytesTrie) -> None:
def __getitem__(self, item: str) -> Any:
    """Return the first value stored under ``item`` in the trie, decoded.

    Indexes ``self._trie`` with the string key, takes the first element of
    the result and decodes it. Any error raised by the trie lookup
    propagates unchanged.

    Args:
        item: The string key to look up.

    Returns:
        The decoded first value stored for ``item``.
    """
    stored_values = self._trie[item]
    first_value = stored_values[0]
    return first_value.decode()

def get(self, key: str, default: Any = None) -> Any:
    """Return the decoded first value for ``key``, or ``default`` on a miss.

    Args:
        key: The string key to look up in the trie.
        default: The value returned when the trie lookup yields nothing
            (``None`` or an empty result). Defaults to `None`.

    Returns:
        The first stored value decoded, or ``default``.
    """
    # Avoids MutableMapping.get's EAFP path (a KeyError raised on every miss).
    stored_values = self._trie.get(key)
    if not stored_values:
        return default
    return stored_values[0].decode()

def __setitem__(self, key: Any, value: Any) -> None:
    """Reject item assignment.

    Raises:
        NotImplementedError: Always; assignment is not implemented here.
    """
    raise NotImplementedError

Expand Down
6 changes: 3 additions & 3 deletions simplemma/strategies/dictionary_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ def get_lemma(self, token: str, lang: str) -> str | None:
"""
# Search the language data, reverse case to extend coverage.
dictionary = self._dictionary_factory.get_dictionary(lang)
if result := dictionary.get(token):
if (result := dictionary.get(token)) is not None:
return result
# Try upper or lowercase.
token = token.lower() if token[0].isupper() else token.capitalize()
# Try the other case (slicing with token[:1] avoids an IndexError on empty input).
token = token.lower() if token[:1].isupper() else token.capitalize()
return dictionary.get(token)
Loading
Loading