diff --git a/tests/embedder/conftest.py b/tests/embedder/conftest.py index c7e91893..b26dfbff 100644 --- a/tests/embedder/conftest.py +++ b/tests/embedder/conftest.py @@ -1,5 +1,8 @@ +from __future__ import annotations + import importlib.util import platform +from typing import Any import pytest import torch @@ -77,9 +80,9 @@ def on_windows() -> bool: ] -def create_sentence_transformer_config(**kwargs) -> SentenceTransformerEmbeddingConfig: +def create_sentence_transformer_config(**kwargs: Any) -> SentenceTransformerEmbeddingConfig: """Helper function to create SentenceTransformer config with defaults.""" - defaults = { + defaults: dict[str, Any] = { "model_name": "sergeyzh/rubert-tiny-turbo", "batch_size": 4, "device": "cpu", @@ -90,9 +93,9 @@ def create_sentence_transformer_config(**kwargs) -> SentenceTransformerEmbedding return SentenceTransformerEmbeddingConfig(**defaults) -def create_openai_config(**kwargs) -> OpenaiEmbeddingConfig: +def create_openai_config(**kwargs: Any) -> OpenaiEmbeddingConfig: """Helper function to create OpenAI config with defaults.""" - defaults = { + defaults: dict[str, Any] = { "model_name": "text-embedding-3-small", "batch_size": 2, "use_cache": False, @@ -103,9 +106,9 @@ def create_openai_config(**kwargs) -> OpenaiEmbeddingConfig: return OpenaiEmbeddingConfig(**defaults) -def create_vllm_config(**kwargs) -> VllmEmbeddingConfig: +def create_vllm_config(**kwargs: Any) -> VllmEmbeddingConfig: """Helper function to create VllmEmbeddingConfig with test-friendly defaults.""" - defaults = { + defaults: dict[str, Any] = { "model_name": "BAAI/bge-base-en-v1.5", "batch_size": 4, "use_cache": False, @@ -117,5 +120,5 @@ def create_vllm_config(**kwargs) -> VllmEmbeddingConfig: @pytest.fixture(autouse=True) -def _autouse_fake_openai_embedding(patch_openai_embedding_backend): +def _autouse_fake_openai_embedding(patch_openai_embedding_backend: None) -> None: """Within tests/embedder/, every OpenaiEmbeddingConfig resolves to FakeOpenaiEmbeddingBackend.""" diff --git a/tests/embedder/test_basic.py b/tests/embedder/test_basic.py index d11bd150..d3d73ef7 100644 --- a/tests/embedder/test_basic.py +++ b/tests/embedder/test_basic.py @@ -22,7 +22,7 @@ def embedder(self, embedder_config: EmbedderConfig) -> Embedder: """Create an Embedder instance for testing.""" return Embedder(embedder_config) - def test_embedding_calculation(self, embedder: Embedder): + def test_embedding_calculation(self, embedder: Embedder) -> None: """Test basic embedding calculation functionality.""" test_utterances = ["Hello world", "Test sentence", "Another example"] @@ -34,7 +34,7 @@ def test_embedding_calculation(self, embedder: Embedder): if hasattr(embedder.config, "similarity_fn_name"): assert np.allclose(np.linalg.norm(embeddings, axis=1), 1.0, atol=1e-5) # normalized - def test_embedding_reproducibility(self, embedder: Embedder): + def test_embedding_reproducibility(self, embedder: Embedder) -> None: """Test that embeddings are reproducible for same input.""" test_utterances = ["Hello world", "Test sentence"] @@ -43,13 +43,13 @@ def test_embedding_reproducibility(self, embedder: Embedder): np.testing.assert_allclose(embeddings1, embeddings2, rtol=1e-5) - def test_single_utterance(self, embedder: Embedder): + def test_single_utterance(self, embedder: Embedder) -> None: """Test embedding calculation for single utterance.""" embeddings = embedder.embed(["Single test sentence"]) assert embeddings.shape[0] == 1 assert embeddings.shape[1] > 0 - def test_similarity_calculation(self, embedder: Embedder): + def test_similarity_calculation(self, embedder: Embedder) -> None: """Test similarity calculation between embeddings.""" utterances = ["Hello world", "Test sentence", "Another test"] embeddings = embedder.embed(utterances) @@ -62,7 +62,7 @@ def test_similarity_calculation(self, embedder: Embedder): assert np.all(sim_matrix >= -1.0) assert np.all(sim_matrix <= 1.0) - def test_similarity_symmetry(self, embedder: Embedder): + def test_similarity_symmetry(self, embedder: Embedder) -> None: """Test that similarity is symmetric.""" utterances = ["Hello world", "Test sentence"] embeddings = embedder.embed(utterances) diff --git a/tests/embedder/test_caching.py b/tests/embedder/test_caching.py index 12bff6d2..52d39cb9 100644 --- a/tests/embedder/test_caching.py +++ b/tests/embedder/test_caching.py @@ -18,7 +18,7 @@ class TestEmbedderCaching: """Test caching functionality for different embedder backends.""" - def test_caching_consistency(self, embedder_config: EmbedderConfig): + def test_caching_consistency(self, embedder_config: EmbedderConfig) -> None: """Test that caching produces consistent results when enabled.""" # Create config with caching enabled if hasattr(embedder_config, "model_copy"): @@ -40,7 +40,7 @@ def test_caching_consistency(self, embedder_config: EmbedderConfig): # Verify results are identical np.testing.assert_allclose(embeddings1, embeddings2, rtol=1e-5) - def test_caching_disabled_consistency(self, embedder_config: EmbedderConfig): + def test_caching_disabled_consistency(self, embedder_config: EmbedderConfig) -> None: """Test behavior when caching is disabled.""" # Ensure caching is disabled if hasattr(embedder_config, "model_copy"): @@ -63,7 +63,7 @@ def test_caching_disabled_consistency(self, embedder_config: EmbedderConfig): class TestSentenceTransformerCachingSpecific: """Test caching functionality specific to SentenceTransformer backend.""" - def test_caching_performance_improvement(self): + def test_caching_performance_improvement(self) -> None: """Test that caching provides performance improvement.""" config = create_sentence_transformer_config(use_cache=True) embedder = Embedder(config) @@ -83,7 +83,7 @@ def test_caching_performance_improvement(self): # but we can at least verify the caching mechanism works assert embeddings1.shape == embeddings2.shape - def test_different_inputs_no_cache_collision(self): + def test_different_inputs_no_cache_collision(self) -> None: """Test that different inputs don't collide in cache.""" config = create_sentence_transformer_config(use_cache=True) embedder = Embedder(config) @@ -94,7 +94,7 @@ def test_different_inputs_no_cache_collision(self): # Different inputs should produce different embeddings assert not np.allclose(embeddings1, embeddings2, rtol=1e-3) - def test_cache_with_different_prompts(self): + def test_cache_with_different_prompts(self) -> None: """Test that prompts are considered in caching.""" config = create_sentence_transformer_config( use_cache=True, diff --git a/tests/embedder/test_dump_load.py b/tests/embedder/test_dump_load.py index 5d79a42b..96a5f756 100644 --- a/tests/embedder/test_dump_load.py +++ b/tests/embedder/test_dump_load.py @@ -2,13 +2,16 @@ import tempfile from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np import pytest from autointent._wrappers.embedder import Embedder -from autointent.configs import SentenceTransformerEmbeddingConfig +from autointent.configs import ( + OpenaiEmbeddingConfig, + SentenceTransformerEmbeddingConfig, +) from tests.conftest import tiny_sentence_transformer from .conftest import backend_configs @@ -17,7 +20,7 @@ from autointent.configs import EmbedderConfig -def test_load_from_disk(on_windows): +def test_load_from_disk(on_windows: bool) -> None: """Test loading embedder from disk with custom saved model.""" model = tiny_sentence_transformer() @@ -41,7 +44,12 @@ def embedder(self, embedder_config: EmbedderConfig) -> Embedder: """Create an Embedder instance for testing.""" return Embedder(embedder_config) - def test_dump_load_cycle(self, embedder: Embedder, on_windows, embedder_config: EmbedderConfig): # noqa: ARG002 + def test_dump_load_cycle( + self, + embedder: Embedder, + on_windows: bool, + embedder_config: EmbedderConfig, # noqa: ARG002 + ) -> None: """Test complete dump/load cycle preserves functionality.""" with tempfile.TemporaryDirectory(ignore_cleanup_errors=on_windows) as temp_dir: temp_path = Path(temp_dir) @@ -60,17 +68,28 @@ def test_dump_load_cycle(self, embedder: Embedder, on_windows, embedder_config: loaded_embeddings = embedder_loaded.embed(test_utterances) np.testing.assert_allclose(original_embeddings, loaded_embeddings, rtol=1e-3) - # Test configuration preservation (only for configs that have these attributes) + # Test configuration preservation (only for configs that have these attributes). + # The BaseEmbedderConfig union doesn't expose backend-specific fields; the hasattr + # checks are runtime guards, so cast to a concrete subclass with the attribute. if hasattr(embedder.config, "model_name"): - assert embedder_loaded.config.model_name == embedder.config.model_name + loaded_named = cast("SentenceTransformerEmbeddingConfig", embedder_loaded.config) + original_named = cast("SentenceTransformerEmbeddingConfig", embedder.config) + assert loaded_named.model_name == original_named.model_name if hasattr(embedder.config, "default_prompt"): assert embedder_loaded.config.default_prompt == embedder.config.default_prompt if hasattr(embedder.config, "batch_size"): - assert embedder_loaded.config.batch_size == embedder.config.batch_size - - def test_load_with_config_override(self, embedder: Embedder, on_windows, embedder_config: EmbedderConfig): # noqa: ARG002 + loaded_batched = cast("OpenaiEmbeddingConfig", embedder_loaded.config) + original_batched = cast("OpenaiEmbeddingConfig", embedder.config) + assert loaded_batched.batch_size == original_batched.batch_size + + def test_load_with_config_override( + self, + embedder: Embedder, + on_windows: bool, + embedder_config: EmbedderConfig, # noqa: ARG002 + ) -> None: """Test loading with configuration override.""" - from autointent.configs import HashingVectorizerEmbeddingConfig, OpenaiEmbeddingConfig + from autointent.configs import HashingVectorizerEmbeddingConfig # Skip for HashingVectorizer as it doesn't support batch_size override if isinstance(embedder.config, HashingVectorizerEmbeddingConfig): @@ -83,6 +102,7 @@ def test_load_with_config_override(self, embedder: Embedder, on_windows, embedde embedder.dump(temp_path) # Create appropriate override config based on backend type + override_config: EmbedderConfig if isinstance(embedder.config, SentenceTransformerEmbeddingConfig): override_config = SentenceTransformerEmbeddingConfig(batch_size=16) else: @@ -92,12 +112,16 @@ def test_load_with_config_override(self, embedder: Embedder, on_windows, embedde # Load with override embedder_loaded = Embedder.load(temp_path, override_config) - # Verify override took effect - assert embedder_loaded.config.batch_size == 16 + # Verify override took effect. embedder_loaded.config is the union + # BaseEmbedderConfig | ...; both SentenceTransformer and Openai + # subclasses carry batch_size/model_name, so cast for attribute access. + loaded_specific = cast("OpenaiEmbeddingConfig", embedder_loaded.config) + original_specific = cast("OpenaiEmbeddingConfig", embedder.config) + assert loaded_specific.batch_size == 16 # Verify original config preserved where not overridden - assert embedder_loaded.config.model_name == embedder.config.model_name + assert loaded_specific.model_name == original_specific.model_name - def test_similarity_preserved_after_load(self, embedder: Embedder, on_windows): + def test_similarity_preserved_after_load(self, embedder: Embedder, on_windows: bool) -> None: """Test that similarity function works correctly after dump/load.""" with tempfile.TemporaryDirectory(ignore_cleanup_errors=on_windows) as temp_dir: temp_path = Path(temp_dir) @@ -118,7 +142,7 @@ def test_similarity_preserved_after_load(self, embedder: Embedder, on_windows): # Similarities should be the same np.testing.assert_allclose(original_similarity, loaded_similarity, rtol=1e-3) - def test_multiple_dump_load_cycles(self, embedder: Embedder, on_windows): + def test_multiple_dump_load_cycles(self, embedder: Embedder, on_windows: bool) -> None: """Test multiple dump/load cycles maintain consistency.""" with tempfile.TemporaryDirectory(ignore_cleanup_errors=on_windows) as temp_dir: temp_path = Path(temp_dir) diff --git a/tests/embedder/test_fine_tuned.py b/tests/embedder/test_fine_tuned.py index b6227d8a..64b734b5 100644 --- a/tests/embedder/test_fine_tuned.py +++ b/tests/embedder/test_fine_tuned.py @@ -1,3 +1,7 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + import numpy as np import pytest @@ -6,8 +10,12 @@ from autointent.context.data_handler import DataHandler from tests.conftest import tiny_sentence_transformer_config +if TYPE_CHECKING: + from autointent import Dataset + from autointent.custom_types import ListOfLabels + -def test_model_updates_after_training(dataset): +def test_model_updates_after_training(dataset: Dataset) -> None: """Test that model weights actually change after training""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") @@ -35,9 +43,11 @@ def test_model_updates_after_training(dataset): param.data.detach().cpu().numpy().copy() for param in backend._model.parameters() if param.requires_grad ] + # data_handler.train_labels returns ListOfGenericLabels (may contain None for OOS); + # the test dataset has no OOS, so cast to the strict ListOfLabels for the typed API. backend.train( utterances=data_handler.train_utterances(0)[:1000], - labels=data_handler.train_labels(0)[:1000], + labels=cast("ListOfLabels", data_handler.train_labels(0)[:1000]), config=train_config, ) diff --git a/tests/embedder/test_fine_tuned_dump_load.py b/tests/embedder/test_fine_tuned_dump_load.py index 5aef9bbd..e72ff82e 100644 --- a/tests/embedder/test_fine_tuned_dump_load.py +++ b/tests/embedder/test_fine_tuned_dump_load.py @@ -1,5 +1,8 @@ +from __future__ import annotations + import tempfile from pathlib import Path +from typing import TYPE_CHECKING, cast import numpy as np import pytest @@ -9,10 +12,14 @@ from autointent.context.data_handler import DataHandler from tests.conftest import tiny_sentence_transformer, tiny_sentence_transformer_config +if TYPE_CHECKING: + from autointent import Dataset + from autointent.custom_types import ListOfLabels + pytest.importorskip("sentence_transformers", reason="Sentence Transformers library is required for these tests") -def test_finetune_dump_load(dataset, on_windows): +def test_finetune_dump_load(dataset: Dataset, on_windows: bool) -> None: """Test scenario: fine-tune -> dump -> load.""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") @@ -37,10 +44,12 @@ def test_finetune_dump_load(dataset, on_windows): test_utterances = ["Test sentence for embedding", "Another test utterance"] original_embeddings = embedder_original.embed(test_utterances) - # Fine-tune the model + # Fine-tune the model. data_handler.train_labels returns ListOfGenericLabels + # (may contain None for OOS); the test dataset has no OOS, so cast to + # the strict ListOfLabels for the typed API. embedder_original.train( utterances=data_handler.train_utterances(0), - labels=data_handler.train_labels(0), + labels=cast("ListOfLabels", data_handler.train_labels(0)), config=train_config, ) @@ -70,7 +79,7 @@ def test_finetune_dump_load(dataset, on_windows): ) -def test_dump_load_finetune(dataset, on_windows): +def test_dump_load_finetune(dataset: Dataset, on_windows: bool) -> None: """Test scenario: dump -> load -> fine-tune.""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") @@ -109,9 +118,10 @@ def test_dump_load_finetune(dataset, on_windows): # Step 3: Fine-tune the loaded embedder loaded_before_training = embedder_loaded.embed(test_utterances) + # Cast labels: dataset has no OOS, so ListOfGenericLabels narrows to ListOfLabels. embedder_loaded.train( utterances=data_handler.train_utterances(0), - labels=data_handler.train_labels(0), + labels=cast("ListOfLabels", data_handler.train_labels(0)), config=train_config, ) @@ -122,7 +132,7 @@ def test_dump_load_finetune(dataset, on_windows): ) -def test_load_from_disk_finetune_dump_load(dataset, on_windows): +def test_load_from_disk_finetune_dump_load(dataset: Dataset, on_windows: bool) -> None: """Test scenario: load sentence transformer from disk -> fine-tune -> dump -> load.""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") @@ -150,9 +160,10 @@ def test_load_from_disk_finetune_dump_load(dataset, on_windows): # Step 3: Fine-tune the embedder loaded from disk train_config = EmbedderFineTuningConfig(epoch_num=1, batch_size=4) + # Cast labels: dataset has no OOS, so ListOfGenericLabels narrows to ListOfLabels. embedder_from_disk.train( utterances=data_handler.train_utterances(0), - labels=data_handler.train_labels(0), + labels=cast("ListOfLabels", data_handler.train_labels(0)), config=train_config, ) @@ -181,7 +192,7 @@ def test_load_from_disk_finetune_dump_load(dataset, on_windows): ) -def test_embeddings_consistency_across_workflows(dataset, on_windows): +def test_embeddings_consistency_across_workflows(dataset: Dataset, on_windows: bool) -> None: """Test that different workflows produce consistent results when starting from same model.""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") @@ -196,17 +207,16 @@ def test_embeddings_consistency_across_workflows(dataset, on_windows): train_config = EmbedderFineTuningConfig(epoch_num=1, batch_size=4) test_utterances = ["Test sentence for embedding"] - train_data = { - "utterances": data_handler.train_utterances(0)[:50], # Same small subset - "labels": data_handler.train_labels(0)[:50], - } + # Cast labels: dataset has no OOS, so ListOfGenericLabels narrows to ListOfLabels. + utterances_subset = data_handler.train_utterances(0)[:50] + labels_subset = cast("ListOfLabels", data_handler.train_labels(0)[:50]) with tempfile.TemporaryDirectory(ignore_cleanup_errors=on_windows) as temp_dir: temp_path = Path(temp_dir) # Workflow 1: Direct fine-tune embedder1 = Embedder(embedder_config) - embedder1.train(**train_data, config=train_config) + embedder1.train(utterances=utterances_subset, labels=labels_subset, config=train_config) embeddings1 = embedder1.embed(test_utterances) # Workflow 2: Dump -> Load -> Fine-tune @@ -214,7 +224,7 @@ def test_embeddings_consistency_across_workflows(dataset, on_windows): dump_path2 = temp_path / "workflow2" embedder2.dump(dump_path2) embedder2_loaded = Embedder.load(dump_path2) - embedder2_loaded.train(**train_data, config=train_config) + embedder2_loaded.train(utterances=utterances_subset, labels=labels_subset, config=train_config) embeddings2 = embedder2_loaded.embed(test_utterances) # Both workflows should produce similar results (allowing for minor training variance) @@ -225,7 +235,7 @@ def test_embeddings_consistency_across_workflows(dataset, on_windows): ) -def test_multiple_dump_load_cycles_after_finetuning(dataset, on_windows): +def test_multiple_dump_load_cycles_after_finetuning(dataset: Dataset, on_windows: bool) -> None: """Test that multiple dump/load cycles preserve fine-tuned model state.""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") data_handler = DataHandler(dataset) @@ -241,11 +251,12 @@ def test_multiple_dump_load_cycles_after_finetuning(dataset, on_windows): with tempfile.TemporaryDirectory(ignore_cleanup_errors=on_windows) as temp_dir: temp_path = Path(temp_dir) - # Fine-tune original embedder + # Fine-tune original embedder. Cast labels: dataset has no OOS, so + # ListOfGenericLabels narrows to ListOfLabels. embedder_original = Embedder(embedder_config) embedder_original.train( utterances=data_handler.train_utterances(0), - labels=data_handler.train_labels(0), + labels=cast("ListOfLabels", data_handler.train_labels(0)), config=train_config, ) diff --git a/tests/embedder/test_hash.py b/tests/embedder/test_hash.py index aa03032e..f57af1d1 100644 --- a/tests/embedder/test_hash.py +++ b/tests/embedder/test_hash.py @@ -22,7 +22,7 @@ def embedder(self, embedder_config: EmbedderConfig) -> Embedder: """Create an Embedder instance for testing.""" return Embedder(embedder_config) - def test_hash_consistency(self, embedder: Embedder): + def test_hash_consistency(self, embedder: Embedder) -> None: """Test that hash generation is consistent for same configuration.""" # Create second embedder with same config embedder2 = Embedder(embedder.config.model_copy(deep=True)) @@ -30,7 +30,7 @@ def test_hash_consistency(self, embedder: Embedder): # Same configuration should produce same hash assert embedder._get_hash() == embedder2._get_hash() - def test_hash_deterministic(self, embedder: Embedder): + def test_hash_deterministic(self, embedder: Embedder) -> None: """Test that hash is deterministic across multiple calls.""" hash1 = embedder._get_hash() hash2 = embedder._get_hash() @@ -43,7 +43,7 @@ def test_hash_deterministic(self, embedder: Embedder): class TestSentenceTransformerHashSpecific: """Test hash generation specific to SentenceTransformer backend.""" - def test_hash_different_for_different_max_length(self): + def test_hash_different_for_different_max_length(self) -> None: """Test that different max_length produces different hashes.""" config1 = SentenceTransformerEmbeddingConfig( model_name="sergeyzh/rubert-tiny-turbo", tokenizer_config=TokenizerConfig(max_length=128) @@ -58,7 +58,7 @@ def test_hash_different_for_different_max_length(self): # Different max_length should produce different hashes assert embedder1._get_hash() != embedder2._get_hash() - def test_hash_different_for_different_models(self): + def test_hash_different_for_different_models(self) -> None: """Test that different models produce different hashes.""" config1 = SentenceTransformerEmbeddingConfig(model_name="sergeyzh/rubert-tiny-turbo") config2 = SentenceTransformerEmbeddingConfig(model_name="sentence-transformers/all-MiniLM-L6-v2") diff --git a/tests/embedder/test_memory.py b/tests/embedder/test_memory.py index 176c13d4..18f53cb0 100644 --- a/tests/embedder/test_memory.py +++ b/tests/embedder/test_memory.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np import pytest @@ -11,6 +11,7 @@ from .conftest import backend_configs if TYPE_CHECKING: + from autointent._wrappers.embedder.sentence_transformers import SentenceTransformerEmbeddingBackend from autointent.configs import EmbedderConfig @@ -23,24 +24,24 @@ def embedder(self, embedder_config: EmbedderConfig) -> Embedder: """Create an Embedder instance for testing.""" return Embedder(embedder_config) - def test_clear_ram(self, embedder: Embedder): + def test_clear_ram(self, embedder: Embedder) -> None: """Test RAM clearing functionality.""" # Load the model by doing an embedding embedder.embed(["test"]) # Check that backend model is loaded for SentenceTransformers if isinstance(embedder.config, SentenceTransformerEmbeddingConfig): - assert embedder._backend._model is not None + assert cast("SentenceTransformerEmbeddingBackend", embedder._backend)._model is not None # Clear RAM embedder.clear_ram() # For SentenceTransformers, model should be cleared if isinstance(embedder.config, SentenceTransformerEmbeddingConfig): - assert embedder._backend._model is None + assert cast("SentenceTransformerEmbeddingBackend", embedder._backend)._model is None # For OpenAI, clear_ram is a no-op (no model stored in RAM) - def test_memory_efficiency_multiple_calls(self, embedder: Embedder): + def test_memory_efficiency_multiple_calls(self, embedder: Embedder) -> None: """Test that multiple embed calls don't cause memory leaks.""" test_utterances = ["First test", "Second test", "Third test"] @@ -51,15 +52,15 @@ def test_memory_efficiency_multiple_calls(self, embedder: Embedder): # For SentenceTransformers, model should still be loaded once if isinstance(embedder.config, SentenceTransformerEmbeddingConfig): - assert embedder._backend._model is not None + assert cast("SentenceTransformerEmbeddingBackend", embedder._backend)._model is not None # Clear RAM should work after multiple calls embedder.clear_ram() if isinstance(embedder.config, SentenceTransformerEmbeddingConfig): - assert embedder._backend._model is None + assert cast("SentenceTransformerEmbeddingBackend", embedder._backend)._model is None - def test_model_reloading_after_clear(self, embedder: Embedder): + def test_model_reloading_after_clear(self, embedder: Embedder) -> None: """Test that model can be reloaded after clearing RAM.""" # First embedding embeddings1 = embedder.embed(["test"]) diff --git a/tests/embedder/test_openai_backend.py b/tests/embedder/test_openai_backend.py index 7ba53374..1f860ba8 100644 --- a/tests/embedder/test_openai_backend.py +++ b/tests/embedder/test_openai_backend.py @@ -13,7 +13,7 @@ @pytest.fixture -def openai_backend_config(): +def openai_backend_config() -> OpenaiEmbeddingConfig: """Create an OpenAI backend config for testing.""" return OpenaiEmbeddingConfig( model_name="text-embedding-3-small", @@ -25,7 +25,7 @@ def openai_backend_config(): @pytest.fixture -def openai_backend(openai_backend_config: OpenaiEmbeddingConfig): +def openai_backend(openai_backend_config: OpenaiEmbeddingConfig) -> OpenaiEmbeddingBackend: """Create an OpenAI backend instance.""" return OpenaiEmbeddingBackend(openai_backend_config) @@ -33,23 +33,27 @@ def openai_backend(openai_backend_config: OpenaiEmbeddingConfig): class TestOpenaiBackend: """Test OpenAI-specific backend functionality.""" - def test_backend_initialization(self, openai_backend: OpenaiEmbeddingBackend): + def test_backend_initialization(self, openai_backend: OpenaiEmbeddingBackend) -> None: """Test backend initialization.""" assert openai_backend.supports_training is False assert openai_backend._client is None # Client should be lazy-loaded assert openai_backend._async_client is None - def test_client_lazy_loading(self, openai_backend: OpenaiEmbeddingBackend): + def test_client_lazy_loading(self, openai_backend: OpenaiEmbeddingBackend) -> None: """Test that client is lazy-loaded.""" assert openai_backend._client is None # Client should be loaded on first API call embeddings = openai_backend.embed(["Test sentence"]) + # reason: mypy narrowed `_client` to `None` from the prior assert and + # cannot see the mutation inside `.embed()`. The post-call assert and + # subsequent shape checks are the whole point of this test + # (lazy load: None -> non-None), so suppress the unreachable cascade. assert openai_backend._client is not None - assert embeddings.shape[0] == 1 + assert embeddings.shape[0] == 1 # type: ignore[unreachable] assert embeddings.shape[1] > 0 - def test_similarity_calculation(self, openai_backend: OpenaiEmbeddingBackend): + def test_similarity_calculation(self, openai_backend: OpenaiEmbeddingBackend) -> None: """Test cosine similarity calculation.""" embeddings = openai_backend.embed(["Hello", "World", "Hello world"]) @@ -66,7 +70,7 @@ def test_similarity_calculation(self, openai_backend: OpenaiEmbeddingBackend): hello_to_hello_world = similarity[0, 1] assert hello_to_hello_world > hello_to_world - def test_hash_calculation(self, openai_backend: OpenaiEmbeddingBackend): + def test_hash_calculation(self, openai_backend: OpenaiEmbeddingBackend) -> None: """Test hash calculation for caching.""" hash1 = openai_backend.get_hash() hash2 = openai_backend.get_hash() @@ -75,7 +79,7 @@ def test_hash_calculation(self, openai_backend: OpenaiEmbeddingBackend): assert hash1 == hash2 assert isinstance(hash1, int) - def test_different_models_different_hashes(self): + def test_different_models_different_hashes(self) -> None: """Test that different models produce different hashes.""" config1 = OpenaiEmbeddingConfig( model_name="text-embedding-3-small", @@ -89,7 +93,7 @@ def test_different_models_different_hashes(self): assert backend1.get_hash() != backend2.get_hash() - def test_dimensions_parameter(self): + def test_dimensions_parameter(self) -> None: """Test that dimensions parameter affects embeddings.""" # Test with different dimensions (if supported by model) config_with_dims = OpenaiEmbeddingConfig( @@ -104,7 +108,7 @@ def test_dimensions_parameter(self): # Check that embeddings have the specified dimensions assert embeddings.shape[1] == 512 - def test_batch_processing(self, openai_backend: OpenaiEmbeddingBackend): + def test_batch_processing(self, openai_backend: OpenaiEmbeddingBackend) -> None: """Test batch processing functionality.""" utterances = ["First sentence", "Second sentence", "Third sentence", "Fourth sentence"] @@ -115,7 +119,7 @@ def test_batch_processing(self, openai_backend: OpenaiEmbeddingBackend): assert embeddings.shape[0] == 4 assert embeddings.shape[1] > 0 - def test_async_processing_initialization(self): + def test_async_processing_initialization(self) -> None: """Test async processing initialization.""" config = OpenaiEmbeddingConfig( model_name="text-embedding-3-small", @@ -130,7 +134,7 @@ def test_async_processing_initialization(self): embeddings = backend.embed(["Test", "async", "processing"]) assert embeddings.shape[0] == 3 - def test_prompts_application(self): + def test_prompts_application(self) -> None: """Test that prompts are applied correctly.""" config = OpenaiEmbeddingConfig( model_name="text-embedding-3-small", @@ -149,7 +153,7 @@ def test_prompts_application(self): # Embeddings should be different when prompts are applied assert not np.allclose(embeddings_no_prompt, embeddings_with_prompt, rtol=1e-3) - def test_return_tensors_functionality(self, openai_backend: OpenaiEmbeddingBackend): + def test_return_tensors_functionality(self, openai_backend: OpenaiEmbeddingBackend) -> None: """Test return_tensors parameter.""" utterances = ["Hello world", "Test sentence"] diff --git a/tests/embedder/test_prompts.py b/tests/embedder/test_prompts.py index 3ef1a407..fc556565 100644 --- a/tests/embedder/test_prompts.py +++ b/tests/embedder/test_prompts.py @@ -45,7 +45,7 @@ def prompt_embedder_config(self, embedder_config: EmbedderConfig) -> EmbedderCon use_cache=False, ) - def test_different_task_prompts(self, prompt_embedder_config: EmbedderConfig): + def test_different_task_prompts(self, prompt_embedder_config: EmbedderConfig) -> None: """Test that different task types produce different embeddings.""" embedder = Embedder(prompt_embedder_config) test_utterance = ["Test sentence"] @@ -60,7 +60,7 @@ def test_different_task_prompts(self, prompt_embedder_config: EmbedderConfig): assert not np.allclose(default_emb, passage_emb, rtol=1e-3) assert not np.allclose(default_emb, classification_emb, rtol=1e-3) - def test_fallback_to_default_prompt(self, embedder_config: EmbedderConfig): + def test_fallback_to_default_prompt(self, embedder_config: EmbedderConfig) -> None: """Test fallback to default prompt when specific prompt not set.""" from autointent.configs import HashingVectorizerEmbeddingConfig @@ -68,6 +68,7 @@ def test_fallback_to_default_prompt(self, embedder_config: EmbedderConfig): if isinstance(embedder_config, HashingVectorizerEmbeddingConfig): pytest.skip("HashingVectorizer doesn't support prompts") + config: EmbedderConfig if hasattr(embedder_config, "similarity_fn_name"): # SentenceTransformers config config = create_sentence_transformer_config( diff --git a/tests/embedder/test_sentence_transformers_backend.py b/tests/embedder/test_sentence_transformers_backend.py index f08dfe6c..b0c72a80 100644 --- a/tests/embedder/test_sentence_transformers_backend.py +++ b/tests/embedder/test_sentence_transformers_backend.py @@ -7,7 +7,7 @@ @pytest.fixture -def st_backend_config(): +def st_backend_config() -> SentenceTransformerEmbeddingConfig: """Create a SentenceTransformer backend config for testing.""" return SentenceTransformerEmbeddingConfig( model_name="sergeyzh/rubert-tiny-turbo", @@ -19,7 +19,7 @@ def st_backend_config(): @pytest.fixture -def st_backend(st_backend_config: SentenceTransformerEmbeddingConfig): +def st_backend(st_backend_config: SentenceTransformerEmbeddingConfig) -> SentenceTransformerEmbeddingBackend: """Create a SentenceTransformer backend instance.""" return SentenceTransformerEmbeddingBackend(st_backend_config) @@ -27,22 +27,25 @@ def st_backend(st_backend_config: SentenceTransformerEmbeddingConfig): class TestSentenceTransformerBackend: """Test SentenceTransformer-specific backend functionality.""" - def test_backend_initialization(self, st_backend: SentenceTransformerEmbeddingBackend): + def test_backend_initialization(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test backend initialization.""" assert st_backend.supports_training is True assert st_backend._model is None # Model should be lazy-loaded assert st_backend._trained is False - def test_model_lazy_loading(self, st_backend: SentenceTransformerEmbeddingBackend): + def test_model_lazy_loading(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test that model is lazy-loaded.""" assert st_backend._model is None # Model should be loaded on first embed call embeddings = st_backend.embed(["Test sentence"]) + # reason: mypy narrowed `_model` to `None` from the prior assert and + # cannot see the mutation inside `.embed()`. The post-call assert is + # the whole point of this test (lazy load: None -> non-None). assert st_backend._model is not None - assert embeddings.shape == (1, st_backend._model.get_sentence_embedding_dimension()) + assert embeddings.shape == (1, st_backend._model.get_sentence_embedding_dimension()) # type: ignore[unreachable] - def test_clear_ram(self, st_backend: SentenceTransformerEmbeddingBackend): + def test_clear_ram(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test clearing model from RAM.""" # Load model st_backend.embed(["Test sentence"]) @@ -52,7 +55,7 @@ def test_clear_ram(self, st_backend: SentenceTransformerEmbeddingBackend): st_backend.clear_ram() assert st_backend._model is None - def test_similarity_function_name(self, st_backend: SentenceTransformerEmbeddingBackend): + def test_similarity_function_name(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test that similarity function is configured correctly.""" embeddings = st_backend.embed(["Hello", "World"]) similarity = st_backend.similarity(embeddings[:1], embeddings[1:]) @@ -61,7 +64,7 @@ def test_similarity_function_name(self, st_backend: SentenceTransformerEmbedding assert similarity.shape == (1, 1) assert -1.0 <= similarity[0, 0] <= 1.0 - def test_hash_calculation(self, st_backend: SentenceTransformerEmbeddingBackend): + def test_hash_calculation(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test hash calculation for caching.""" hash1 = st_backend.get_hash() hash2 = st_backend.get_hash() @@ -70,7 +73,7 @@ def test_hash_calculation(self, st_backend: SentenceTransformerEmbeddingBackend) assert hash1 == hash2 assert isinstance(hash1, int) - def test_training_functionality(self, st_backend: SentenceTransformerEmbeddingBackend): + def test_training_functionality(self, st_backend: SentenceTransformerEmbeddingBackend) -> None: """Test basic training functionality.""" pytest.importorskip("accelerate", reason="Accelerate library is required for this test") diff --git a/tests/embedder/test_tokenizer.py b/tests/embedder/test_tokenizer.py index f14b2d8e..2b51b886 100644 --- a/tests/embedder/test_tokenizer.py +++ b/tests/embedder/test_tokenizer.py @@ -1,13 +1,14 @@ from autointent._wrappers.embedder import Embedder from autointent.configs import SentenceTransformerEmbeddingConfig as EmbedderConfig +from autointent.configs import TokenizerConfig -def test_max_length_configuration(): +def test_max_length_configuration() -> None: """Test max_length configuration affects tokenization.""" max_length = 10 config = EmbedderConfig( model_name="sergeyzh/rubert-tiny-turbo", - tokenizer_config={"max_length": max_length}, + tokenizer_config=TokenizerConfig(max_length=max_length), use_cache=False, ) embedder = Embedder(config)