diff --git a/check_types.sh b/check_types.sh index 3ae6bf9d..78226843 100755 --- a/check_types.sh +++ b/check_types.sh @@ -21,6 +21,8 @@ TYPED_FILES=( wp1/logic/zim_files.py wp1/logic/zim_schedules.py wp1/selection/abstract_builder.py + wp1/selection/meta_builder.py + wp1/selection/meta_builder_test.py wp1/selection/models/simple.py wp1/selection/models/petscan.py wp1/selection/models/sparql.py diff --git a/wp1/logic/builder.py b/wp1/logic/builder.py index 7672eb55..358ac312 100644 --- a/wp1/logic/builder.py +++ b/wp1/logic/builder.py @@ -39,6 +39,20 @@ logger = logging.getLogger(__name__) +META_BUILDER_MODELS = {"wp1.selection.models.combinator"} + + +def builder_label_by_id(wp10db: Connection, builder_id: str | bytes) -> str: + try: + builder = get_builder(wp10db, builder_id) + except ObjectNotFoundError: + return logic_util.as_text(builder_id) + return builder.label + + +def is_meta_builder(builder: Builder) -> bool: + return builder.model in META_BUILDER_MODELS + def get_builder_module_class(model: str) -> type[AbstractBuilder]: """Dynamically imports the builder module and returns the Builder class.""" diff --git a/wp1/logic/util.py b/wp1/logic/util.py index 77aa8696..34da025b 100644 --- a/wp1/logic/util.py +++ b/wp1/logic/util.py @@ -15,6 +15,12 @@ DATABASE_WIKI_TS = config["DATABASE_WIKI_TS"] +def as_text(value: bytes | str | int | None) -> str: + if isinstance(value, bytes): + return value.decode("utf-8") + return str(value) + + def wp10_timestamp_to_unix(ts): if ts is None: raise ValueError("Cannot convert None timestamp") diff --git a/wp1/models/wp10/builder.py b/wp1/models/wp10/builder.py index f6650672..1a17f6ba 100644 --- a/wp1/models/wp10/builder.py +++ b/wp1/models/wp10/builder.py @@ -6,6 +6,7 @@ import attr +from wp1.logic import util as logic_util from wp1.constants import TS_FORMAT_WP10 from wp1.timestamp import utcnow @@ -33,6 +34,32 @@ class Builder: b_current_version: int = attr.ib(default=0) b_selection_zim_version: int = attr.ib(default=0) + @property + def id(self) -> str: + return logic_util.as_text(self.b_id) + + @property + def name(self) -> str: + return logic_util.as_text(self.b_name) + + @property + def user_id(self) -> str: + return logic_util.as_text(self.b_user_id) + + @property + def project(self) -> str: + return logic_util.as_text(self.b_project) + + @property + def model(self) -> str: + return logic_util.as_text(self.b_model) + + @property + def label(self) -> str: + if self.b_name is not None: + return f"{self.name} ({self.id})" + return self.id + @property def created_at_dt(self) -> datetime.datetime: """The timestamp parsed into a datetime.datetime object.""" @@ -75,10 +102,10 @@ def set_updated_at_now(self) -> None: def to_web_dict(self) -> dict[str, Any]: return { - "name": self.b_name.decode("utf-8"), - "project": self.b_project.decode("utf-8"), + "name": self.name, + "project": self.project, "params": json.loads(self.b_params.decode("utf-8")), - "model": self.b_model.decode("utf-8"), + "model": self.model, } def set_id(self) -> None: diff --git a/wp1/models/wp10/builder_test.py b/wp1/models/wp10/builder_test.py index db5ff65c..27430044 100644 --- a/wp1/models/wp10/builder_test.py +++ b/wp1/models/wp10/builder_test.py @@ -52,6 +52,16 @@ def test_updated_at_dt(self): self.assertEqual(28, dt.minute) self.assertEqual(44, dt.second) + def test_decoded_properties(self): + self.builder.b_id = b"builder-a" + + self.assertEqual("builder-a", self.builder.id) + self.assertEqual("My List", self.builder.name) + self.assertEqual("100", self.builder.user_id) + self.assertEqual("en.wikipedia.org", self.builder.project) + self.assertEqual("wp1.selection.models.simple", self.builder.model) + self.assertEqual("My List (builder-a)", self.builder.label) + def test_set_updated_at_dt(self): dt = datetime.datetime(2020, 12, 15, 9, 30, 55) self.builder.set_updated_at_dt(dt) diff --git a/wp1/selection/meta_builder.py b/wp1/selection/meta_builder.py new file mode 100644 index 00000000..f777eb34 --- /dev/null +++ b/wp1/selection/meta_builder.py @@ -0,0 +1,66 @@ +import io + +from botocore.exceptions import ClientError + +import wp1.logic.builder as logic_builder +from wp1.logic import util as logic_util +from wp1.exceptions import ( + Wp1FatalSelectionError, + Wp1RetryableSelectionError, +) +from wp1.selection.abstract_builder import AbstractBuilder + + +class MetaBuilder(AbstractBuilder): + """Base class for builders that reference other builders.""" + + def _fetch_selection_data( + self, wp10db, s3, builder_id: str, reference_label: str | None = None + ) -> bytes: + """Fetch the latest materialized TSV snapshot for a referenced builder.""" + label = reference_label or builder_id + selection = logic_builder.latest_selection_for( + wp10db, builder_id, "text/tab-separated-values" + ) + + # TODO: #1196 - Add retry handling for Combinator referenced selections. + if selection is None: + raise Wp1RetryableSelectionError( + f"Referenced builder {label} has no usable selection " + f"(no selection found)" + ) + + status = logic_util.as_text(selection.s_status) + if status == "FAILED": + raise Wp1FatalSelectionError( + f"Referenced builder {label} latest selection failed" + ) + + if status != "OK": + raise Wp1RetryableSelectionError( + f"Referenced builder {label} latest selection is not ready " + f"(status={status!r})" + ) + + # OK selections can have no stored data when materialization produced empty + # data, since AbstractBuilder only uploads filled selection.data. + if selection.s_object_key is None: + raise Wp1RetryableSelectionError( + f"Referenced builder {label} latest selection has no stored data" + ) + + object_key = selection.s_object_key + if isinstance(object_key, bytes): + object_key = object_key.decode("utf-8") + + buffer = io.BytesIO() + try: + s3.download_fileobj(object_key, buffer) + except ClientError as e: + code = e.response.get("Error", {}).get("Code", "Unknown") + raise Wp1RetryableSelectionError( + f"Failed to download selection for builder {label} " + f"from S3 key {object_key!r}: {code}" + ) from e + + return buffer.getvalue() diff --git a/wp1/selection/meta_builder_test.py b/wp1/selection/meta_builder_test.py new file mode 100644 index 00000000..5996fe3b --- /dev/null +++ b/wp1/selection/meta_builder_test.py @@ -0,0 +1,61 @@ +from unittest import TestCase +from unittest.mock import MagicMock, patch + +from wp1.exceptions import Wp1FatalSelectionError, Wp1RetryableSelectionError +from wp1.models.wp10.selection import Selection +from wp1.selection.meta_builder import MetaBuilder + + +def _selection(status: bytes = b"OK", object_key: bytes | None = b"object-key"): + return Selection( + s_builder_id=b"builder-a", + s_content_type=b"text/tab-separated-values", + s_version=1, + s_status=status, + s_object_key=object_key, + ) + + +class MetaBuilderTest(TestCase): + + def setUp(self): + self.builder = MetaBuilder() + + @patch("wp1.selection.meta_builder.logic_builder.latest_selection_for") + def test_fetch_selection_data(self, mock_latest_selection): + mock_latest_selection.return_value = _selection() + s3 = MagicMock() + s3.download_fileobj.side_effect = lambda _key, buf: buf.write(b"first\n") + + actual = self.builder._fetch_selection_data(MagicMock(), s3, "builder-a") + + self.assertEqual(b"first\n", actual) + s3.download_fileobj.assert_called_once() + + @patch("wp1.selection.meta_builder.logic_builder.latest_selection_for") + def test_fetch_selection_data_failed_selection(self, mock_latest_selection): + mock_latest_selection.return_value = _selection(status=b"FAILED") + + with self.assertRaises(Wp1FatalSelectionError): + self.builder._fetch_selection_data(MagicMock(), MagicMock(), "builder-a") + + @patch("wp1.selection.meta_builder.logic_builder.latest_selection_for") + def test_fetch_selection_data_retryable_selection(self, mock_latest_selection): + mock_latest_selection.return_value = _selection(status=b"CAN_RETRY") + + with self.assertRaises(Wp1RetryableSelectionError): + self.builder._fetch_selection_data(MagicMock(), MagicMock(), "builder-a") + + @patch("wp1.selection.meta_builder.logic_builder.latest_selection_for") + def test_fetch_selection_data_without_stored_data(self, mock_latest_selection): + mock_latest_selection.return_value = _selection(object_key=None) + + with self.assertRaisesRegex(Wp1RetryableSelectionError, "no stored data"): + self.builder._fetch_selection_data(MagicMock(), MagicMock(), "builder-a") + + @patch("wp1.selection.meta_builder.logic_builder.latest_selection_for") + def test_fetch_selection_data_missing_selection(self, mock_latest_selection): + mock_latest_selection.return_value = None + + with self.assertRaises(Wp1RetryableSelectionError): + self.builder._fetch_selection_data(MagicMock(), MagicMock(), "builder-a") diff --git a/wp1/selection/models/combinator.py b/wp1/selection/models/combinator.py index 7b0c6ad3..88fb857b 100644 --- a/wp1/selection/models/combinator.py +++ b/wp1/selection/models/combinator.py @@ -1,65 +1,17 @@ -import io import logging from typing import Any -from botocore.exceptions import ClientError - import wp1.logic.builder as logic_builder +from wp1.logic import util as logic_util from wp1.exceptions import ( ObjectNotFoundError, Wp1FatalSelectionError, - Wp1RetryableSelectionError, ) -from wp1.models.wp10.builder import Builder as Wp10Builder -from wp1.selection.abstract_builder import AbstractBuilder +from wp1.selection.meta_builder import MetaBuilder logger = logging.getLogger(__name__) -META_BUILDER_MODELS = {"wp1.selection.models.combinator"} - - -# TODO: #1181 - Move shared helpers into AbstractBuilder. -def _as_text(value: bytes | str | int | None) -> str: - if isinstance(value, bytes): - return value.decode("utf-8") - return str(value) - - -def _builder_label(builder: Wp10Builder) -> str: - name = getattr(builder, "b_name", None) - builder_id = _as_text(getattr(builder, "b_id", "")) - if name is not None: - return f"{_as_text(name)} ({builder_id})" - return builder_id - - -def _reference_label(wp10db, builder_id: str) -> str: - try: - builder = logic_builder.get_builder(wp10db, builder_id) - except ObjectNotFoundError: - return builder_id - return _builder_label(builder) - - -def _builder_model(builder: Wp10Builder) -> str: - return _as_text(getattr(builder, "b_model", "")) - - -def _is_meta_builder(builder: Wp10Builder) -> bool: - return _builder_model(builder) in META_BUILDER_MODELS - - -def _dedupe(builder_ids: list[str]) -> list[str]: - seen: set[str] = set() - unique_ids: list[str] = [] - for builder_id in builder_ids: - if builder_id not in seen: - seen.add(builder_id) - unique_ids.append(builder_id) - return unique_ids - - def _validate_group_shape( name: str, group: Any, required: bool ) -> tuple[list[str], list[str]]: @@ -101,7 +53,6 @@ def _validate_group_shape( return builders, errors -# TODO: #1181 - Move shared title parsing helpers into AbstractBuilder. def _normalize(line: bytes) -> str | None: s = line.decode("utf-8", errors="replace").strip() if not s or s.startswith("#"): @@ -118,7 +69,6 @@ def _parse_tsv_to_set(data: bytes) -> set[str]: return titles -# TODO: #1181 - Move shared set operation helpers into AbstractBuilder. def _apply_operation(operation: str, sets: list[set[str]]) -> set[str]: if not sets: return set() @@ -132,80 +82,7 @@ def _apply_operation(operation: str, sets: list[set[str]]) -> set[str]: raise ValueError(f"Unsupported operation: {operation}") -def _fetch_selection_data( - wp10db, s3, builder_id: str, reference_label: str | None = None -) -> bytes: - """Fetch the latest materialized TSV snapshot for a referenced builder.""" - label = reference_label or builder_id - selection = logic_builder.latest_selection_for( - wp10db, builder_id, "text/tab-separated-values" - ) - - if selection is None: - raise Wp1RetryableSelectionError( - f"Referenced builder {label} has no usable selection " - f"(no selection found)" - ) - - status = _as_text(selection.s_status) - if status == "FAILED": - raise Wp1FatalSelectionError( - f"Referenced builder {label} latest selection failed" - ) - - if status != "OK": - raise Wp1RetryableSelectionError( - f"Referenced builder {label} latest selection is not ready " - f"(status={status!r})" - ) - - # OK selections can have no object key when materialization produced empty - # data, since AbstractBuilder only uploads filled selection.data. - if selection.s_object_key is None: - raise Wp1RetryableSelectionError( - f"Referenced builder {label} latest selection has no object key" - ) - - object_key = selection.s_object_key - if isinstance(object_key, bytes): - object_key = object_key.decode("utf-8") - - buffer = io.BytesIO() - try: - s3.download_fileobj(object_key, buffer) - except ClientError as e: - code = e.response.get("Error", {}).get("Code", "Unknown") - raise Wp1RetryableSelectionError( - f"Failed to download selection for builder {label} " - f"from S3 key {object_key!r}: {code}" - ) from e - - return buffer.getvalue() - - -def _process_group(wp10db, s3, group: dict[str, Any]) -> set[str]: - - builder_values = group.get("builders", []) - - builder_ids = [ - builder_id for builder_id in builder_values if isinstance(builder_id, str) - ] - - operation = group.get("operation", "union") - - sets: list[set[str]] = [] - # TODO: #1184 - Handle multiple bad referenced selections in combinator build. - for builder_id in _dedupe(builder_ids): - data = _fetch_selection_data( - wp10db, s3, builder_id, _reference_label(wp10db, builder_id) - ) - title_set = _parse_tsv_to_set(data) - sets.append(title_set) - - return _apply_operation(operation, sets) - - -class Builder(AbstractBuilder): +class Builder(MetaBuilder): """Combinator builder: combines other builders using set operations.""" def validate( @@ -248,9 +125,9 @@ def validate( project = params["project"] builder_id = params.get("builder_id") - referenced_ids = _dedupe(include_builders + exclude_builders) + referenced_ids = set(include_builders + exclude_builders) - if builder_id is not None and _as_text(builder_id) in referenced_ids: + if builder_id is not None and logic_util.as_text(builder_id) in referenced_ids: errors.append("This combinator cannot reference itself") if errors: @@ -267,22 +144,20 @@ def validate( ) continue - # name + id is only for helping testing, in prod we dont show the id - # TODO : show the name only - label = _builder_label(builder) + label = builder.label - if _as_text(builder.b_user_id) != _as_text(user_id): + if builder.user_id != logic_util.as_text(user_id): errors.append( f"Builder {label} belongs to another user. You can only " "reference your own builders." ) - if _as_text(builder.b_project) != _as_text(project): + if builder.project != logic_util.as_text(project): errors.append( f"Builder {label} belongs to project " - f"{_as_text(builder.b_project)!r}. All referenced builders " + f"{builder.project!r}. All referenced builders " "must use the same project." ) - if _is_meta_builder(builder): + if logic_builder.is_meta_builder(builder): errors.append( f"Builder {label} is a combinator. Combinators can only " "reference leaf builders such as Simple, SPARQL, PetScan, " @@ -294,6 +169,30 @@ def validate( return ([], [], []) + def _process_group(self, wp10db, s3, group: dict[str, Any]) -> set[str]: + + builder_values = group.get("builders", []) + + builder_ids = [ + builder_id for builder_id in builder_values if isinstance(builder_id, str) + ] + + operation = group.get("operation", "union") + + sets: list[set[str]] = [] + # TODO: #1184 - Handle multiple bad referenced selections in combinator build. + for builder_id in set(builder_ids): + data = self._fetch_selection_data( + wp10db, + s3, + builder_id, + logic_builder.builder_label_by_id(wp10db, builder_id), + ) + title_set = _parse_tsv_to_set(data) + sets.append(title_set) + + return _apply_operation(operation, sets) + def build(self, content_type: str, **params: Any) -> bytes: """Build the combinator selection from referenced builders.""" if content_type != "text/tab-separated-values": @@ -313,13 +212,13 @@ def build(self, content_type: str, **params: Any) -> bytes: exclude_group = params.get("exclude") - include_set = _process_group(wp10db, s3, include_group) + include_set = self._process_group(wp10db, s3, include_group) exclude_set: set[str] = set() if exclude_group is not None: exclude_builders = exclude_group.get("builders", []) if exclude_builders: - exclude_set = _process_group(wp10db, s3, exclude_group) + exclude_set = self._process_group(wp10db, s3, exclude_group) result = include_set - exclude_set diff --git a/wp1/selection/models/combinator_test.py b/wp1/selection/models/combinator_test.py index 9449bfd4..0c26f4a0 100644 --- a/wp1/selection/models/combinator_test.py +++ b/wp1/selection/models/combinator_test.py @@ -1,11 +1,9 @@ from unittest.mock import MagicMock, patch from wp1.base_db_test import BaseWpOneDbTest -from wp1.exceptions import Wp1FatalSelectionError, Wp1RetryableSelectionError +from wp1.exceptions import Wp1FatalSelectionError from wp1.models.wp10.builder import Builder -from wp1.models.wp10.selection import Selection from wp1.selection.models.combinator import Builder as CombinatorBuilder -from wp1.selection.models.combinator import _fetch_selection_data def _reference_builder( @@ -25,16 +23,6 @@ def _reference_builder( ) -def _selection(status=b"OK", object_key=b"object-key"): - return Selection( - s_builder_id=b"builder-a", - s_content_type=b"text/tab-separated-values", - s_version=1, - s_status=status, - s_object_key=object_key, - ) - - class CombinatorBuilderTest(BaseWpOneDbTest): def setUp(self): @@ -187,8 +175,8 @@ def test_validate_meta_builder_reference(self, mock_get_builder): ) self.assertEqual(expected, actual) - @patch("wp1.selection.models.combinator.logic_builder.get_builder") - @patch("wp1.selection.models.combinator._fetch_selection_data") + @patch("wp1.logic.builder.get_builder") + @patch("wp1.selection.models.combinator.Builder._fetch_selection_data") def test_build(self, mock_fetch_selection_data, mock_get_builder): mock_get_builder.return_value = _reference_builder() data = { @@ -210,8 +198,8 @@ def test_build(self, mock_fetch_selection_data, mock_get_builder): self.assertEqual(b"first_article\nsecond", actual) - @patch("wp1.selection.models.combinator.logic_builder.get_builder") - @patch("wp1.selection.models.combinator._fetch_selection_data") + @patch("wp1.logic.builder.get_builder") + @patch("wp1.selection.models.combinator.Builder._fetch_selection_data") def test_build_intersection(self, mock_fetch_selection_data, mock_get_builder): mock_get_builder.return_value = _reference_builder() data = { @@ -234,8 +222,8 @@ def test_build_intersection(self, mock_fetch_selection_data, mock_get_builder): self.assertEqual(b"shared", actual) - @patch("wp1.selection.models.combinator.logic_builder.get_builder") - @patch("wp1.selection.models.combinator._fetch_selection_data") + @patch("wp1.logic.builder.get_builder") + @patch("wp1.selection.models.combinator.Builder._fetch_selection_data") def test_build_empty_result(self, mock_fetch_selection_data, mock_get_builder): mock_get_builder.return_value = _reference_builder() mock_fetch_selection_data.return_value = b"" @@ -245,38 +233,6 @@ def test_build_empty_result(self, mock_fetch_selection_data, mock_get_builder): with self.assertRaises(Wp1FatalSelectionError): self.builder.build("text/tab-separated-values", **params) - @patch("wp1.selection.models.combinator.logic_builder.latest_selection_for") - def test_fetch_selection_data(self, mock_latest_selection): - mock_latest_selection.return_value = _selection() - s3 = MagicMock() - s3.download_fileobj.side_effect = lambda _key, buf: buf.write(b"first\n") - - actual = _fetch_selection_data(MagicMock(), s3, "builder-a") - - self.assertEqual(b"first\n", actual) - s3.download_fileobj.assert_called_once() - - @patch("wp1.selection.models.combinator.logic_builder.latest_selection_for") - def test_fetch_selection_data_failed_selection(self, mock_latest_selection): - mock_latest_selection.return_value = _selection(status=b"FAILED") - - with self.assertRaises(Wp1FatalSelectionError): - _fetch_selection_data(MagicMock(), MagicMock(), "builder-a") - - @patch("wp1.selection.models.combinator.logic_builder.latest_selection_for") - def test_fetch_selection_data_retryable_selection(self, mock_latest_selection): - mock_latest_selection.return_value = _selection(status=b"CAN_RETRY") - - with self.assertRaises(Wp1RetryableSelectionError): - _fetch_selection_data(MagicMock(), MagicMock(), "builder-a") - - @patch("wp1.selection.models.combinator.logic_builder.latest_selection_for") - def test_fetch_selection_data_missing_selection(self, mock_latest_selection): - mock_latest_selection.return_value = None - - with self.assertRaises(Wp1RetryableSelectionError): - _fetch_selection_data(MagicMock(), MagicMock(), "builder-a") - def test_validate_with_referenced_builder_in_db(self): self._insert_builder()