From d24a43cdda37031aa636ab8c53cb48a3b3e630d4 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Mon, 4 May 2026 16:27:16 +0200 Subject: [PATCH 01/14] Reimplement the dataset download manually --- fedivertex/cache.py | 146 +++++++++++++++++++++++++++++++++++++++ fedivertex/exceptions.py | 10 +++ fedivertex/main.py | 70 +++++++++---------- setup.py | 8 +-- tests/test_loader.py | 28 +++++--- 5 files changed, 209 insertions(+), 53 deletions(-) create mode 100644 fedivertex/cache.py create mode 100644 fedivertex/exceptions.py diff --git a/fedivertex/cache.py b/fedivertex/cache.py new file mode 100644 index 0000000..d4c4ca8 --- /dev/null +++ b/fedivertex/cache.py @@ -0,0 +1,146 @@ +import os +import zipfile +from datetime import datetime, timezone +from pathlib import Path +from turtle import down +from typing import Optional + +import requests +from platformdirs import user_cache_dir +from tqdm import tqdm + +from .exceptions import DownloadError + +_CHUNK_SIZE = 1024 + +DEFAULT_CACHE_DIR = user_cache_dir( + appname="fedivertex-dataset", + appauthor="MarcDamie", # optional but recommended on Windows +) + +DATASET_METADATA_URL = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" +LIGHT_DATASET_METADATA_URL = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" +DATASET_URL = ( + "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/download" +) +LIGHT_DATASET_URL = ( + "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/download" +) + + +def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase + response = requests.get( + url, + stream=True, + timeout=10, + ) + response.raise_for_status() + total = int(response.headers.get("Content-Length", 0)) + with ( + filepath.open("wb") as file, + tqdm( + desc=f"Downloading {url}...", + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar, + ): + for data in 
response.iter_content(chunk_size=_CHUNK_SIZE): + size = file.write(data) + bar.update(size) + +def clear_cache(cache_dir=Path(DEFAULT_CACHE_DIR)): + if os.path.exists(cache_dir): + os.rmdir(cache_dir) + +def check_for_update(light_dataset, cache_dir): + if light_dataset: + dataset_update_file = "last_update_reduced.txt" + metadata_url = LIGHT_DATASET_METADATA_URL + else: + dataset_update_file = "last_update_full.txt" + metadata_url = DATASET_METADATA_URL + + update_file_path = cache_dir / dataset_update_file + + if os.path.exists(update_file_path): + print("Cache found, checking for updates...") + with open(update_file_path, "r", encoding="utf-8") as update_file: + last_local_update = datetime.fromisoformat(update_file.read()) + + try: + resp = requests.get(metadata_url) + if resp.status_code != 200: + raise DownloadError( + f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" + ) + metadata = resp.json() + last_online_update = datetime.fromisoformat(metadata["dateModified"]).replace(tzinfo=timezone.utc) + except requests.RequestException as err: + raise DownloadError( + f"Could not retrieve dataset metadata ({str(err)})" + ) from err + except KeyError as err: + raise DownloadError( + f"Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" + ) from err + + if last_local_update > last_online_update: + print("Cache is up-to-date, no download necessary.") + return False + else + print("Cache is outdated, download necessary.") + return True + else: + print("No cache found, download necessary.") + return True + +def download_dataset(light_dataset, cache_dir): + if light_dataset: + data_url = LIGHT_DATASET_URL + dataset_dir = "reduced" + else: + data_url = DATASET_URL + dataset_dir = "full" + + archive_path = cache_dir / "archive.zip" + dataset_path = cache_dir / dataset_dir + + download_from_http(data_url, archive_path) + + + print("Decompressing the dataset...") + with zipfile.ZipFile(archive_path) as zip: + 
zip.extractall(dataset_path) + + os.remove(archive_path) + +def create_update_date_file(light_dataset, cache_dir): + if light_dataset: + dataset_update_file = "last_update_reduced.txt" + else: + dataset_update_file = "last_update_full.txt" + + update_file_path = cache_dir / dataset_update_file + + with open(update_file_path, "w", encoding="utf-8") as update_file: + date_now = datetime.now(timezone.utc).isoformat() + update_file.write(date_now) + +def init_cache(light_dataset: bool, cache_dir: Optional[Path | str] = None) -> Path: + if cache_dir is None: + cache_dir = DEFAULT_CACHE_DIR + + cache_dir = Path(cache_dir) + + if check_for_update(cache_dir=cache_dir,light_dataset=light_dataset): + clear_cache(cache_dir) # Remove existing cache files as outdated + + os.makedirs(cache_dir) # Recreate the cache + + download_dataset(cache_dir=cache_dir,light_dataset=light_dataset) + + create_update_date_file(cache_dir=cache_dir,light_dataset=light_dataset) + + return cache_dir diff --git a/fedivertex/exceptions.py b/fedivertex/exceptions.py new file mode 100644 index 0000000..21c4045 --- /dev/null +++ b/fedivertex/exceptions.py @@ -0,0 +1,10 @@ +class FedivertexException(Exception): + pass + + +class DownloadError(FedivertexException): + pass + + +class InteractionError(FedivertexException): + pass diff --git a/fedivertex/main.py b/fedivertex/main.py index a57f7e5..d9b49f2 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -1,12 +1,15 @@ import json +import os from types import NoneType from typing import List, Optional, Tuple -import mlcroissant as mlc import networkx as nx import networkx_temporal as tx from tqdm import tqdm +from .cache import init_cache +from .exceptions import InteractionError + class GraphLoader: VALID_GRAPH_TYPES = { @@ -25,18 +28,9 @@ class GraphLoader: } UNDIRECTED_GRAPHS = ["federation"] - def __init__(self, light_version=True): + def __init__(self, light_version=True, cache_dir=None): self.light_version = light_version - if 
self.light_version: - url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" - else: - url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" - try: - self.dataset = mlc.Dataset(jsonld=url) - except json.JSONDecodeError as err: - raise SystemError( - "Unexpected error from Croissant (try to empty Croissant's cache in ~/.cache/croissant)" - ) from err + self.CACHE_DIR = init_cache(light_version, cache_dir) def _check_input(self, software: str, graph_type: str) -> NoneType: """Verify that (software,graph type) combination exists @@ -45,23 +39,23 @@ def _check_input(self, software: str, graph_type: str) -> NoneType: :type software: str :param graph_type: graph type :type graph_type: str - :raises ValueError: if the software does not exist in the dataset - :raises ValueError: if the graph type does not exist for a given software + :raises InteractionError: if the software does not exist in the dataset + :raises InteractionError: if the graph type does not exist for a given software :return: Nothing :rtype: NoneType """ if software not in self.VALID_GRAPH_TYPES.keys(): - raise ValueError( + raise InteractionError( f"Invalid software! Valid software: {list(self.VALID_GRAPH_TYPES.keys())}" ) if graph_type not in self.VALID_GRAPH_TYPES[software]: - raise ValueError( + raise InteractionError( f"{graph_type} is not a valid graph type for {software}. 
Valid types: {self.VALID_GRAPH_TYPES[software]}" ) if self.light_version and software == "mastodon" and graph_type == "federation": - raise ValueError( + raise InteractionError( f"The graph {software} {graph_type} is not included in the light version of Fedivertex\n" "To download the full version, generate the dataset loader as follows: `GraphLoader(light_version=False)`" ) @@ -76,20 +70,20 @@ def _fetch_date_index(self, software: str, graph_type: str, index: int) -> str: :type graph_type: str :param index: :type index: int - :raises ValueError: if there is no graph available of the given type. - :raises ValueError: if the index is invalid + :raises InteractionError: if there is no graph available of the given type. + :raises InteractionError: if the index is invalid :return: date :rtype: str """ dates = self.list_available_dates(software, graph_type) if len(dates) == 0: - raise ValueError(f"No graph available for {software}+{graph_type}") + raise InteractionError(f"No graph available for {software}+{graph_type}") try: return dates[index] except Exception as err: - raise ValueError("Invalid index: " + str(index)) from err + raise InteractionError("Invalid index: " + str(index)) from err def _fetch_latest_date(self, software: str, graph_type: str) -> str: """Returns the latest date available for a given graph. @@ -98,14 +92,14 @@ def _fetch_latest_date(self, software: str, graph_type: str) -> str: :type software: str :param graph_type: :type graph_type: str - :raises ValueError: if there is no graph available of the given type. + :raises InteractionError: if there is no graph available of the given type. 
:return: date :rtype: str """ dates = self.list_available_dates(software, graph_type) if len(dates) == 0: - raise ValueError(f"No graph available for {software}+{graph_type}") + raise InteractionError(f"No graph available for {software}+{graph_type}") return dates[-1] @@ -114,7 +108,7 @@ def list_all_software(self) -> List[str]: def list_graph_types(self, software: str) -> List[str]: if software not in self.VALID_GRAPH_TYPES.keys(): - raise ValueError( + raise InteractionError( f"Invalid software! Valid software: {list(self.VALID_GRAPH_TYPES.keys())}" ) @@ -132,7 +126,7 @@ def list_available_dates(self, software: str, graph_type: str) -> List[str]: """ self._check_input(software, graph_type) - record_sets = list(self.dataset.metadata.record_sets) + record_sets = list(self.dataset.metadata.record_sets) # TODO dates = [] for record_set in record_sets: if "interactions.csv" not in record_set.uuid: @@ -169,14 +163,14 @@ def get_graph( :type only_largest_component: bool, optional :param disable_tqdm: disables the TQDM progress bars, defaults to False :type disable_tqdm: bool, optional - :raises ValueError: if both a date and an index are provided. + :raises InteractionError: if both a date and an index are provided. :return: a graph in the NetworkX format :rtype: nx.Graph """ self._check_input(software, graph_type) if index is not None and date is not None: - raise ValueError( + raise InteractionError( "You must provide either the date or the index of the graph, not both." 
) @@ -190,10 +184,10 @@ def get_graph( assert date is not None interactions_csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - interaction_records = self.dataset.records(interactions_csv_file) + interaction_records = self.dataset.records(interactions_csv_file) # TODO instances_csv_file = f"{software}/{graph_type}/{date}/instances.csv" - instance_records = self.dataset.records(instances_csv_file) + instance_records = self.dataset.records(instances_csv_file) # TODO if graph_type in self.UNDIRECTED_GRAPHS: graph = nx.Graph() @@ -254,7 +248,7 @@ def get_temporal_graph( :type date: Optional[Tuple[str, str]], optional :param disable_tqdm: disables the TQDM progress bars, defaults to False :type disable_tqdm: bool, optional - :raises ValueError: if both a date and an index are provided. + :raises InteractionError: if both a date and an index are provided. :return: a graph in the NetworkX format :rtype: tx.TemporalGraph """ @@ -275,16 +269,16 @@ def get_temporal_graph( # Fetch all graphs selected_dates = availables_dates elif index is not None and date is not None: - raise ValueError( + raise InteractionError( "You must provide either the date or the index range of the graph, not both." 
) elif index is not None: if len(index) > 2: - raise ValueError("Incorrect format for the index range") + raise InteractionError("Incorrect format for the index range") if index[0] > index[1]: - raise ValueError("Incorrect index range") + raise InteractionError("Incorrect index range") if index[0] < 0 or index[1] > len(availables_dates) - 1: - raise ValueError( + raise InteractionError( f"Indices are out of the acceptable range (0,{len(availables_dates) - 1})" ) @@ -292,20 +286,20 @@ def get_temporal_graph( else: # date is not None: assert date is not None if len(date) > 2: - raise ValueError("Incorrect format for the date range") + raise InteractionError("Incorrect format for the date range") min_date, max_date = date try: min_date = int(min_date) max_date = int(max_date) - except ValueError as err: - raise ValueError("Invalid date format") from err + except InteractionError as err: + raise InteractionError("Invalid date format") from err if ( min_date > int(availables_dates[-1]) or int(availables_dates[0]) > max_date ): - raise ValueError( + raise InteractionError( f"Indices not covering the available dates: ({availables_dates[0]},{availables_dates[-1]})" ) diff --git a/setup.py b/setup.py index c26e97d..73588c0 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="fedivertex", - version="1.0.0", + version="1.1.0", author="Marc DAMIE", author_email="marc.damie@inria.fr", description="Interface to download and interact with Fedivertex, the Fediverse Graph Dataset", @@ -15,12 +15,12 @@ long_description_content_type="text/markdown", packages=find_packages(), license="GPLv3", - python_requires=">=3.10", # To be compatible with mlcroissant + python_requires=">=3.10", install_requires=[ - "numpy<2.0", # To be compatible with mlcroissant - "mlcroissant", "networkx", "networkx-temporal", + "platformdirs", + "requests", "tqdm", ], extras_require={"test": ["pytest", "pytest-coverage"]}, diff --git a/tests/test_loader.py b/tests/test_loader.py index 
7f541cf..1cdf646 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,6 +1,7 @@ import pytest from fedivertex import GraphLoader +from fedivertex.exceptions import InteractionError def test_basic_lists(): @@ -20,7 +21,7 @@ def test_basic_lists(): for software in software_list: assert loader.list_graph_types(software) == loader.VALID_GRAPH_TYPES[software] - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.list_graph_types("NON-EXISTING SOFTWARE") @@ -47,7 +48,7 @@ def test_available_dates(): def test_index_selection(): loader = GraphLoader() - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader._fetch_date_index("peertube", "follow", 10000000000000000000000000) assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" @@ -59,13 +60,13 @@ def test_index_selection(): def test_get_graph_errors(): loader = GraphLoader() - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_graph("NON-EXISTING", "federation") - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_graph("peertube", "NON-EXISTING") - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_graph("peertube", "follow", date="20250203", index=3) @@ -172,24 +173,24 @@ def test_graph_consistency(): def test_get_temporal_graph(): loader = GraphLoader() - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_temporal_graph("NON-EXISTING", "federation") - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_temporal_graph("peertube", "NON-EXISTING") - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_temporal_graph( "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) ) - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) - with pytest.raises(ValueError): + 
with pytest.raises(InteractionError): loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) - with pytest.raises(ValueError): + with pytest.raises(InteractionError): loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) temporal_graph = loader.get_temporal_graph( @@ -203,3 +204,8 @@ def test_get_temporal_graph(): assert len(temporal_graph.temporal_nodes()) == 991 assert len(temporal_graph.temporal_edges()) == 133852 assert temporal_graph.number_of_snapshots() == 8 + + +def test_outdated_cache(): + loader = GraphLoader() + raise NotImplementedError # TODO From 2b6312704719373e0d31c3b188c029699b2ad280 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Mon, 4 May 2026 20:46:58 +0200 Subject: [PATCH 02/14] Debug the dataset download --- fedivertex/cache.py | 71 +++++++++++++++++++++++---------------------- fedivertex/main.py | 17 ++++------- setup.py | 1 + 3 files changed, 43 insertions(+), 46 deletions(-) diff --git a/fedivertex/cache.py b/fedivertex/cache.py index d4c4ca8..90d7c77 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -1,4 +1,5 @@ import os +import shutil import zipfile from datetime import datetime, timezone from pathlib import Path @@ -21,14 +22,16 @@ DATASET_METADATA_URL = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" LIGHT_DATASET_METADATA_URL = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" DATASET_URL = ( - "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/download" -) -LIGHT_DATASET_URL = ( - "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/download" + "https://www.kaggle.com/api/v1/datasets/download/marcdamie/fediverse-graph-dataset" ) +LIGHT_DATASET_URL = "https://www.kaggle.com/api/v1/datasets/download/marcdamie/fediverse-graph-dataset-reduced" + +def cache_subdir_name(light_version): + return "reduced" if light_version else "full" -def download_from_http(url: str, 
filepath: Path): # Inspired from Croissant ML codebase + +def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase response = requests.get( url, stream=True, @@ -39,7 +42,7 @@ def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML c with ( filepath.open("wb") as file, tqdm( - desc=f"Downloading {url}...", + desc="Downloading the dataset...", total=total, unit="iB", unit_scale=True, @@ -50,17 +53,15 @@ def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML c size = file.write(data) bar.update(size) + def clear_cache(cache_dir=Path(DEFAULT_CACHE_DIR)): if os.path.exists(cache_dir): - os.rmdir(cache_dir) + shutil.rmtree(cache_dir) + def check_for_update(light_dataset, cache_dir): - if light_dataset: - dataset_update_file = "last_update_reduced.txt" - metadata_url = LIGHT_DATASET_METADATA_URL - else: - dataset_update_file = "last_update_full.txt" - metadata_url = DATASET_METADATA_URL + dataset_update_file = f"last_update_{cache_subdir_name(light_dataset)}.txt" + metadata_url = LIGHT_DATASET_METADATA_URL if light_dataset else DATASET_METADATA_URL update_file_path = cache_dir / dataset_update_file @@ -76,51 +77,52 @@ def check_for_update(light_dataset, cache_dir): f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" ) metadata = resp.json() - last_online_update = datetime.fromisoformat(metadata["dateModified"]).replace(tzinfo=timezone.utc) + last_online_update = datetime.fromisoformat( + metadata["dateModified"] + ).replace(tzinfo=timezone.utc) except requests.RequestException as err: raise DownloadError( f"Could not retrieve dataset metadata ({str(err)})" ) from err except KeyError as err: raise DownloadError( - f"Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" + "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" ) from err if last_local_update > last_online_update: print("Cache is up-to-date, no download 
necessary.") return False - else + else: print("Cache is outdated, download necessary.") return True else: print("No cache found, download necessary.") return True + def download_dataset(light_dataset, cache_dir): - if light_dataset: - data_url = LIGHT_DATASET_URL - dataset_dir = "reduced" - else: - data_url = DATASET_URL - dataset_dir = "full" + data_url = LIGHT_DATASET_URL if light_dataset else DATASET_URL archive_path = cache_dir / "archive.zip" - dataset_path = cache_dir / dataset_dir + dataset_path = cache_dir / cache_subdir_name(light_version=light_dataset) download_from_http(data_url, archive_path) - print("Decompressing the dataset...") with zipfile.ZipFile(archive_path) as zip: - zip.extractall(dataset_path) + zip.extractall(cache_dir) + + # Rename the extracted folder to have a fixed name (without version) + roots = {Path(m).parts[0] for m in zip.namelist() if m.strip()} + if len(roots) == 1: + old_root = cache_dir / next(iter(roots)) + old_root.rename(dataset_path) os.remove(archive_path) + def create_update_date_file(light_dataset, cache_dir): - if light_dataset: - dataset_update_file = "last_update_reduced.txt" - else: - dataset_update_file = "last_update_full.txt" + dataset_update_file = f"last_update_{cache_subdir_name(light_dataset)}.txt" update_file_path = cache_dir / dataset_update_file @@ -128,19 +130,20 @@ def create_update_date_file(light_dataset, cache_dir): date_now = datetime.now(timezone.utc).isoformat() update_file.write(date_now) + def init_cache(light_dataset: bool, cache_dir: Optional[Path | str] = None) -> Path: if cache_dir is None: cache_dir = DEFAULT_CACHE_DIR cache_dir = Path(cache_dir) - if check_for_update(cache_dir=cache_dir,light_dataset=light_dataset): - clear_cache(cache_dir) # Remove existing cache files as outdated + if check_for_update(cache_dir=cache_dir, light_dataset=light_dataset): + clear_cache(cache_dir) # Remove existing cache files as outdated - os.makedirs(cache_dir) # Recreate the cache + os.makedirs(cache_dir) # 
Recreate the cache - download_dataset(cache_dir=cache_dir,light_dataset=light_dataset) + download_dataset(cache_dir=cache_dir, light_dataset=light_dataset) - create_update_date_file(cache_dir=cache_dir,light_dataset=light_dataset) + create_update_date_file(cache_dir=cache_dir, light_dataset=light_dataset) return cache_dir diff --git a/fedivertex/main.py b/fedivertex/main.py index d9b49f2..e8c99c1 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -1,4 +1,4 @@ -import json +import csv import os from types import NoneType from typing import List, Optional, Tuple @@ -7,7 +7,7 @@ import networkx_temporal as tx from tqdm import tqdm -from .cache import init_cache +from .cache import cache_subdir_name, init_cache from .exceptions import InteractionError @@ -31,6 +31,7 @@ class GraphLoader: def __init__(self, light_version=True, cache_dir=None): self.light_version = light_version self.CACHE_DIR = init_cache(light_version, cache_dir) + self.SUB_DIR = cache_subdir_name(light_version) def _check_input(self, software: str, graph_type: str) -> NoneType: """Verify that (software,graph type) combination exists @@ -125,17 +126,9 @@ def list_available_dates(self, software: str, graph_type: str) -> List[str]: :rtype: List[str] """ self._check_input(software, graph_type) + graph_path = self.CACHE_DIR / software / graph_type - record_sets = list(self.dataset.metadata.record_sets) # TODO - dates = [] - for record_set in record_sets: - if "interactions.csv" not in record_set.uuid: - continue - - software_i, graph_type_i, date_i, _file = record_set.uuid.split("/") - if software_i == software and graph_type_i == graph_type: - dates.append(date_i) - + dates = list(os.listdir(graph_path)) dates.sort() return dates diff --git a/setup.py b/setup.py index 73588c0..a1a287a 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ license="GPLv3", python_requires=">=3.10", install_requires=[ + "numpy<2.0", # Necessary for networkx-temporal "networkx", "networkx-temporal", "platformdirs", 
From c7908b564a76c1bfed2128a1db200818dc050c20 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Mon, 4 May 2026 21:12:09 +0200 Subject: [PATCH 03/14] Improve cache status handling --- fedivertex/cache.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fedivertex/cache.py b/fedivertex/cache.py index 90d7c77..3fed71b 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -2,6 +2,7 @@ import shutil import zipfile from datetime import datetime, timezone +from enum import Enum from pathlib import Path from turtle import down from typing import Optional @@ -27,6 +28,12 @@ LIGHT_DATASET_URL = "https://www.kaggle.com/api/v1/datasets/download/marcdamie/fediverse-graph-dataset-reduced" +class CacheStatus(Enum): + ABSENT = -1 + OUTDATED = 0 + UPTODATE = 1 + + def cache_subdir_name(light_version): return "reduced" if light_version else "full" @@ -91,13 +98,13 @@ def check_for_update(light_dataset, cache_dir): if last_local_update > last_online_update: print("Cache is up-to-date, no download necessary.") - return False + return CacheStatus.UPTODATE else: print("Cache is outdated, download necessary.") - return True + return CacheStatus.OUTDATED else: print("No cache found, download necessary.") - return True + return CacheStatus.ABSENT def download_dataset(light_dataset, cache_dir): @@ -137,10 +144,12 @@ def init_cache(light_dataset: bool, cache_dir: Optional[Path | str] = None) -> P cache_dir = Path(cache_dir) - if check_for_update(cache_dir=cache_dir, light_dataset=light_dataset): - clear_cache(cache_dir) # Remove existing cache files as outdated + cache_status = check_for_update(cache_dir=cache_dir, light_dataset=light_dataset) + if cache_status != CacheStatus.UPTODATE: + if cache_status == CacheStatus.OUTDATED: + clear_cache(cache_dir) - os.makedirs(cache_dir) # Recreate the cache + os.makedirs(cache_dir, exist_ok=True) # (Re)Create the cache if necessary download_dataset(cache_dir=cache_dir, light_dataset=light_dataset) From 
c427c6d2ada6c555af03b81e8dff1a1b66f67b5e Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 10:09:02 +0200 Subject: [PATCH 04/14] Adapted the interaction with the cache and added tests for the cache --- fedivertex/cache.py | 28 ++--- fedivertex/main.py | 92 ++++++++++------- tests/test_basic.py | 64 ++++++++++++ tests/test_cache.py | 49 +++++++++ tests/test_consistency.py | 123 ++++++++++++++++++++++ tests/test_errors.py | 55 ++++++++++ tests/test_loader.py | 211 -------------------------------------- 7 files changed, 359 insertions(+), 263 deletions(-) create mode 100644 tests/test_basic.py create mode 100644 tests/test_cache.py create mode 100644 tests/test_consistency.py create mode 100644 tests/test_errors.py delete mode 100644 tests/test_loader.py diff --git a/fedivertex/cache.py b/fedivertex/cache.py index 3fed71b..6593a35 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -29,6 +29,7 @@ class CacheStatus(Enum): + CORRUPTED = -2 ABSENT = -1 OUTDATED = 0 UPTODATE = 1 @@ -67,16 +68,18 @@ def clear_cache(cache_dir=Path(DEFAULT_CACHE_DIR)): def check_for_update(light_dataset, cache_dir): - dataset_update_file = f"last_update_{cache_subdir_name(light_dataset)}.txt" metadata_url = LIGHT_DATASET_METADATA_URL if light_dataset else DATASET_METADATA_URL - - update_file_path = cache_dir / dataset_update_file + update_file_path = cache_dir / cache_subdir_name(light_dataset) / "last_update.txt" if os.path.exists(update_file_path): - print("Cache found, checking for updates...") - with open(update_file_path, "r", encoding="utf-8") as update_file: - last_local_update = datetime.fromisoformat(update_file.read()) + try: + with open(update_file_path, "r", encoding="utf-8") as update_file: + last_local_update = datetime.fromisoformat(update_file.read()) + except ValueError: + print("Cache corrupted (invalid update date), download necessary.") + return CacheStatus.CORRUPTED + print("Cache found, checking for updates...") try: resp = 
requests.get(metadata_url) if resp.status_code != 200: @@ -129,9 +132,7 @@ def download_dataset(light_dataset, cache_dir): def create_update_date_file(light_dataset, cache_dir): - dataset_update_file = f"last_update_{cache_subdir_name(light_dataset)}.txt" - - update_file_path = cache_dir / dataset_update_file + update_file_path = cache_dir / cache_subdir_name(light_dataset) / "last_update.txt" with open(update_file_path, "w", encoding="utf-8") as update_file: date_now = datetime.now(timezone.utc).isoformat() @@ -143,13 +144,14 @@ def init_cache(light_dataset: bool, cache_dir: Optional[Path | str] = None) -> P cache_dir = DEFAULT_CACHE_DIR cache_dir = Path(cache_dir) + # Create the main cache directory if necessary + os.makedirs(cache_dir, exist_ok=True) cache_status = check_for_update(cache_dir=cache_dir, light_dataset=light_dataset) if cache_status != CacheStatus.UPTODATE: - if cache_status == CacheStatus.OUTDATED: - clear_cache(cache_dir) - - os.makedirs(cache_dir, exist_ok=True) # (Re)Create the cache if necessary + clear_cache( + cache_dir / cache_subdir_name(light_dataset) + ) # Clears the cache if exists download_dataset(cache_dir=cache_dir, light_dataset=light_dataset) diff --git a/fedivertex/main.py b/fedivertex/main.py index e8c99c1..eb026f1 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -126,7 +126,7 @@ def list_available_dates(self, software: str, graph_type: str) -> List[str]: :rtype: List[str] """ self._check_input(software, graph_type) - graph_path = self.CACHE_DIR / software / graph_type + graph_path = self.CACHE_DIR / self.SUB_DIR / software / graph_type dates = list(os.listdir(graph_path)) dates.sort() @@ -176,49 +176,63 @@ def get_graph( assert date is not None - interactions_csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - interaction_records = self.dataset.records(interactions_csv_file) # TODO - - instances_csv_file = f"{software}/{graph_type}/{date}/instances.csv" - instance_records = 
self.dataset.records(instances_csv_file) # TODO - if graph_type in self.UNDIRECTED_GRAPHS: graph = nx.Graph() else: graph = nx.DiGraph() - for record in tqdm( - instance_records, desc="Adding the nodes", disable=disable_tqdm - ): - host = record[instances_csv_file + "/host"].decode() - graph.add_node(host) - graph.nodes[host]["domain"] = host.split("[DOT]")[-1] - for col, val in record.items(): - col_name = col.split("/")[-1] - if type(val) is bytes: - val = val.decode() - if col_name not in ["host", "Id", "Label"]: - graph.nodes[host][col_name] = val - - for record in tqdm( - interaction_records, desc="Adding the edges", disable=disable_tqdm - ): - source = record[interactions_csv_file + "/Source"].decode() - target = record[interactions_csv_file + "/Target"].decode() - weight = record[interactions_csv_file + "/Weight"] - graph.add_edge(source, target, weight=weight) - - if only_largest_component: - if graph_type in self.UNDIRECTED_GRAPHS: - largest_cc = max(nx.connected_components(graph), key=len) - else: - largest_cc = max( - nx.strongly_connected_components(graph), key=len, default=() - ) - - graph = graph.subgraph(largest_cc).copy() - - return graph + instances_csv_file = ( + self.CACHE_DIR + / self.SUB_DIR + / software + / graph_type + / date + / "instances.csv" + ) + with open(instances_csv_file, "r", encoding="utf-8") as csvfile: + record_reader = csv.DictReader(csvfile) + for record in tqdm( + record_reader, desc="Adding the nodes", disable=disable_tqdm + ): + host = record["host"] + graph.add_node(host) + graph.nodes[host]["domain"] = host.split("[DOT]")[-1] + for col, val in record.items(): + col_name = col.split("/")[-1] + + if col_name not in ["host", "Id", "Label"]: + graph.nodes[host][col_name] = val + + interactions_csv_file = ( + self.CACHE_DIR + / self.SUB_DIR + / software + / graph_type + / date + / "interactions.csv" + ) + + with open(interactions_csv_file, "r", encoding="utf-8") as csvfile: + record_reader = csv.DictReader(csvfile) + for record 
in tqdm( + record_reader, desc="Adding the edges", disable=disable_tqdm + ): + source = record["Source"] + target = record["Target"] + weight = record["Weight"] + graph.add_edge(source, target, weight=weight) + + if only_largest_component: + if isinstance(graph, nx.DiGraph): + largest_cc = max( + nx.strongly_connected_components(graph), key=len, default=() + ) + else: + largest_cc = max(nx.connected_components(graph), key=len) + + graph = graph.subgraph(largest_cc).copy() + + return graph def get_temporal_graph( self, diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..341a3b5 --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,64 @@ +from fedivertex import GraphLoader + + +def test_basic_lists(): + software_list = [ + "bookwyrm", + "friendica", + "lemmy", + "mastodon", + "misskey", + "peertube", + "pleroma", + ] + + loader = GraphLoader() + assert loader.list_all_software() == software_list + + for software in software_list: + assert loader.list_graph_types(software) == loader.VALID_GRAPH_TYPES[software] + + +def test_available_dates(): + loader = GraphLoader() + peertube_dates = loader.list_available_dates("peertube", "follow") + assert set(peertube_dates).issuperset( + { + "20250203", + "20250210", + "20250217", + "20250224", + "20250303", + "20250311", + "20250317", + "20250324", + } + ) + + peertube_dates.sort() + assert loader._fetch_latest_date("peertube", "follow") == peertube_dates[-1] + + +def test_index_selection(): + loader = GraphLoader() + + assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" + + latest_date = loader._fetch_latest_date("peertube", "follow") + assert loader._fetch_date_index("peertube", "follow", -1) == latest_date + + +def test_get_temporal_graph(): + loader = GraphLoader() + + temporal_graph = loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250617") + ) + assert len(temporal_graph.temporal_nodes()) == 1157 + assert len(temporal_graph.temporal_edges()) == 
310695 + assert temporal_graph.number_of_snapshots() == 20 + + temporal_graph = loader.get_temporal_graph("peertube", "follow", index=(0, 7)) + assert len(temporal_graph.temporal_nodes()) == 991 + assert len(temporal_graph.temporal_edges()) == 133852 + assert temporal_graph.number_of_snapshots() == 8 diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..cd60b94 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,49 @@ +import os + +from fedivertex import GraphLoader +from fedivertex.cache import clear_cache + + +def test_cache_status(capsys): + clear_cache() + _loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "No cache found, download necessary.\nDecompressing the dataset...\n" + == captured.out + ) + del _loader + + loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "Cache found, checking for updates...\nCache is up-to-date, no download necessary.\n" + == captured.out + ) + + update_file_path = loader.CACHE_DIR / "reduced" / "last_update.txt" + os.remove(update_file_path) + with open(update_file_path, "w") as update_file: + update_file.write("INVALID DATA") + + del loader + + _loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "Cache corrupted (invalid update date), download necessary.\nDecompressing the dataset...\n" + == captured.out + ) + del _loader + + os.remove(update_file_path) + with open(update_file_path, "w") as update_file: + update_file.write("2019-05-05T07:24:39.383197+00:00") + + _loader = GraphLoader() + captured = capsys.readouterr() + assert ( + "Cache found, checking for updates...\nCache is outdated, download necessary.\nDecompressing the dataset...\n" + == captured.out + ) + del _loader diff --git a/tests/test_consistency.py b/tests/test_consistency.py new file mode 100644 index 0000000..fb0eb89 --- /dev/null +++ b/tests/test_consistency.py @@ -0,0 +1,123 @@ +import pytest + +from fedivertex import GraphLoader + + +def _iter_software_graph(): + loader = 
GraphLoader() + for software, graph_types in loader.VALID_GRAPH_TYPES.items(): + if software == "mastodon": + continue + for graph_type in graph_types: + if graph_type == "federation": + continue + yield software, graph_type + + +@pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) +def test_get_graph_selection(software, graph_type): + loader = GraphLoader() + + date = loader._fetch_latest_date(software, graph_type) + + # Test date selection + graph1 = loader.get_graph(software, graph_type, date=date) + + if not graph_type == "federation": # Because Federation is undirected + csv_file = ( + loader.CACHE_DIR + / loader.SUB_DIR + / software + / graph_type + / date + / "interactions.csv" + ) + + with open(csv_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + line_count -= 1 # Remove the header from the count + + assert graph1.number_of_edges() == line_count + + # Test index selection + graph2 = loader.get_graph(software, graph_type, index=-1) + assert graph1.number_of_edges() == graph2.number_of_edges() + + available_dates = loader.list_available_dates(software, graph_type) + date = available_dates[0] + graph3 = loader.get_graph(software, graph_type, date=date) + + graph4 = loader.get_graph(software, graph_type, index=0) + assert graph3.number_of_edges() == graph4.number_of_edges() + + +def _iter_software_graph_date(): + loader = GraphLoader() + for software, graph_types in loader.VALID_GRAPH_TYPES.items(): + if software == "mastodon": + continue + for graph_type in graph_types: + if graph_type == "federation": + continue + for date in loader.list_available_dates(software, graph_type): + yield software, graph_type, date + + +@pytest.mark.parametrize("software,graph_type,date", list(_iter_software_graph_date())) +def test_get_graph_sizes(software, graph_type, date): + loader = GraphLoader() + + graph = loader.get_graph(software, graph_type, date=date) + csv_file = ( + loader.CACHE_DIR + / loader.SUB_DIR + / software + / 
graph_type + / date + / "interactions.csv" + ) + + with open(csv_file, "r", encoding="utf-8") as f: + line_count = sum(1 for _ in f) + line_count -= 1 # Remove the header from the count + + assert graph.number_of_edges() == line_count # Verify that we load all the edges + # NB: an error can also occur in case of data cleaning issue in the dataset + + +def test_graph_consistency(): + loader = GraphLoader() + + # Check graph consistency + peertube_graph = loader.get_graph("peertube", "follow", date="20250324") + assert peertube_graph.number_of_edges() == 19171 + assert peertube_graph.number_of_nodes() == 883 + + # Check node attributes + assert peertube_graph.nodes["aperi[DOT]tube"] == { + "domain": "tube", + "totalUsers": 39, + "totalDailyActiveUsers": 0.0, + "totalWeeklyActiveUsers": 4.0, + "totalMonthlyActiveUsers": 8.0, + "totalLocalVideos": 638, + "totalVideos": 1287, + "totalLocalPlaylists": 26.0, + "totalVideoComments": 4632, + "totalLocalVideoComments": 44, + "totalLocalVideoViews": 106216, + "serverVersion": "7.1.0", + } + + # Check largest component consistency + peertube_graph = loader.get_graph( # DIRECTED GRAPH + "peertube", "follow", date="20250324", only_largest_component=True + ) + assert peertube_graph.number_of_edges() == 7450 + assert peertube_graph.number_of_nodes() == 264 + + bookwyrm_graph = loader.get_graph( + "bookwyrm", "federation", date="20250324", only_largest_component=True + ) + assert bookwyrm_graph.number_of_nodes() == 70 + assert bookwyrm_graph.number_of_edges() == 1827 diff --git a/tests/test_errors.py b/tests/test_errors.py new file mode 100644 index 0000000..14f2c5a --- /dev/null +++ b/tests/test_errors.py @@ -0,0 +1,55 @@ +import pytest + +from fedivertex import GraphLoader +from fedivertex.exceptions import InteractionError + + +def test_list_error(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader.list_graph_types("NON-EXISTING SOFTWARE") + + +def test_index_selection_error(): + loader = GraphLoader() 
+ + with pytest.raises(InteractionError): + loader._fetch_date_index("peertube", "follow", 10000000000000000000000000) + + +def test_get_graph_errors(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader.get_graph("NON-EXISTING", "federation") + + with pytest.raises(InteractionError): + loader.get_graph("peertube", "NON-EXISTING") + + with pytest.raises(InteractionError): + loader.get_graph("peertube", "follow", date="20250203", index=3) + + +def test_get_temporal_graph_errors(): + loader = GraphLoader() + + with pytest.raises(InteractionError): + loader.get_temporal_graph("NON-EXISTING", "federation") + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "NON-EXISTING") + + with pytest.raises(InteractionError): + loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) + ) + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) + + with pytest.raises(InteractionError): + loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) diff --git a/tests/test_loader.py b/tests/test_loader.py deleted file mode 100644 index 1cdf646..0000000 --- a/tests/test_loader.py +++ /dev/null @@ -1,211 +0,0 @@ -import pytest - -from fedivertex import GraphLoader -from fedivertex.exceptions import InteractionError - - -def test_basic_lists(): - software_list = [ - "bookwyrm", - "friendica", - "lemmy", - "mastodon", - "misskey", - "peertube", - "pleroma", - ] - - loader = GraphLoader() - assert loader.list_all_software() == software_list - - for software in software_list: - assert loader.list_graph_types(software) == loader.VALID_GRAPH_TYPES[software] - - with pytest.raises(InteractionError): - loader.list_graph_types("NON-EXISTING SOFTWARE") - - -def test_available_dates(): - loader = GraphLoader() - 
peertube_dates = loader.list_available_dates("peertube", "follow") - assert set(peertube_dates).issuperset( - { - "20250203", - "20250210", - "20250217", - "20250224", - "20250303", - "20250311", - "20250317", - "20250324", - } - ) - - peertube_dates.sort() - assert loader._fetch_latest_date("peertube", "follow") == peertube_dates[-1] - - -def test_index_selection(): - loader = GraphLoader() - - with pytest.raises(InteractionError): - loader._fetch_date_index("peertube", "follow", 10000000000000000000000000) - - assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" - - latest_date = loader._fetch_latest_date("peertube", "follow") - assert loader._fetch_date_index("peertube", "follow", -1) == latest_date - - -def test_get_graph_errors(): - loader = GraphLoader() - - with pytest.raises(InteractionError): - loader.get_graph("NON-EXISTING", "federation") - - with pytest.raises(InteractionError): - loader.get_graph("peertube", "NON-EXISTING") - - with pytest.raises(InteractionError): - loader.get_graph("peertube", "follow", date="20250203", index=3) - - -def _iter_software_graph(): - loader = GraphLoader() - for software, graph_types in loader.VALID_GRAPH_TYPES.items(): - if software == "mastodon": - continue - for graph_type in graph_types: - if graph_type == "federation": - continue - yield software, graph_type - - -@pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) -def test_get_graph_selection(software, graph_type): - loader = GraphLoader() - - date = loader._fetch_latest_date(software, graph_type) - - # Test date selection - graph1 = loader.get_graph(software, graph_type, date=date) - - if not graph_type == "federation": # Because Federation is undirected - csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - records = loader.dataset.records(csv_file) - - assert graph1.number_of_edges() == len(list(records)) - - # Test index selection - graph2 = loader.get_graph(software, graph_type, index=-1) - assert 
graph1.number_of_edges() == graph2.number_of_edges() - - available_dates = loader.list_available_dates(software, graph_type) - date = available_dates[0] - graph3 = loader.get_graph(software, graph_type, date=date) - - graph4 = loader.get_graph(software, graph_type, index=0) - assert graph3.number_of_edges() == graph4.number_of_edges() - - -def _iter_software_graph_date(): - loader = GraphLoader() - for software, graph_types in loader.VALID_GRAPH_TYPES.items(): - if software == "mastodon": - continue - for graph_type in graph_types: - if graph_type == "federation": - continue - for date in loader.list_available_dates(software, graph_type): - yield software, graph_type, date - - -@pytest.mark.parametrize("software,graph_type,date", list(_iter_software_graph_date())) -def test_get_graph_sizes(software, graph_type, date): - loader = GraphLoader() - - graph = loader.get_graph(software, graph_type, date=date) - csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - records = list(loader.dataset.records(csv_file)) - - assert graph.number_of_edges() == len(records) # Verify that we load all the edges - # NB: an error can also occur in case of data cleaning issue in the dataset - - -def test_graph_consistency(): - loader = GraphLoader() - - # Check graph consistency - peertube_graph = loader.get_graph("peertube", "follow", date="20250324") - assert peertube_graph.number_of_edges() == 19171 - assert peertube_graph.number_of_nodes() == 883 - - # Check node attributes - assert peertube_graph.nodes["aperi[DOT]tube"] == { - "domain": "tube", - "totalUsers": 39, - "totalDailyActiveUsers": 0.0, - "totalWeeklyActiveUsers": 4.0, - "totalMonthlyActiveUsers": 8.0, - "totalLocalVideos": 638, - "totalVideos": 1287, - "totalLocalPlaylists": 26.0, - "totalVideoComments": 4632, - "totalLocalVideoComments": 44, - "totalLocalVideoViews": 106216, - "serverVersion": "7.1.0", - } - - # Check largest component consistency - peertube_graph = loader.get_graph( # DIRECTED GRAPH - 
"peertube", "follow", date="20250324", only_largest_component=True - ) - assert peertube_graph.number_of_edges() == 7450 - assert peertube_graph.number_of_nodes() == 264 - - bookwyrm_graph = loader.get_graph( - "bookwyrm", "federation", date="20250324", only_largest_component=True - ) - assert bookwyrm_graph.number_of_nodes() == 70 - assert bookwyrm_graph.number_of_edges() == 1827 - - -def test_get_temporal_graph(): - loader = GraphLoader() - - with pytest.raises(InteractionError): - loader.get_temporal_graph("NON-EXISTING", "federation") - - with pytest.raises(InteractionError): - loader.get_temporal_graph("peertube", "NON-EXISTING") - - with pytest.raises(InteractionError): - loader.get_temporal_graph( - "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) - ) - - with pytest.raises(InteractionError): - loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) - - with pytest.raises(InteractionError): - loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) - - with pytest.raises(InteractionError): - loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) - - temporal_graph = loader.get_temporal_graph( - "peertube", "follow", date=("20250203", "20250617") - ) - assert len(temporal_graph.temporal_nodes()) == 1157 - assert len(temporal_graph.temporal_edges()) == 310695 - assert temporal_graph.number_of_snapshots() == 20 - - temporal_graph = loader.get_temporal_graph("peertube", "follow", index=(0, 7)) - assert len(temporal_graph.temporal_nodes()) == 991 - assert len(temporal_graph.temporal_edges()) == 133852 - assert temporal_graph.number_of_snapshots() == 8 - - -def test_outdated_cache(): - loader = GraphLoader() - raise NotImplementedError # TODO From f3e50e5874474f0ea341c5380cb5137b2343c986 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 11:29:22 +0200 Subject: [PATCH 05/14] Refactored the cache information storage --- fedivertex/cache.py | 121 ++++++++++++++++++++++++-------------------- 
fedivertex/main.py | 32 +++++------- tests/test_cache.py | 19 +++++-- 3 files changed, 95 insertions(+), 77 deletions(-) diff --git a/fedivertex/cache.py b/fedivertex/cache.py index 6593a35..ef6786b 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -1,10 +1,9 @@ import os import shutil import zipfile -from datetime import datetime, timezone +from datetime import datetime from enum import Enum from pathlib import Path -from turtle import down from typing import Optional import requests @@ -13,7 +12,7 @@ from .exceptions import DownloadError -_CHUNK_SIZE = 1024 +_CHUNK_SIZE = 1024 * 1024 DEFAULT_CACHE_DIR = user_cache_dir( appname="fedivertex-dataset", @@ -35,8 +34,32 @@ class CacheStatus(Enum): UPTODATE = 1 -def cache_subdir_name(light_version): - return "reduced" if light_version else "full" +class DatasetInfo: + def __init__(self, cache_dir: Path, light_dataset: bool): + self.cache_root = cache_dir + self.light_version = light_dataset + self.dataset_dir = cache_dir / ("reduced" if self.light_version else "full") + metadata_url = ( + LIGHT_DATASET_METADATA_URL if self.light_version else DATASET_METADATA_URL + ) + self.data_url = LIGHT_DATASET_URL if self.light_version else DATASET_URL + + try: + resp = requests.get(metadata_url, timeout=10) + if resp.status_code != 200: + raise DownloadError( + f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" + ) + metadata = resp.json() + self.last_update = metadata["dateModified"] + except requests.RequestException as err: + raise DownloadError( + f"Could not retrieve dataset metadata ({str(err)})" + ) from err + except KeyError as err: + raise DownloadError( + "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" + ) from err def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase @@ -47,8 +70,10 @@ def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML ) response.raise_for_status() total = 
int(response.headers.get("Content-Length", 0)) + + tmp_path = filepath.with_suffix(".tmp") with ( - filepath.open("wb") as file, + tmp_path.open("wb") as file, tqdm( desc="Downloading the dataset...", total=total, @@ -61,17 +86,20 @@ def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML size = file.write(data) bar.update(size) + tmp_path.replace(filepath) + + +def clear_default_cache(): + cache_dir = Path(DEFAULT_CACHE_DIR) -def clear_cache(cache_dir=Path(DEFAULT_CACHE_DIR)): - if os.path.exists(cache_dir): + if cache_dir.exists(): shutil.rmtree(cache_dir) -def check_for_update(light_dataset, cache_dir): - metadata_url = LIGHT_DATASET_METADATA_URL if light_dataset else DATASET_METADATA_URL - update_file_path = cache_dir / cache_subdir_name(light_dataset) / "last_update.txt" +def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: + update_file_path = dataset_info.dataset_dir / "last_update.txt" - if os.path.exists(update_file_path): + if update_file_path.exists(): try: with open(update_file_path, "r", encoding="utf-8") as update_file: last_local_update = datetime.fromisoformat(update_file.read()) @@ -80,26 +108,8 @@ def check_for_update(light_dataset, cache_dir): return CacheStatus.CORRUPTED print("Cache found, checking for updates...") - try: - resp = requests.get(metadata_url) - if resp.status_code != 200: - raise DownloadError( - f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" - ) - metadata = resp.json() - last_online_update = datetime.fromisoformat( - metadata["dateModified"] - ).replace(tzinfo=timezone.utc) - except requests.RequestException as err: - raise DownloadError( - f"Could not retrieve dataset metadata ({str(err)})" - ) from err - except KeyError as err: - raise DownloadError( - "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" - ) from err - if last_local_update > last_online_update: + if last_local_update >= datetime.fromisoformat(dataset_info.last_update): 
print("Cache is up-to-date, no download necessary.") return CacheStatus.UPTODATE else: @@ -110,51 +120,54 @@ def check_for_update(light_dataset, cache_dir): return CacheStatus.ABSENT -def download_dataset(light_dataset, cache_dir): - data_url = LIGHT_DATASET_URL if light_dataset else DATASET_URL +def download_dataset(dataset_info: DatasetInfo): + archive_path = dataset_info.cache_root / "archive.zip" - archive_path = cache_dir / "archive.zip" - dataset_path = cache_dir / cache_subdir_name(light_version=light_dataset) - - download_from_http(data_url, archive_path) + download_from_http(dataset_info.data_url, archive_path) print("Decompressing the dataset...") with zipfile.ZipFile(archive_path) as zip: - zip.extractall(cache_dir) + zip.extractall(dataset_info.cache_root) # Rename the extracted folder to have a fixed name (without version) roots = {Path(m).parts[0] for m in zip.namelist() if m.strip()} if len(roots) == 1: - old_root = cache_dir / next(iter(roots)) - old_root.rename(dataset_path) + old_root = dataset_info.cache_root / next(iter(roots)) + old_root.rename(dataset_info.dataset_dir) - os.remove(archive_path) + archive_path.unlink() -def create_update_date_file(light_dataset, cache_dir): - update_file_path = cache_dir / cache_subdir_name(light_dataset) / "last_update.txt" +def create_update_date_file(dataset_info: DatasetInfo): + update_file_path = dataset_info.dataset_dir / "last_update.txt" with open(update_file_path, "w", encoding="utf-8") as update_file: - date_now = datetime.now(timezone.utc).isoformat() - update_file.write(date_now) + update_file.write(dataset_info.last_update) -def init_cache(light_dataset: bool, cache_dir: Optional[Path | str] = None) -> Path: +def init_cache( + light_dataset: bool, cache_dir: Optional[Path | str] = None +) -> DatasetInfo: if cache_dir is None: cache_dir = DEFAULT_CACHE_DIR - cache_dir = Path(cache_dir) # Create the main cache directory if necessary os.makedirs(cache_dir, exist_ok=True) + return 
DatasetInfo(cache_dir, light_dataset) + + +def load_dataset( + light_dataset: bool, cache_dir: Optional[Path | str] = None +) -> DatasetInfo: + dataset_info = init_cache(light_dataset, cache_dir) - cache_status = check_for_update(cache_dir=cache_dir, light_dataset=light_dataset) + cache_status = check_for_update(dataset_info) if cache_status != CacheStatus.UPTODATE: - clear_cache( - cache_dir / cache_subdir_name(light_dataset) - ) # Clears the cache if exists + if dataset_info.dataset_dir.exists(): + shutil.rmtree(dataset_info.dataset_dir) - download_dataset(cache_dir=cache_dir, light_dataset=light_dataset) + download_dataset(dataset_info) - create_update_date_file(cache_dir=cache_dir, light_dataset=light_dataset) + create_update_date_file(dataset_info) - return cache_dir + return dataset_info diff --git a/fedivertex/main.py b/fedivertex/main.py index eb026f1..24f2a4e 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -1,5 +1,6 @@ import csv import os +from pathlib import Path from types import NoneType from typing import List, Optional, Tuple @@ -7,7 +8,7 @@ import networkx_temporal as tx from tqdm import tqdm -from .cache import cache_subdir_name, init_cache +from .cache import load_dataset from .exceptions import InteractionError @@ -29,9 +30,10 @@ class GraphLoader: UNDIRECTED_GRAPHS = ["federation"] def __init__(self, light_version=True, cache_dir=None): - self.light_version = light_version - self.CACHE_DIR = init_cache(light_version, cache_dir) - self.SUB_DIR = cache_subdir_name(light_version) + self.DATASET_INFO = load_dataset(light_version, cache_dir) + + def _graph_dir(self, software: str, graph_type: str, date: str) -> Path: + return self.DATASET_INFO.dataset_dir / software / graph_type / date def _check_input(self, software: str, graph_type: str) -> NoneType: """Verify that (software,graph type) combination exists @@ -55,7 +57,11 @@ def _check_input(self, software: str, graph_type: str) -> NoneType: f"{graph_type} is not a valid graph type for 
{software}. Valid types: {self.VALID_GRAPH_TYPES[software]}" ) - if self.light_version and software == "mastodon" and graph_type == "federation": + if ( + self.DATASET_INFO.light_version + and software == "mastodon" + and graph_type == "federation" + ): raise InteractionError( f"The graph {software} {graph_type} is not included in the light version of Fedivertex\n" "To download the full version, generate the dataset loader as follows: `GraphLoader(light_version=False)`" @@ -126,7 +132,7 @@ def list_available_dates(self, software: str, graph_type: str) -> List[str]: :rtype: List[str] """ self._check_input(software, graph_type) - graph_path = self.CACHE_DIR / self.SUB_DIR / software / graph_type + graph_path = self.DATASET_INFO.dataset_dir / software / graph_type dates = list(os.listdir(graph_path)) dates.sort() @@ -182,12 +188,7 @@ def get_graph( graph = nx.DiGraph() instances_csv_file = ( - self.CACHE_DIR - / self.SUB_DIR - / software - / graph_type - / date - / "instances.csv" + self._graph_dir(software, graph_type, date) / "instances.csv" ) with open(instances_csv_file, "r", encoding="utf-8") as csvfile: record_reader = csv.DictReader(csvfile) @@ -204,12 +205,7 @@ def get_graph( graph.nodes[host][col_name] = val interactions_csv_file = ( - self.CACHE_DIR - / self.SUB_DIR - / software - / graph_type - / date - / "interactions.csv" + self._graph_dir(software, graph_type, date) / "interactions.csv" ) with open(interactions_csv_file, "r", encoding="utf-8") as csvfile: diff --git a/tests/test_cache.py b/tests/test_cache.py index cd60b94..03861c3 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,11 +1,21 @@ import os +from pathlib import Path from fedivertex import GraphLoader -from fedivertex.cache import clear_cache +from fedivertex.cache import DEFAULT_CACHE_DIR, clear_default_cache + + +def test_cache_removal(): + cache_path = Path(DEFAULT_CACHE_DIR) + assert cache_path.exists() + + clear_default_cache() + + assert not cache_path.exists() def 
test_cache_status(capsys): - clear_cache() + clear_default_cache() _loader = GraphLoader() captured = capsys.readouterr() assert ( @@ -21,7 +31,7 @@ def test_cache_status(capsys): == captured.out ) - update_file_path = loader.CACHE_DIR / "reduced" / "last_update.txt" + update_file_path = loader.DATASET_INFO.dataset_dir / "last_update.txt" os.remove(update_file_path) with open(update_file_path, "w") as update_file: update_file.write("INVALID DATA") @@ -38,7 +48,7 @@ def test_cache_status(capsys): os.remove(update_file_path) with open(update_file_path, "w") as update_file: - update_file.write("2019-05-05T07:24:39.383197+00:00") + update_file.write("2016-04-24T12:08:29.887") _loader = GraphLoader() captured = capsys.readouterr() @@ -46,4 +56,3 @@ def test_cache_status(capsys): "Cache found, checking for updates...\nCache is outdated, download necessary.\nDecompressing the dataset...\n" == captured.out ) - del _loader From 611736d760ded570c160db21883d37d158c1a6ef Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 11:34:03 +0200 Subject: [PATCH 06/14] Fix a unit test --- tests/test_consistency.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_consistency.py b/tests/test_consistency.py index fb0eb89..74a1fa8 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -25,8 +25,7 @@ def test_get_graph_selection(software, graph_type): if not graph_type == "federation": # Because Federation is undirected csv_file = ( - loader.CACHE_DIR - / loader.SUB_DIR + loader.DATASET_INFO.dataset_dir / software / graph_type / date @@ -69,8 +68,7 @@ def test_get_graph_sizes(software, graph_type, date): graph = loader.get_graph(software, graph_type, date=date) csv_file = ( - loader.CACHE_DIR - / loader.SUB_DIR + loader.DATASET_INFO.dataset_dir / software / graph_type / date From 1b59339955a6eb51f20601c277e6b64f2443f085 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 12:15:11 +0200 Subject: [PATCH 07/14] Add a 
"cache-only" option to optimize dataset loading --- .github/workflows/pytest.yml | 2 +- fedivertex/cache.py | 77 +++++++++++++++++++++--------------- fedivertex/exceptions.py | 4 ++ fedivertex/main.py | 4 +- tests/test_basic.py | 9 ----- tests/test_consistency.py | 49 +++++++++++++---------- tests/test_errors.py | 31 ++++++++++++++- 7 files changed, 111 insertions(+), 65 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 662165d..a463a2a 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -23,4 +23,4 @@ jobs: pip install .[test] - name: Test with pytest run: | - pytest + pytest -vv diff --git a/fedivertex/cache.py b/fedivertex/cache.py index ef6786b..55f0944 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -10,7 +10,7 @@ from platformdirs import user_cache_dir from tqdm import tqdm -from .exceptions import DownloadError +from .exceptions import CacheError, DownloadError _CHUNK_SIZE = 1024 * 1024 @@ -34,8 +34,16 @@ class CacheStatus(Enum): UPTODATE = 1 +def read_last_update(filepath): + try: + with open(filepath, "r", encoding="utf-8") as update_file: + return datetime.fromisoformat(update_file.read()) + except ValueError: + raise CacheError("Cache corrupted (invalid update date), download necessary.") + + class DatasetInfo: - def __init__(self, cache_dir: Path, light_dataset: bool): + def __init__(self, cache_dir: Path, light_dataset: bool, cache_only: bool): self.cache_root = cache_dir self.light_version = light_dataset self.dataset_dir = cache_dir / ("reduced" if self.light_version else "full") @@ -44,22 +52,29 @@ def __init__(self, cache_dir: Path, light_dataset: bool): ) self.data_url = LIGHT_DATASET_URL if self.light_version else DATASET_URL - try: - resp = requests.get(metadata_url, timeout=10) - if resp.status_code != 200: + if cache_only: + last_update_file = self.dataset_dir / "last_update.txt" + if last_update_file.exists(): + self.last_update = 
read_last_update(last_update_file) + else: + raise CacheError("No cache found... incompatible with cache_only=True") + else: + try: + resp = requests.get(metadata_url, timeout=10) + if resp.status_code != 200: + raise DownloadError( + f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" + ) + metadata = resp.json() + self.last_update = metadata["dateModified"] + except requests.RequestException as err: + raise DownloadError( + f"Could not retrieve dataset metadata ({str(err)})" + ) from err + except KeyError as err: raise DownloadError( - f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" - ) - metadata = resp.json() - self.last_update = metadata["dateModified"] - except requests.RequestException as err: - raise DownloadError( - f"Could not retrieve dataset metadata ({str(err)})" - ) from err - except KeyError as err: - raise DownloadError( - "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" - ) from err + "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" + ) from err def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase @@ -101,10 +116,9 @@ def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: if update_file_path.exists(): try: - with open(update_file_path, "r", encoding="utf-8") as update_file: - last_local_update = datetime.fromisoformat(update_file.read()) - except ValueError: - print("Cache corrupted (invalid update date), download necessary.") + last_local_update = read_last_update(update_file_path) + except CacheError as err: + print(str(err)) return CacheStatus.CORRUPTED print("Cache found, checking for updates...") @@ -146,28 +160,29 @@ def create_update_date_file(dataset_info: DatasetInfo): def init_cache( - light_dataset: bool, cache_dir: Optional[Path | str] = None + light_dataset: bool, cache_dir: Optional[Path | str] = None, cache_only=False ) -> DatasetInfo: if cache_dir is None: cache_dir = 
DEFAULT_CACHE_DIR cache_dir = Path(cache_dir) # Create the main cache directory if necessary os.makedirs(cache_dir, exist_ok=True) - return DatasetInfo(cache_dir, light_dataset) + return DatasetInfo(cache_dir, light_dataset, cache_only) def load_dataset( - light_dataset: bool, cache_dir: Optional[Path | str] = None + light_dataset: bool, cache_dir: Optional[Path | str] = None, cache_only=False ) -> DatasetInfo: - dataset_info = init_cache(light_dataset, cache_dir) + dataset_info = init_cache(light_dataset, cache_dir, cache_only) - cache_status = check_for_update(dataset_info) - if cache_status != CacheStatus.UPTODATE: - if dataset_info.dataset_dir.exists(): - shutil.rmtree(dataset_info.dataset_dir) + if not cache_only: + cache_status = check_for_update(dataset_info) + if cache_status != CacheStatus.UPTODATE: + if dataset_info.dataset_dir.exists(): + shutil.rmtree(dataset_info.dataset_dir) - download_dataset(dataset_info) + download_dataset(dataset_info) - create_update_date_file(dataset_info) + create_update_date_file(dataset_info) return dataset_info diff --git a/fedivertex/exceptions.py b/fedivertex/exceptions.py index 21c4045..7858568 100644 --- a/fedivertex/exceptions.py +++ b/fedivertex/exceptions.py @@ -6,5 +6,9 @@ class DownloadError(FedivertexException): pass +class CacheError(FedivertexException): + pass + + class InteractionError(FedivertexException): pass diff --git a/fedivertex/main.py b/fedivertex/main.py index 24f2a4e..e90fa9e 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -29,8 +29,8 @@ class GraphLoader: } UNDIRECTED_GRAPHS = ["federation"] - def __init__(self, light_version=True, cache_dir=None): - self.DATASET_INFO = load_dataset(light_version, cache_dir) + def __init__(self, light_version=True, cache_dir=None, cache_only=False): + self.DATASET_INFO = load_dataset(light_version, cache_dir, cache_only) def _graph_dir(self, software: str, graph_type: str, date: str) -> Path: return self.DATASET_INFO.dataset_dir / software / graph_type / 
date diff --git a/tests/test_basic.py b/tests/test_basic.py index 341a3b5..03b21bc 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -39,15 +39,6 @@ def test_available_dates(): assert loader._fetch_latest_date("peertube", "follow") == peertube_dates[-1] -def test_index_selection(): - loader = GraphLoader() - - assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" - - latest_date = loader._fetch_latest_date("peertube", "follow") - assert loader._fetch_date_index("peertube", "follow", -1) == latest_date - - def test_get_temporal_graph(): loader = GraphLoader() diff --git a/tests/test_consistency.py b/tests/test_consistency.py index 74a1fa8..a66b60e 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -3,20 +3,29 @@ from fedivertex import GraphLoader -def _iter_software_graph(): +def test_index_selection(): loader = GraphLoader() + + assert loader._fetch_date_index("peertube", "follow", 0) == "20250203" + + latest_date = loader._fetch_latest_date("peertube", "follow") + assert loader._fetch_date_index("peertube", "follow", -1) == latest_date + + +def _iter_software_graph(): + loader = GraphLoader( + cache_only=True + ) # Avoids to fetch the metadata again and again for software, graph_types in loader.VALID_GRAPH_TYPES.items(): - if software == "mastodon": - continue for graph_type in graph_types: - if graph_type == "federation": + if software == "mastodon" and graph_type == "federation": continue yield software, graph_type @pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) def test_get_graph_selection(software, graph_type): - loader = GraphLoader() + loader = GraphLoader(cache_only=True) date = loader._fetch_latest_date(software, graph_type) @@ -51,12 +60,10 @@ def test_get_graph_selection(software, graph_type): def _iter_software_graph_date(): - loader = GraphLoader() + loader = GraphLoader(cache_only=True) for software, graph_types in loader.VALID_GRAPH_TYPES.items(): - if software == 
"mastodon": - continue for graph_type in graph_types: - if graph_type == "federation": + if graph_type == "federation": # Because we want directed graphs continue for date in loader.list_available_dates(software, graph_type): yield software, graph_type, date @@ -64,7 +71,7 @@ def _iter_software_graph_date(): @pytest.mark.parametrize("software,graph_type,date", list(_iter_software_graph_date())) def test_get_graph_sizes(software, graph_type, date): - loader = GraphLoader() + loader = GraphLoader(cache_only=True) graph = loader.get_graph(software, graph_type, date=date) csv_file = ( @@ -84,7 +91,7 @@ def test_get_graph_sizes(software, graph_type, date): def test_graph_consistency(): - loader = GraphLoader() + loader = GraphLoader(cache_only=True) # Check graph consistency peertube_graph = loader.get_graph("peertube", "follow", date="20250324") @@ -94,16 +101,16 @@ def test_graph_consistency(): # Check node attributes assert peertube_graph.nodes["aperi[DOT]tube"] == { "domain": "tube", - "totalUsers": 39, - "totalDailyActiveUsers": 0.0, - "totalWeeklyActiveUsers": 4.0, - "totalMonthlyActiveUsers": 8.0, - "totalLocalVideos": 638, - "totalVideos": 1287, - "totalLocalPlaylists": 26.0, - "totalVideoComments": 4632, - "totalLocalVideoComments": 44, - "totalLocalVideoViews": 106216, + "totalUsers": "39", + "totalDailyActiveUsers": "0.0", + "totalWeeklyActiveUsers": "4.0", + "totalMonthlyActiveUsers": "8.0", + "totalLocalVideos": "638", + "totalVideos": "1287", + "totalLocalPlaylists": "26.0", + "totalVideoComments": "4632", + "totalLocalVideoComments": "44", + "totalLocalVideoViews": "106216", "serverVersion": "7.1.0", } diff --git a/tests/test_errors.py b/tests/test_errors.py index 14f2c5a..3fbda7b 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -1,7 +1,11 @@ +from pathlib import Path +from turtle import clear + import pytest from fedivertex import GraphLoader -from fedivertex.exceptions import InteractionError +from fedivertex.cache import 
DEFAULT_CACHE_DIR, clear_default_cache +from fedivertex.exceptions import CacheError, InteractionError def test_list_error(): @@ -11,6 +15,31 @@ def test_list_error(): loader.list_graph_types("NON-EXISTING SOFTWARE") +def test_cache_only_errors(): + cache_path = Path(DEFAULT_CACHE_DIR) + assert cache_path.exists() + loader = GraphLoader(cache_only=True) + # No error because the cache exists + + # Cache corruption + update_file_path = loader.DATASET_INFO.dataset_dir / "last_update.txt" + update_file_path.unlink() + with open(update_file_path, "w") as update_file: + update_file.write("INVALID DATA") + + del loader + + assert cache_path.exists() + with pytest.raises(CacheError): # Corrupted cache + _loader = GraphLoader(cache_only=True) + + clear_default_cache() + + assert cache_path.exists() + with pytest.raises(CacheError): # Missing cache + _loader = GraphLoader(cache_only=True) + + def test_index_selection_error(): loader = GraphLoader() From 506c59510c3ccfdb7282473496cad9de11f93b12 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 12:18:05 +0200 Subject: [PATCH 08/14] Fix a bug due to missing cache --- tests/test_consistency.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_consistency.py b/tests/test_consistency.py index a66b60e..fdeeb37 100644 --- a/tests/test_consistency.py +++ b/tests/test_consistency.py @@ -13,9 +13,7 @@ def test_index_selection(): def _iter_software_graph(): - loader = GraphLoader( - cache_only=True - ) # Avoids to fetch the metadata again and again + loader = GraphLoader() for software, graph_types in loader.VALID_GRAPH_TYPES.items(): for graph_type in graph_types: if software == "mastodon" and graph_type == "federation": @@ -25,7 +23,9 @@ def _iter_software_graph(): @pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) def test_get_graph_selection(software, graph_type): - loader = GraphLoader(cache_only=True) + loader = GraphLoader( + cache_only=True + ) # Avoids 
to fetch the metadata again and again date = loader._fetch_latest_date(software, graph_type) From 2aa3920da052c173b5c8dc8b7cf0aa048e5ffba5 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 12:22:30 +0200 Subject: [PATCH 09/14] Fix a copy-paste error --- tests/test_errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_errors.py b/tests/test_errors.py index 3fbda7b..0be661d 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -35,7 +35,7 @@ def test_cache_only_errors(): clear_default_cache() - assert cache_path.exists() + assert not cache_path.exists() with pytest.raises(CacheError): # Missing cache _loader = GraphLoader(cache_only=True) From c10a34171c95b3afc044513aef7d025e8e58f116 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 13:47:45 +0200 Subject: [PATCH 10/14] Add docstrings --- fedivertex/cache.py | 110 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 2 deletions(-) diff --git a/fedivertex/cache.py b/fedivertex/cache.py index 55f0944..134bdaf 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -35,6 +35,15 @@ class CacheStatus(Enum): def read_last_update(filepath): + """Read the last update timestamp from a cache file. + + :param filepath: Path to the file containing the last update timestamp. + :type filepath: Path + :raises CacheError: if the file content is not a valid ISO datetime. + :return: Parsed datetime of the last update. + :rtype: datetime + """ + try: with open(filepath, "r", encoding="utf-8") as update_file: return datetime.fromisoformat(update_file.read()) @@ -43,6 +52,21 @@ def read_last_update(filepath): class DatasetInfo: + """Container for dataset-related paths and metadata. + + This class centralizes all information required to interact with the dataset, + including cache locations, download URLs, and last update timestamps. + + :param cache_dir: Root directory for the cache. 
+ :type cache_dir: Path + :param light_dataset: Whether to use the reduced version of the dataset. + :type light_dataset: bool + :param cache_only: If True, only local cache is used (no network requests). + :type cache_only: bool + :raises CacheError: if cache_only=True and no cache is available. + :raises DownloadError: if metadata cannot be retrieved from the remote source. + """ + def __init__(self, cache_dir: Path, light_dataset: bool, cache_only: bool): self.cache_root = cache_dir self.light_version = light_dataset @@ -78,6 +102,20 @@ def __init__(self, cache_dir: Path, light_dataset: bool, cache_only: bool): def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase + """Download a file from an HTTP endpoint with progress reporting. + + The file is first written to a temporary location and then atomically + renamed to avoid partial or corrupted downloads. + + :param url: URL of the file to download. + :type url: str + :param filepath: Destination path for the downloaded file. + :type filepath: Path + :raises requests.RequestException: if the HTTP request fails. + :return: None + :rtype: None + """ + response = requests.get( url, stream=True, @@ -105,6 +143,14 @@ def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML def clear_default_cache(): + """Remove the entire default cache directory. + + This deletes all cached datasets stored in the default cache location. + + :return: None + :rtype: None + """ + cache_dir = Path(DEFAULT_CACHE_DIR) if cache_dir.exists(): @@ -112,6 +158,15 @@ def clear_default_cache(): def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: + """Check whether the local cache is up-to-date with the remote dataset. + + :param dataset_info: Dataset information object. + :type dataset_info: DatasetInfo + :return: Cache status indicating whether the dataset is up-to-date, + outdated, absent, or corrupted. 
+ :rtype: CacheStatus + """ + update_file_path = dataset_info.dataset_dir / "last_update.txt" if update_file_path.exists(): @@ -123,7 +178,7 @@ def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: print("Cache found, checking for updates...") - if last_local_update >= datetime.fromisoformat(dataset_info.last_update): + if last_local_update >= dataset_info.last_update: print("Cache is up-to-date, no download necessary.") return CacheStatus.UPTODATE else: @@ -135,6 +190,19 @@ def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: def download_dataset(dataset_info: DatasetInfo): + """Download and extract the dataset into the cache directory. + + The dataset archive is downloaded, extracted, and normalized so that + the dataset directory has a stable name independent of versioning. + + :param dataset_info: Dataset information object. + :type dataset_info: DatasetInfo + :raises requests.RequestException: if the download fails. + :raises zipfile.BadZipFile: if the archive is invalid. + :return: None + :rtype: None + """ + archive_path = dataset_info.cache_root / "archive.zip" download_from_http(dataset_info.data_url, archive_path) @@ -153,15 +221,38 @@ def download_dataset(dataset_info: DatasetInfo): def create_update_date_file(dataset_info: DatasetInfo): + """Write the dataset last update timestamp to the cache. + + :param dataset_info: Dataset information object. + :type dataset_info: DatasetInfo + :return: None + :rtype: None + """ + update_file_path = dataset_info.dataset_dir / "last_update.txt" with open(update_file_path, "w", encoding="utf-8") as update_file: - update_file.write(dataset_info.last_update) + update_file.write(dataset_info.last_update.isoformat()) def init_cache( light_dataset: bool, cache_dir: Optional[Path | str] = None, cache_only=False ) -> DatasetInfo: + """Initialize dataset cache metadata without downloading data. 
+ + This function prepares the cache directory and returns a DatasetInfo + object describing the dataset configuration. + + :param light_dataset: Whether to use the reduced dataset version. + :type light_dataset: bool + :param cache_dir: Optional custom cache directory. + :type cache_dir: Optional[Path | str] + :param cache_only: If True, only local cache is used (no network requests). + :type cache_only: bool + :return: Dataset information object. + :rtype: DatasetInfo + """ + if cache_dir is None: cache_dir = DEFAULT_CACHE_DIR cache_dir = Path(cache_dir) @@ -173,6 +264,21 @@ def init_cache( def load_dataset( light_dataset: bool, cache_dir: Optional[Path | str] = None, cache_only=False ) -> DatasetInfo: + """Ensure the dataset is available locally and up-to-date. + + This function checks the cache status and downloads the dataset if necessary, + unless cache_only is set to True. + + :param light_dataset: Whether to use the reduced dataset version. + :type light_dataset: bool + :param cache_dir: Optional custom cache directory. + :type cache_dir: Optional[Path | str] + :param cache_only: If True, only local cache is used (no download allowed). + :type cache_only: bool + :return: Dataset information object pointing to the local dataset. + :rtype: DatasetInfo + """ + dataset_info = init_cache(light_dataset, cache_dir, cache_only) if not cache_only: From 88585908a9e76330f6e82316ae8706ed0fd9d51f Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 13:52:57 +0200 Subject: [PATCH 11/14] Fix typo in the README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0ddf9c4..e85ee47 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This Python package provides a simple interface to interact with Fedivertex: htt Our package automatically downloads the dataset from Kaggle and loads graphs in a usable format (i.e., NetworkX). 
The Fediverse Graph dataset provides graphs for different decentralized social media. -These graphs represents the interactions between servers in these decentralized social media. +These graphs model the interactions between servers in these decentralized social media. The graph type corresponds to the type of interactions modelled by the graph. Finally, the dataset provides the graphs obtained on different dates, so the users can analyze the evolution of the interactions. From c860ebab4c8db1da1e4ebff78ded32da126ec32c Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 13:56:17 +0200 Subject: [PATCH 12/14] Fix a test failure due to date format --- fedivertex/cache.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fedivertex/cache.py b/fedivertex/cache.py index 134bdaf..e2808f8 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -46,7 +46,7 @@ def read_last_update(filepath): try: with open(filepath, "r", encoding="utf-8") as update_file: - return datetime.fromisoformat(update_file.read()) + return update_file.read() except ValueError: raise CacheError("Cache corrupted (invalid update date), download necessary.") @@ -171,14 +171,16 @@ def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: if update_file_path.exists(): try: - last_local_update = read_last_update(update_file_path) + last_local_update = datetime.fromisoformat( + read_last_update(update_file_path) + ) except CacheError as err: print(str(err)) return CacheStatus.CORRUPTED print("Cache found, checking for updates...") - if last_local_update >= dataset_info.last_update: + if last_local_update >= datetime.fromisoformat(dataset_info.last_update): print("Cache is up-to-date, no download necessary.") return CacheStatus.UPTODATE else: @@ -232,7 +234,7 @@ def create_update_date_file(dataset_info: DatasetInfo): update_file_path = dataset_info.dataset_dir / "last_update.txt" with open(update_file_path, "w", encoding="utf-8") as update_file: - 
update_file.write(dataset_info.last_update.isoformat()) + update_file.write(dataset_info.last_update) def init_cache( From 4f6096c35dd613e284ad3de0c4432fac1bf87107 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 14:02:08 +0200 Subject: [PATCH 13/14] Fixed invalid cache detection --- fedivertex/cache.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fedivertex/cache.py b/fedivertex/cache.py index e2808f8..8947430 100644 --- a/fedivertex/cache.py +++ b/fedivertex/cache.py @@ -46,7 +46,7 @@ def read_last_update(filepath): try: with open(filepath, "r", encoding="utf-8") as update_file: - return update_file.read() + return datetime.fromisoformat(update_file.read()) except ValueError: raise CacheError("Cache corrupted (invalid update date), download necessary.") @@ -90,7 +90,7 @@ def __init__(self, cache_dir: Path, light_dataset: bool, cache_only: bool): f"Could not retrieve dataset metadata (Invalid status {resp.status_code})" ) metadata = resp.json() - self.last_update = metadata["dateModified"] + date = metadata["dateModified"] except requests.RequestException as err: raise DownloadError( f"Could not retrieve dataset metadata ({str(err)})" @@ -100,6 +100,13 @@ def __init__(self, cache_dir: Path, light_dataset: bool, cache_only: bool): "Could not retrieve dataset metadata (Missing 'dateModified' in the metadata)" ) from err + try: + self.last_update = datetime.fromisoformat(date) + except ValueError as err: + raise DownloadError( + f"Could not retrieve dataset date (Invalid format '{date}')" + ) from err + def download_from_http(url: str, filepath: Path): # Inspired from Croissant ML codebase """Download a file from an HTTP endpoint with progress reporting. 
@@ -171,16 +178,14 @@ def check_for_update(dataset_info: DatasetInfo) -> CacheStatus: if update_file_path.exists(): try: - last_local_update = datetime.fromisoformat( - read_last_update(update_file_path) - ) + last_local_update = read_last_update(update_file_path) except CacheError as err: print(str(err)) return CacheStatus.CORRUPTED print("Cache found, checking for updates...") - if last_local_update >= datetime.fromisoformat(dataset_info.last_update): + if last_local_update >= dataset_info.last_update: print("Cache is up-to-date, no download necessary.") return CacheStatus.UPTODATE else: @@ -234,7 +239,7 @@ def create_update_date_file(dataset_info: DatasetInfo): update_file_path = dataset_info.dataset_dir / "last_update.txt" with open(update_file_path, "w", encoding="utf-8") as update_file: - update_file.write(dataset_info.last_update) + update_file.write(dataset_info.last_update.isoformat()) def init_cache( From d4bde3062b60b4deeb214aa7032426d7b77eb80a Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 5 May 2026 14:02:31 +0200 Subject: [PATCH 14/14] Remove unused import --- tests/test_errors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_errors.py b/tests/test_errors.py index 0be661d..5f36215 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -1,5 +1,4 @@ from pathlib import Path -from turtle import clear import pytest