From 115bb535b04347b85c64d7d5e58fd56667068555 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 12 Jun 2026 12:33:19 -0400 Subject: [PATCH 1/2] [Test] Add log line in get_similar_instance_types to facilitate troubleshooting. --- tests/integration-tests/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index 264ae518d7..fab51fb494 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -1074,6 +1074,7 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items # Now query for similar instances using filters paginator = ec2.get_paginator("describe_instance_types") similar_instances = [] + reached_max_items = False for page in paginator.paginate( Filters=[ @@ -1097,7 +1098,12 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items ): similar_instances.append(instance["InstanceType"]) if max_items and len(similar_instances) >= max_items: - return similar_instances + reached_max_items = True + break + if reached_max_items: + break + + logging.info(f"Retrieved instance types equivalent to {instance_type} in {region}: {similar_instances}") return similar_instances From 13bd11c664d2820a4c2bfef40dcb99721e92daae Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 12 Jun 2026 13:29:31 -0400 Subject: [PATCH 2/2] [Test] Make the retrieval of flexible instance types more resilient. In particular: 1. Retry the retrieval to be robust against transient failures (networking glitches or throttling). 2. In case of consistent failure, emit a warning log and fall back to the original instance type. 3. Sort the list of equivalent instance types so that multiple calls to the function always return the same result.
--- tests/integration-tests/conftest.py | 13 ++++++++++++- tests/integration-tests/utils.py | 13 +++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 3b86fb3d25..28abe9e17d 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -698,7 +698,18 @@ def inject_placement_group_settings(vpc_stack, instance, region, kwargs): def inject_flexible_instance_types_settings(instance, region, kwargs): - kwargs["flexible_instance_types"] = list({instance, *get_similar_instance_types(instance, region, 5)}) + flexible_instance_types = [instance] + try: + flexible_instance_types.extend(it for it in get_similar_instance_types(instance, region, 5) if it != instance) + except Exception: + logging.warning( + "Failed to retrieve instance types equivalent to %s in region %s. " + "Falling back to using only the original instance type %s.", + instance, + region, + instance, + ) + kwargs["flexible_instance_types"] = flexible_instance_types def inject_additional_image_configs_settings(image_config, request): diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index fab51fb494..752a17b8c9 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -1054,6 +1054,7 @@ def _get_gpu_spec(instance_type_data): return frozenset((gpu.get("Manufacturer", ""), gpu.get("Count", 0)) for gpu in gpu_info.get("Gpus", [])) +@retry(wait_fixed=seconds(10), stop_max_delay=minutes(1)) def get_similar_instance_types(instance_type: str, region: str = None, max_items: int = None): ec2 = boto3.client("ec2", region_name=region) @@ -1074,7 +1075,6 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items # Now query for similar instances using filters paginator = ec2.get_paginator("describe_instance_types") similar_instances = [] - reached_max_items = False for page in paginator.paginate( 
Filters=[ @@ -1097,11 +1097,12 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items and instance_inference_accelerators == target_inference_accelerators ): similar_instances.append(instance["InstanceType"]) - if max_items and len(similar_instances) >= max_items: - reached_max_items = True - break - if reached_max_items: - break + + # Sort before truncating so that multiple calls always return the same instance types in the same order, + # regardless of the order in which the API returns them. + similar_instances = sorted(similar_instances) + if max_items: + similar_instances = similar_instances[:max_items] logging.info(f"Retrieved instance types equivalent to {instance_type} in {region}: {similar_instances}")