diff --git a/CHANGELOG.md b/CHANGELOG.md index 81f64b333c..5f7949feed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ CHANGELOG - Fail `pcluster build-image` early when the downloaded cookbook version does not match the ParallelCluster CLI version. - Fix login nodes not mounting `/opt/parallelcluster/shared` when EFS is used as the internal shared storage type. - Fix an issue where compute nodes are incorrectly replaced when launching a large number of nodes due to eventual consistency. +- Fix an issue where starting the compute fleet may not reliably recover the cluster from protected mode. **DEPRECATIONS** - Amazon Linux 2 is no longer supported. diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index ad57043214..7a424a5853 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -379,13 +379,6 @@ def test_slurm_custom_partitions( assert_that(scheduler_commands.get_partition_state(partition=partition)).is_equal_to(expected_state) logging.info("Checking pcluster start...") - # Restore the protected failure count to its default before starting the fleet. The failing job left dynamic nodes - # still powering up when the fleet was stopped; with a short clustermgtd cleanup window they can survive stop/start - # with a stale nodeaddr. On the first poll after start, clustermgtd re-counts them as bootstrap failures and, with - # the lowered count of 2, re-enters protected mode, flipping the pcluster-managed partitions back to INACTIVE and - # making this test intermittently fail. The default count is high enough that the few leftover nodes cannot reach - # the threshold in a single poll. - set_protected_failure_count(remote_command_executor, 10) for partition in custom_partitions: scheduler_commands.set_partition_state(partition, "INACTIVE") cluster.start()