GoogleCloudPlatform · ajaysundark · Jun 23, 2026
diff --git a/perfkitbenchmarker/configs/container_spec.py b/perfkitbenchmarker/configs/container_spec.py
@@ -243,6 +243,7 @@ def __init__(
     self.vm_spec: virtual_machine_spec.BaseVmSpec
     self.machine_families: list[str] | None
     self.sandbox_config: SandboxSpec | None
+    self.swap_config: SwapConfigSpec | None
 
   @classmethod
   def _GetOptionDecoderConstructions(cls):
@@ -259,6 +260,7 @@ def _GetOptionDecoderConstructions(cls):
             option_decoders.ListDecoder,
             {'item_decoder': option_decoders.StringDecoder(), 'default': None},
         ),
+        'swap_config': (_SwapConfigDecoder, {'default': None, 'none_ok': True}),
         'vm_count': (
             option_decoders.IntDecoder,
             {'default': _DEFAULT_VM_COUNT, 'min': 0},
@@ -393,6 +395,93 @@ def Decode(self, value, component_full_name, flag_values):
     )
 
 
+# Backing stores accepted per cloud. Each provider validates its own values, so
+# adding a cloud doesn't touch this spec. The default below is GKE-only.
+#   GCP/GKE : 'local_ssd' (ephemeral local SSD) | 'boot_disk'
+#   AWS/EKS : 'instance_store' (NVMe, Nitro-encrypted) | 'io2'
+_DEFAULT_SWAP_BACKING_STORE = 'local_ssd'
+
+
+class SwapConfigSpec(spec.BaseSpec):
+  """Declarative swap configuration for a node pool.
+
+  This treats swap as a node-pool property workloads can request, instead of
+  requiring benchmarks to configure it at runtime. Providers translate these
+  cloud-neutral fields into their native mechanisms:
+
+    GKE  -> linuxConfig.swapConfig passed via `gcloud ... --system-config-from-file`.
+            GKE provisions and encrypts the swap device itself.
+    EKS  -> kubelet swapBehavior in the nodeadm user-data, plus an
+            instance-store or io2 device formatted at boot.
+
+  Attributes:
+    enabled: Whether swap is configured on the node pool.
+    encrypted: Whether the swap device is encrypted. True uses GKE's default
+      ephemeral keys or AWS Nitro encryption. False provides an unencrypted baseline.
+    backing_store: Which device backs swap (see per-cloud values above). Not
+      validated here; each provider rejects values it does not support.
+    size_percent: Swap size as a percentage of the backing store (GKE local-SSD
+      profile). Ignored when size_gb is set.
+    size_gb: Explicit swap size in GiB (used by EKS or boot-disk profiles).
+    behavior: kubelet memory-swap behavior ('LimitedSwap' or 'NoSwap'). Default
+      is LimitedSwap so scheduled pods can use the swap. Any other value is
+      rejected while the config is decoded.
+  """
+
+  def __init__(self, *args, **kwargs):
+    # Declarations only: values are assigned from the option decoders below.
+    self.enabled: bool
+    self.encrypted: bool
+    self.backing_store: str
+    self.size_percent: int
+    self.size_gb: int | None
+    self.behavior: str
+    super().__init__(*args, **kwargs)
+
+  @classmethod
+  def _GetOptionDecoderConstructions(cls):
+    result = super()._GetOptionDecoderConstructions()
+    result.update({
+        'enabled': (option_decoders.BooleanDecoder, {'default': False}),
+        'encrypted': (option_decoders.BooleanDecoder, {'default': True}),
+        'backing_store': (
+            option_decoders.StringDecoder,
+            {'default': _DEFAULT_SWAP_BACKING_STORE},
+        ),
+        'size_percent': (
+            option_decoders.IntDecoder,
+            {'default': 30, 'min': 1, 'max': 100},
+        ),
+        'size_gb': (
+            option_decoders.IntDecoder,
+            {'default': None, 'none_ok': True, 'min': 1},
+        ),
+        'behavior': (
+            option_decoders.EnumDecoder,
+            {
+                'default': 'LimitedSwap',
+                'valid_values': ('LimitedSwap', 'NoSwap'),
+            },
+        ),
+    })
+    return result
+
+
+class _SwapConfigDecoder(option_decoders.TypeVerifier):
+  """Decodes a nodepool's swap_config dict into a SwapConfigSpec (or None)."""
+
+  def __init__(self, **kwargs):
+    super().__init__(valid_types=(dict,), **kwargs)
+
+  def Decode(self, value, component_full_name, flag_values):
+    """Returns a SwapConfigSpec, or None for an explicit `swap_config: null`."""
+    super().Decode(value, component_full_name, flag_values)
+    if value is None:  # Passed none_ok; `**None` below would raise.
+      return None
+    full_name = self._GetOptionFullName(component_full_name)
+    return SwapConfigSpec(full_name, flag_values=flag_values, **value)
+
+
 class ContainerClusterSpec(spec.BaseSpec):
   """Spec containing info needed to create a container cluster.
 

diff --git a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
@@ -30,6 +30,7 @@
 from urllib import parse
 
 from absl import flags
+import yaml
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import provider_info
 from perfkitbenchmarker import virtual_machine
@@ -208,8 +209,143 @@ def _RenderNodeGroupJson(
     if nodepool.min_nodes != nodepool.max_nodes:
       group_json['minSize'] = nodepool.min_nodes
       group_json['maxSize'] = nodepool.max_nodes
+    if nodepool.swap_config and nodepool.swap_config.enabled:
+      self._AddSwapToNodeGroupJson(nodepool, group_json)
     return group_json
 
+  # Device name requested for the io2 swap volume. On Nitro the kernel exposes
+  # it as a /dev/nvme*n1 node, so the bootstrap script resolves it by EBS model.
+  _EKS_IO2_SWAP_DEVICE = '/dev/sdf'
+  # Provisioned IOPS for the io2 swap volume. Fixed for now: swap_config has no
+  # iops field, so benchmarks cannot override this value yet.
+  _EKS_IO2_DEFAULT_IOPS = 16000
+
+  def _AddSwapToNodeGroupJson(
+      self,
+      nodepool: container.BaseNodePoolConfig,
+      group_json: dict[str, Any],
+  ) -> None:
+    """Wire swap into an eksctl managed-nodegroup spec (AmazonLinux2023).
+
+    EKS has no managed swap, so the provider (not each benchmark) adds:
+
+      1. The swap device.
+         - instance_store: The NVMe Instance Store is physically present and
+           Nitro hardware-encrypted at rest. No extra volume or cryptsetup needed.
+         - io2: An EBS io2 volume added via eksctl `additionalVolumes`. The
+           `volumeEncrypted` field follows swap_config.encrypted.
+      2. `preBootstrapCommands` that mkswap/swapon the device. They are meant
+         to run before the kubelet starts, but are reportedly flaky on some
+         AL2023 eksctl versions (eksctl-io/eksctl#7903); a launch-template
+         userData script is the fallback.
+      3. Kubelet swap behavior: a nodeadm `NodeConfig` (AL2023 uses nodeadm)
+         stored in `overrideBootstrapCommand`, replacing any earlier value.
+         Only swap fields are set; nodeadm is expected to merge the rest.
+
+    Args:
+      nodepool: Node pool config; swap_config must be set. size_percent is
+        unused; size_gb applies to io2 only (100 GiB when unset).
+      group_json: eksctl nodegroup dict; mutated in place.
+
+    Raises:
+      errors.Config.InvalidValue: backing_store is not instance_store or io2.
+    """
+    swap = nodepool.swap_config
+    if swap.backing_store not in ('instance_store', 'io2'):
+      raise errors.Config.InvalidValue(
+          'EKS swap backing_store must be "instance_store" or "io2"; got '
+          f'{swap.backing_store!r}.'
+      )
+
+    # ---- 1. swap device ----------------------------------------------------
+    if swap.backing_store == 'io2':
+      size_gb = swap.size_gb or 100
+      volume: dict[str, Any] = {
+          'volumeName': self._EKS_IO2_SWAP_DEVICE,
+          'volumeType': 'io2',
+          'volumeSize': size_gb,
+          'volumeIOPS': self._EKS_IO2_DEFAULT_IOPS,
+          # encrypted=False provides an unencrypted baseline. True uses KMS/Nitro.
+          'volumeEncrypted': swap.encrypted,
+      }
+      group_json.setdefault('additionalVolumes', []).append(volume)
+      device_resolver = self._EksIo2DeviceResolverShell()
+    else:  # instance_store
+      if not swap.encrypted:
+        # Instance Store on Nitro is always encrypted at rest, so the request
+        # cannot be honored; io2 is the only unencrypted AWS baseline.
+        logging.warning(
+            '[eks swap] instance_store is always Nitro-encrypted; '
+            'encrypted=False is ignored. Use backing_store=io2 for an '
+            'unencrypted AWS baseline.'
+        )
+      device_resolver = self._EksInstanceStoreDeviceResolverShell()
+
+    # ---- 2. device activation (runs before kubelet) ------------------------
+    # mkswap uses the whole device (io2: sized above; instance store: fixed).
+    group_json.setdefault('preBootstrapCommands', []).extend([
+        device_resolver,
+        # SWAP_DEV comes from the resolver; assumes all commands share a shell.
+        'mkswap "$SWAP_DEV"',
+        'swapon "$SWAP_DEV"',
+        'swapon --show',
+    ])
+
+    # ---- 3. kubelet swap behavior (nodeadm NodeConfig, merged by nodeadm) ---
+    nodeadm = {
+        'apiVersion': 'node.eks.aws/v1alpha1',
+        'kind': 'NodeConfig',
+        'spec': {
+            'kubelet': {
+                'config': {
+                    'memorySwap': {'swapBehavior': swap.behavior},
+                },
+                # failSwapOn must be false for the kubelet to start with swap on.
+                'flags': ['--fail-swap-on=false'],
+            },
+        },
+    }
+    group_json['overrideBootstrapCommand'] = yaml.safe_dump(
+        nodeadm, default_flow_style=False, sort_keys=False
+    )
+    logging.info(
+        '[eks swap] nodegroup %s: backing_store=%s encrypted=%s behavior=%s',
+        nodepool.name,
+        swap.backing_store,
+        swap.encrypted,
+        swap.behavior,
+    )
+
+  def _EksInstanceStoreDeviceResolverShell(self) -> str:
+    """Returns shell that exports SWAP_DEV as the first instance-store NVMe."""
+    # Nitro reports the model 'Amazon EC2 NVMe Instance Storage' for instance
+    # store and 'Amazon Elastic Block Store' for EBS, hence the grep below.
+    probe = 'm=$(nvme id-ctrl "$d" 2>/dev/null | grep -i "mn " | head -1); '
+    check = 'echo "$m" | grep -qi "Instance Storage" && { echo "$d"; break; }; '
+    scan = (
+        'SWAP_DEV=$(for d in /dev/nvme*n1; do [ -e "$d" ] || continue; '
+        f'{probe}{check}done); '
+    )
+    missing = 'echo "no instance-store NVMe found" >&2; exit 1; '
+    guard = '[ -n "$SWAP_DEV" ] || { ' + missing + '}; export SWAP_DEV'
+    return scan + guard
+
+  def _EksIo2DeviceResolverShell(self) -> str:
+    """Shell that exports SWAP_DEV = the attached io2 EBS volume's NVMe node."""
+    # Skips the root disk, then takes the first NVMe node whose model is EBS.
+    # NOTE(review): any other non-root EBS volume would match too -- confirm.
+    return (
+        'ROOT=$(lsblk -no pkname "$(findmnt -no SOURCE /)" 2>/dev/null); '
+        'SWAP_DEV=$(for d in /dev/nvme*n1; do '
+        '[ -e "$d" ] || continue; '
+        'n=$(basename "$d"); [ "$n" = "$ROOT" ] && continue; '
+        'm=$(nvme id-ctrl "$d" 2>/dev/null | grep -i "mn " | head -1); '
+        'echo "$m" | grep -qi "Elastic Block Store" && { echo "$d"; break; }; '
+        'done); '
+        '[ -n "$SWAP_DEV" ] || { echo "no io2 EBS swap device found" >&2; '
+        'exit 1; }; export SWAP_DEV'
+    )
+
   def _WriteJsonToFile(self, json_dict: dict[str, Any]) -> str:
     """Renders the given json dict to a file.
 

diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
@@ -18,13 +18,16 @@
 import math
 import os
 import re
+import tempfile
 import typing
 from typing import Any
 
 from absl import flags
+import yaml
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import provider_info
 from perfkitbenchmarker import virtual_machine_spec
+from perfkitbenchmarker import vm_util
 from perfkitbenchmarker.configs import container_spec as container_spec_lib
 from perfkitbenchmarker.providers.gcp import flags as gcp_flags
 from perfkitbenchmarker.providers.gcp import gce_disk
@@ -570,7 +573,15 @@ def _AddNodeParamsToCmd(
     ):
       cmd.args.append('--enable-fast-socket')
 
-    if FLAGS.gke_node_system_config is not None:
+    # Node system config (kubelet/sysctl) and swap config both flow through
+    # --system-config-from-file. Swap takes precedence because it must inject
+    # linuxConfig.swapConfig (no kubelet swap key is emitted on GKE). It merges
+    # any user-provided gke_node_system_config so both coexist.
+    if nodepool_config.swap_config and nodepool_config.swap_config.enabled:
+      cmd.flags['system-config-from-file'] = self._WriteSwapSystemConfigFile(
+          nodepool_config.swap_config, FLAGS.gke_node_system_config
+      )
+    elif FLAGS.gke_node_system_config is not None:
       cmd.flags['system-config-from-file'] = FLAGS.gke_node_system_config
 
     if nodepool_config.sandbox_config is not None:
@@ -585,6 +596,92 @@ def _AddNodeParamsToCmd(
       cmd.flags['min-nodes'] = nodepool_config.min_nodes
       cmd.flags['max-nodes'] = nodepool_config.max_nodes
 
+  def _WriteSwapSystemConfigFile(
+      self,
+      swap_config: container_spec_lib.SwapConfigSpec,
+      base_config_path: str | None = None,
+  ) -> str:
+    """Builds the GKE node system-config file that enables managed swap.
+
+    GKE provisions and encrypts the swap device itself when given
+    linuxConfig.swapConfig. Privileged DaemonSets, in-pod dm-crypt, and
+    cgroup memory.swap.max edits are not required. See:
+    https://docs.cloud.google.com/kubernetes-engine/docs/how-to/node-memory-swap
+
+    Args:
+      swap_config: The declarative swap config for this node pool.
+      base_config_path: Optional path to a user-supplied system-config YAML
+        (gke_node_system_config). Kept as-is except linuxConfig.swapConfig.
+
+    Returns:
+      Path to a temp YAML file suitable for `gcloud ... --system-config-from-file`.
+
+    Raises:
+      errors.Config.InvalidValue: If backing_store is not valid for GKE.
+    """
+    config: dict[str, Any] = {}
+    if base_config_path:
+      with open(base_config_path) as f:
+        config = yaml.safe_load(f) or {}
+
+    # A bare `linuxConfig:` key loads as None; replace it with a dict.
+    linux_config = config['linuxConfig'] = config.get('linuxConfig') or {}
+    swap_block: dict[str, Any] = {'enabled': True}
+
+    # Each profile accepts swapSizeGib OR swapSizePercent (LinuxNodeConfig API).
+    size_field = (
+        {'swapSizeGib': swap_config.size_gb}
+        if swap_config.size_gb
+        else {'swapSizePercent': swap_config.size_percent}
+    )
+    if swap_config.backing_store == 'local_ssd':
+      swap_block['ephemeralLocalSsdProfile'] = size_field
+    elif swap_config.backing_store == 'boot_disk':
+      # Boot-disk-backed managed swap. NOTE: this is NOT a dedicated,
+      # IOPS-provisioned hyperdisk swap device -- GKE managed swap only backs
+      # onto the boot disk / ephemeral / dedicated local SSD. A separately
+      # attached hyperdisk swap volume with provisioned IOPS still needs a disk
+      # attach path (see the PR-6776 review).
+      swap_block['bootDiskProfile'] = size_field
+    else:
+      raise errors.Config.InvalidValue(
+          'GKE swap backing_store must be "local_ssd" or "boot_disk"; got '
+          f'{swap_config.backing_store!r}.'
+      )
+
+    # Encrypted is GKE's default (ephemeral key). False disables encryption.
+    if not swap_config.encrypted:
+      swap_block['encryptionConfig'] = {'disabled': True}
+
+    linux_config['swapConfig'] = swap_block
+
+    # NOTE: no kubelet swapBehavior key is emitted. GKE's managed swap applies
+    # LimitedSwap on its own, and the node system-config schema (LinuxNodeConfig)
+    # exposes no kubelet memory-swap field, so an unknown key would make gcloud
+    # reject the whole file. swap_config.behavior is honored on EKS only; a
+    # non-default request on GKE is surfaced rather than silently dropped.
+    if swap_config.behavior and swap_config.behavior != 'LimitedSwap':
+      logging.warning(
+          'GKE manages swap behavior automatically (LimitedSwap); ignoring '
+          'requested swap behavior %r.',
+          swap_config.behavior,
+      )
+
+    # Dump once so the log matches the file; `with` closes it even on error.
+    rendered = yaml.safe_dump(config)
+    with tempfile.NamedTemporaryFile(
+        mode='w',
+        prefix='pkb_gke_swap_syscfg_',
+        suffix='.yaml',
+        delete=False,
+        dir=vm_util.GetTempDir(),
+    ) as tmp:
+      tmp.write(rendered)
+    logging.info(
+        'GKE swap node system config written to %s:\n%s', tmp.name, rendered
+    )
+    return tmp.name
+
   def _PostCreate(self):
     """Waits for kube-dns to be available."""
     super()._PostCreate()