Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions perfkitbenchmarker/configs/container_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ def __init__(
self.vm_spec: virtual_machine_spec.BaseVmSpec
self.machine_families: list[str] | None
self.sandbox_config: SandboxSpec | None
self.swap_config: SwapConfigSpec | None

@classmethod
def _GetOptionDecoderConstructions(cls):
Expand All @@ -259,6 +260,7 @@ def _GetOptionDecoderConstructions(cls):
option_decoders.ListDecoder,
{'item_decoder': option_decoders.StringDecoder(), 'default': None},
),
'swap_config': (_SwapConfigDecoder, {'default': None, 'none_ok': True}),
'vm_count': (
option_decoders.IntDecoder,
{'default': _DEFAULT_VM_COUNT, 'min': 0},
Expand Down Expand Up @@ -393,6 +395,93 @@ def Decode(self, value, component_full_name, flag_values):
)


# Backing stores accepted per cloud. Each provider validates the value itself
# (this shared spec does not), so adding a cloud doesn't require editing it.
#   GCP/GKE : 'local_ssd' (ephemeral local SSD) | 'boot_disk'
#   AWS/EKS : 'instance_store' (NVMe, Nitro-encrypted) | 'io2'
# NOTE: the default below is a GKE value. The EKS provider rejects anything
# other than 'instance_store' or 'io2', so EKS configs must set backing_store
# explicitly.
_DEFAULT_SWAP_BACKING_STORE = 'local_ssd'


class SwapConfigSpec(spec.BaseSpec):
  """Declarative swap configuration for a node pool.

  Swap is modeled as a node-pool property that workloads can request, instead
  of requiring benchmarks to configure it at runtime. Providers translate these
  cloud-neutral fields into their native mechanisms:

    GKE -> linuxConfig.swapConfig passed via
           `gcloud ... --system-config-from-file`. GKE provisions and encrypts
           the swap device itself.
    EKS -> kubelet swapBehavior in the nodeadm user-data, plus an
           instance-store or io2 device formatted at boot.

  Attributes:
    enabled: Whether swap is configured on the node pool.
    encrypted: Whether the swap device is encrypted. True uses GKE's default
      ephemeral keys or AWS Nitro encryption. False provides an unencrypted
      baseline.
    backing_store: Which device backs swap. The accepted values are
      cloud-specific and are validated by each provider, not by this spec.
    size_percent: Swap size as a percentage (1-100) of the backing store (GKE
      local-SSD profile). Ignored when size_gb is set.
    size_gb: Explicit swap size in GiB (used by EKS or boot-disk profiles), or
      None to fall back to size_percent / the provider default.
    behavior: kubelet memory-swap behavior, one of 'LimitedSwap' or 'NoSwap'.
      Defaults to LimitedSwap so scheduled pods can use the swap. Any other
      value is rejected when the config is decoded.
  """

  # Single source of truth for the defaults used by both __init__ and the
  # option decoders, so the two cannot drift apart.
  _DEFAULT_SIZE_PERCENT = 30
  _DEFAULT_BEHAVIOR = 'LimitedSwap'
  # The only kubelet memorySwap.swapBehavior values this spec accepts.
  _VALID_BEHAVIORS = ('LimitedSwap', 'NoSwap')

  def __init__(self, *args, **kwargs):
    # These initial values mirror the decoder defaults declared below.
    self.enabled: bool = False
    self.encrypted: bool = True
    self.backing_store: str = _DEFAULT_SWAP_BACKING_STORE
    self.size_percent: int = self._DEFAULT_SIZE_PERCENT
    self.size_gb: int | None = None
    self.behavior: str = self._DEFAULT_BEHAVIOR
    super().__init__(*args, **kwargs)

  @classmethod
  def _GetOptionDecoderConstructions(cls):
    """Returns the decoder class and constructor kwargs for each option.

    Returns:
      The parent's option-to-(decoder class, kwargs) mapping, extended with the
      swap-specific options.
    """
    result = super()._GetOptionDecoderConstructions()
    result.update({
        'enabled': (
            option_decoders.BooleanDecoder,
            {'default': False},
        ),
        'encrypted': (
            option_decoders.BooleanDecoder,
            {'default': True},
        ),
        # Deliberately a free-form string: each provider validates its own
        # set of backing stores.
        'backing_store': (
            option_decoders.StringDecoder,
            {'default': _DEFAULT_SWAP_BACKING_STORE},
        ),
        'size_percent': (
            option_decoders.IntDecoder,
            {'default': cls._DEFAULT_SIZE_PERCENT, 'min': 1, 'max': 100},
        ),
        'size_gb': (
            option_decoders.IntDecoder,
            {'default': None, 'none_ok': True, 'min': 1},
        ),
        # Unlike backing_store, the behavior values are cloud-neutral, so a
        # typo is rejected here at config-parse time instead of surfacing
        # later as a kubelet startup failure on the node.
        'behavior': (
            option_decoders.EnumDecoder,
            {
                'default': cls._DEFAULT_BEHAVIOR,
                'valid_values': cls._VALID_BEHAVIORS,
            },
        ),
    })
    return result


class _SwapConfigDecoder(option_decoders.TypeVerifier):
  """Decodes the swap_config option of a nodepool into a SwapConfigSpec."""

  def __init__(self, **kwargs):
    super().__init__(valid_types=(dict,), **kwargs)

  def Decode(self, value, component_full_name, flag_values):
    """Verifies the swap_config value and builds a SwapConfigSpec from it.

    Args:
      value: The raw swap_config value from the config: a dict of
        SwapConfigSpec options, or None.
      component_full_name: Name of the component being decoded; used to build
        the full option name passed to the spec.
      flag_values: Flag values forwarded to the SwapConfigSpec constructor.

    Returns:
      A SwapConfigSpec built from the dict, or None when value is None.
    """
    # Type verification is delegated to the parent decoder.
    super().Decode(value, component_full_name, flag_values)
    if value is None:
      # The option is registered with none_ok=True, so an explicit
      # `swap_config: null` reaches this point. Unpacking None with ** would
      # raise TypeError, so treat it as "no swap config".
      return None
    return SwapConfigSpec(
        self._GetOptionFullName(component_full_name),
        flag_values=flag_values,
        **value,
    )


class ContainerClusterSpec(spec.BaseSpec):
"""Spec containing info needed to create a container cluster.

Expand Down
136 changes: 136 additions & 0 deletions perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from urllib import parse

from absl import flags
import yaml
from perfkitbenchmarker import errors
from perfkitbenchmarker import provider_info
from perfkitbenchmarker import virtual_machine
Expand Down Expand Up @@ -208,8 +209,143 @@ def _RenderNodeGroupJson(
if nodepool.min_nodes != nodepool.max_nodes:
group_json['minSize'] = nodepool.min_nodes
group_json['maxSize'] = nodepool.max_nodes
if nodepool.swap_config and nodepool.swap_config.enabled:
self._AddSwapToNodeGroupJson(nodepool, group_json)
return group_json

# Device name the io2 swap volume is requested as (used as the eksctl
# `volumeName`). On Nitro the kernel exposes it as a /dev/nvme*n1 node instead,
# so the bootstrap script locates it by its EBS NVMe model string rather than
# by this name.
_EKS_IO2_SWAP_DEVICE = '/dev/sdf'
# Provisioned IOPS requested for the io2 swap volume. Currently fixed:
# swap_config has no iops field to override it (one could be added later).
_EKS_IO2_DEFAULT_IOPS = 16000

def _AddSwapToNodeGroupJson(
    self,
    nodepool: container.BaseNodePoolConfig,
    group_json: dict[str, Any],
) -> None:
  """Adds swap settings to an eksctl managed-nodegroup spec (AmazonLinux2023).

  EKS has no managed swap, so this provider renders the two pieces itself
  rather than leaving them to individual benchmarks:

    1. The swap device.
       - instance_store: the NVMe Instance Store already present on the
         instance, which is Nitro hardware-encrypted at rest; no extra volume
         or cryptsetup is added.
       - io2: an EBS io2 volume appended to eksctl `additionalVolumes`, with
         `volumeEncrypted` following swap_config.encrypted.
    2. The kubelet swap behavior, so scheduled pods may use swap. AL2023 uses
       nodeadm, so a nodeadm `NodeConfig` holding only the swap-related fields
       is emitted via `overrideBootstrapCommand`; eksctl prepends it and
       nodeadm merges it with the default config.

  The device is formatted and activated through `preBootstrapCommands`, which
  run before the kubelet starts, so swap is already on when the kubelet
  applies memorySwap.swapBehavior.

  Note: preBootstrapCommands has been reported as flaky on some AL2023 eksctl
  versions (eksctl-io/eksctl#7903). If that is hit, the same commands can be
  delivered through a custom launch-template userData script.

  Args:
    nodepool: Node pool whose swap_config is rendered. The caller is expected
      to have checked that swap_config is set and enabled.
    group_json: The eksctl nodegroup dict; mutated in place.

  Raises:
    errors.Config.InvalidValue: If backing_store is neither "instance_store"
      nor "io2".
  """
  swap_spec = nodepool.swap_config
  store = swap_spec.backing_store
  if store != 'instance_store' and store != 'io2':
    raise errors.Config.InvalidValue(
        'EKS swap backing_store must be "instance_store" or "io2"; got '
        f'{store!r}.'
    )

  # Step 1: pick the swap device and the shell snippet that locates it.
  if store == 'instance_store':
    if not swap_spec.encrypted:
      # Instance Store on Nitro is always encrypted at rest, so there is no
      # unencrypted variant; io2 with volumeEncrypted=false is the
      # unencrypted AWS baseline.
      logging.warning(
          '[eks swap] instance_store is always Nitro-encrypted; '
          'encrypted=False is ignored. Use backing_store=io2 for an '
          'unencrypted AWS baseline.'
      )
    locate_device_cmd = self._EksInstanceStoreDeviceResolverShell()
  else:
    io2_volume: dict[str, Any] = {
        'volumeName': self._EKS_IO2_SWAP_DEVICE,
        'volumeType': 'io2',
        # Falls back to 100 GiB when no explicit size was requested.
        'volumeSize': swap_spec.size_gb or 100,
        'volumeIOPS': self._EKS_IO2_DEFAULT_IOPS,
        # encrypted=False yields the unencrypted baseline.
        'volumeEncrypted': swap_spec.encrypted,
    }
    extra_volumes = group_json.setdefault('additionalVolumes', [])
    extra_volumes.append(io2_volume)
    locate_device_cmd = self._EksIo2DeviceResolverShell()

  # Step 2: format and enable the device before the kubelet starts. mkswap
  # takes the whole device: its size was set on the volume (io2) or is fixed
  # by the instance type (instance store). SWAP_DEV is exported by the
  # locate snippet, which therefore has to run first.
  activation_cmds = [locate_device_cmd]
  activation_cmds.append('mkswap "$SWAP_DEV"')
  activation_cmds.append('swapon "$SWAP_DEV"')
  activation_cmds.append('swapon --show')
  group_json.setdefault('preBootstrapCommands', []).extend(activation_cmds)

  # Step 3: kubelet swap behavior as a nodeadm NodeConfig fragment.
  kubelet_section = {
      'config': {'memorySwap': {'swapBehavior': swap_spec.behavior}},
      # The kubelet refuses to start with swap on unless failSwapOn is false.
      'flags': ['--fail-swap-on=false'],
  }
  node_config = {
      'apiVersion': 'node.eks.aws/v1alpha1',
      'kind': 'NodeConfig',
      'spec': {'kubelet': kubelet_section},
  }
  # sort_keys=False keeps apiVersion/kind ahead of spec in the rendered YAML.
  rendered_node_config = yaml.safe_dump(
      node_config, default_flow_style=False, sort_keys=False
  )
  group_json['overrideBootstrapCommand'] = rendered_node_config

  logging.info(
      '[eks swap] nodegroup %s: backing_store=%s encrypted=%s behavior=%s',
      nodepool.name,
      store,
      swap_spec.encrypted,
      swap_spec.behavior,
  )

def _EksInstanceStoreDeviceResolverShell(self) -> str:
  """Returns a shell one-liner exporting SWAP_DEV for instance-store swap.

  The snippet walks /dev/nvme*n1, reads each controller's model line with
  `nvme id-ctrl`, and picks the first device whose model mentions
  "Instance Storage". It exits with status 1 if none is found.

  Returns:
    The shell snippet as a single-line string.
  """
  # Per the model strings this code relies on: instance-store volumes report
  # 'Amazon EC2 NVMe Instance Storage', EBS volumes report
  # 'Amazon Elastic Block Store'.
  scan_loop = (
      'SWAP_DEV=$(for d in /dev/nvme*n1; do',
      '[ -e "$d" ] || continue;',
      'm=$(nvme id-ctrl "$d" 2>/dev/null | grep -i "mn " | head -1);',
      'echo "$m" | grep -qi "Instance Storage" && { echo "$d"; break; };',
      'done);',
  )
  fail_if_missing = (
      '[ -n "$SWAP_DEV" ] || { echo "no instance-store NVMe found" >&2;'
      ' exit 1; };'
  )
  return ' '.join((*scan_loop, fail_if_missing, 'export SWAP_DEV'))

def _EksIo2DeviceResolverShell(self) -> str:
  """Returns a shell one-liner exporting SWAP_DEV for the io2 swap volume.

  The snippet first records the root filesystem's parent device in ROOT, then
  walks /dev/nvme*n1, skips the root device, and picks the first remaining
  device whose `nvme id-ctrl` model line mentions "Elastic Block Store". It
  exits with status 1 if none is found.

  Returns:
    The shell snippet as a single-line string.
  """
  # The volume is requested as /dev/sdf but shows up as a /dev/nvme*n1 node on
  # Nitro, so it is matched by EBS model; the root device is excluded so the
  # OS disk is never chosen.
  find_root = 'ROOT=$(lsblk -no pkname "$(findmnt -no SOURCE /)" 2>/dev/null);'
  scan_loop = (
      'SWAP_DEV=$(for d in /dev/nvme*n1; do',
      '[ -e "$d" ] || continue;',
      'n=$(basename "$d");',
      '[ "$n" = "$ROOT" ] && continue;',
      'm=$(nvme id-ctrl "$d" 2>/dev/null | grep -i "mn " | head -1);',
      'echo "$m" | grep -qi "Elastic Block Store" && { echo "$d"; break; };',
      'done);',
  )
  fail_if_missing = (
      '[ -n "$SWAP_DEV" ] || { echo "no io2 EBS swap device found" >&2;'
      ' exit 1; };'
  )
  return ' '.join((find_root, *scan_loop, fail_if_missing, 'export SWAP_DEV'))

def _WriteJsonToFile(self, json_dict: dict[str, Any]) -> str:
"""Renders the given json dict to a file.

Expand Down
99 changes: 98 additions & 1 deletion perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,16 @@
import math
import os
import re
import tempfile
import typing
from typing import Any

from absl import flags
import yaml
from perfkitbenchmarker import errors
from perfkitbenchmarker import provider_info
from perfkitbenchmarker import virtual_machine_spec
from perfkitbenchmarker import vm_util
from perfkitbenchmarker.configs import container_spec as container_spec_lib
from perfkitbenchmarker.providers.gcp import flags as gcp_flags
from perfkitbenchmarker.providers.gcp import gce_disk
Expand Down Expand Up @@ -570,7 +573,15 @@ def _AddNodeParamsToCmd(
):
cmd.args.append('--enable-fast-socket')

if FLAGS.gke_node_system_config is not None:
# Node system config (kubelet/sysctl) and swap config both flow through
# --system-config-from-file. Swap takes precedence because it must inject
# linuxConfig.swapConfig + kubeletConfig.memorySwapBehavior. It merges any
# user-provided gke_node_system_config so both coexist.
if nodepool_config.swap_config and nodepool_config.swap_config.enabled:
cmd.flags['system-config-from-file'] = self._WriteSwapSystemConfigFile(
nodepool_config.swap_config, FLAGS.gke_node_system_config
)
elif FLAGS.gke_node_system_config is not None:
cmd.flags['system-config-from-file'] = FLAGS.gke_node_system_config

if nodepool_config.sandbox_config is not None:
Expand All @@ -585,6 +596,92 @@ def _AddNodeParamsToCmd(
cmd.flags['min-nodes'] = nodepool_config.min_nodes
cmd.flags['max-nodes'] = nodepool_config.max_nodes

def _WriteSwapSystemConfigFile(
    self,
    swap_config: container_spec_lib.SwapConfigSpec,
    base_config_path: str | None = None,
) -> str:
  """Builds the GKE node system-config file that enables managed swap.

  GKE provisions and encrypts the swap device itself when given
  linuxConfig.swapConfig. Privileged DaemonSets, in-pod dm-crypt, and
  cgroup memory.swap.max edits are not required. See:
  https://docs.cloud.google.com/kubernetes-engine/docs/how-to/node-memory-swap

  Args:
    swap_config: The declarative swap config for this node pool.
    base_config_path: Optional path to a user-supplied system-config YAML
      (gke_node_system_config). Its keys are kept; only
      linuxConfig.swapConfig is added or overwritten.

  Returns:
    Path to a temp YAML file suitable for `gcloud ... --system-config-from-file`.
    The file is not deleted by this method.

  Raises:
    errors.Config.InvalidValue: If backing_store is not "local_ssd" or
      "boot_disk", or if the base config (or its linuxConfig entry) is not a
      YAML mapping.
  """
  # Map the cloud-neutral backing store onto the GKE swap profile key.
  # NOTE: 'boot_disk' is boot-disk-backed managed swap, NOT a dedicated,
  # IOPS-provisioned hyperdisk swap device. GKE managed swap only backs onto
  # the boot disk / ephemeral / dedicated local SSD; a separately attached
  # hyperdisk swap volume would still need its own disk attach path.
  profile_keys = {
      'local_ssd': 'ephemeralLocalSsdProfile',
      'boot_disk': 'bootDiskProfile',
  }
  profile_key = profile_keys.get(swap_config.backing_store)
  if profile_key is None:
    raise errors.Config.InvalidValue(
        'GKE swap backing_store must be "local_ssd" or "boot_disk"; got '
        f'{swap_config.backing_store!r}.'
    )

  config: dict[str, Any] = {}
  if base_config_path:
    with open(base_config_path, encoding='utf-8') as f:
      loaded = yaml.safe_load(f)
    # An empty file loads as None; anything else must be a mapping so the
    # swap block can be merged into it.
    if loaded is not None and not isinstance(loaded, dict):
      raise errors.Config.InvalidValue(
          f'GKE node system config {base_config_path!r} must contain a YAML '
          f'mapping at the top level; got {type(loaded).__name__}.'
      )
    config = loaded or {}

  # A bare `linuxConfig:` key loads as None, which setdefault() would hand
  # back unchanged; replace it with a fresh mapping instead.
  linux_config = config.get('linuxConfig')
  if linux_config is None:
    linux_config = {}
    config['linuxConfig'] = linux_config
  elif not isinstance(linux_config, dict):
    raise errors.Config.InvalidValue(
        'linuxConfig in the GKE node system config must be a mapping; got '
        f'{type(linux_config).__name__}.'
    )

  swap_block: dict[str, Any] = {'enabled': True}

  # Each profile accepts swapSizeGib OR swapSizePercent (LinuxNodeConfig API);
  # an explicit size_gb wins over size_percent.
  if swap_config.size_gb:
    swap_block[profile_key] = {'swapSizeGib': swap_config.size_gb}
  else:
    swap_block[profile_key] = {'swapSizePercent': swap_config.size_percent}

  # Encrypted is GKE's default (ephemeral key). False disables encryption.
  if not swap_config.encrypted:
    swap_block['encryptionConfig'] = {'disabled': True}

  linux_config['swapConfig'] = swap_block

  # NOTE: we deliberately do NOT emit a kubelet swapBehavior key here. GKE's
  # managed swap applies LimitedSwap proportionally on its own, and the GKE
  # node system-config schema (LinuxNodeConfig) does not expose a kubelet
  # memory-swap field -- injecting an unknown key would make gcloud reject the
  # whole system-config. swap_config.behavior is honored on EKS, where the
  # standard k8s KubeletConfiguration (memorySwap.swapBehavior) must be set
  # explicitly. If a non-default behavior is requested on GKE, surface it
  # rather than silently dropping it.
  if swap_config.behavior and swap_config.behavior != 'LimitedSwap':
    logging.warning(
        'GKE manages swap behavior automatically (LimitedSwap); ignoring '
        'requested swap behavior %r.',
        swap_config.behavior,
    )

  # Serialize once and reuse the text for both the file and the log line.
  rendered = yaml.safe_dump(config)
  # delete=False: gcloud reads the file after this method returns. The
  # context manager still guarantees the handle is closed if the write fails.
  with tempfile.NamedTemporaryFile(
      mode='w',
      prefix='pkb_gke_swap_syscfg_',
      suffix='.yaml',
      delete=False,
      dir=vm_util.GetTempDir(),
  ) as tmp:
    tmp.write(rendered)
  logging.info(
      'GKE swap node system config written to %s:\n%s',
      tmp.name,
      rendered,
  )
  return tmp.name

def _PostCreate(self):
"""Waits for kube-dns to be available."""
super()._PostCreate()
Expand Down
Loading