# Privileged benchmark DaemonSet for the swap_encryption benchmark.
#
# The pod mounts the host's /dev, /sys, /run, /proc (read-only, at
# /proc-host), /mnt/stateful_partition and /lib/modules, installs the
# benchmark tooling at start-up, unlocks cgroup swap limits under kubepods
# and then touches /tmp/pkb_ready before sleeping forever.
#
# Template variables: ds_name, ds_namespace, ds_label, benchmark_nodepool,
# image, kernel_version.
#
# NOTE: the script below lives in a YAML literal block scalar ("|"), so bash
# receives it verbatim.  Line continuations therefore use a SINGLE trailing
# backslash and printf escapes a single "\n"; a doubled backslash would reach
# bash as a literal "\" argument and end the command at the newline.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ ds_name }}
  namespace: {{ ds_namespace }}
  labels:
    app: {{ ds_label }}
spec:
  selector:
    matchLabels:
      app: {{ ds_label }}
  template:
    metadata:
      labels:
        app: {{ ds_label }}
    spec:
      hostPID: true
      hostNetwork: true
      # Pin to the benchmark nodepool — never schedule on the dummy default pool.
      nodeSelector:
        pkb_nodepool: {{ benchmark_nodepool }}
      tolerations:
        - operator: Exists
      containers:
        - name: benchmark
          image: {{ image }}
          command:
            - bash
            - -c
            - |
              echo "[pkb] Installing benchmark tools..."
              # Retry apt-get up to 3 times — transient network failures are
              # common on a freshly-started GKE node.  Critical tools (fio,
              # stress-ng) must be present before we write the ready sentinel;
              # a silent || true here would cause /tmp/pkb_ready to appear even
              # when tools are missing, breaking all subsequent phases.
              PKB_APT_OK=0
              for _attempt in 1 2 3; do
                apt-get update -qq 2>&1 || true
                DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
                  fio \
                  stress-ng \
                  sysstat \
                  cryptsetup \
                  mdadm \
                  redis-server \
                  redis-tools \
                  git \
                  wget \
                  curl \
                  make \
                  gcc \
                  bc \
                  flex \
                  bison \
                  libelf-dev \
                  libssl-dev \
                  cgroup-tools \
                  nvme-cli \
                  util-linux \
                  python3-pip \
                  libevent-dev \
                  libpcre3-dev \
                  zlib1g-dev \
                  build-essential \
                  autoconf \
                  automake \
                  libtool \
                  libtool-bin \
                  pkg-config \
                  python3-dev \
                  default-jre-headless \
                  2>&1 && PKB_APT_OK=1 && break
                echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
                sleep 15
              done
              if [ "$PKB_APT_OK" != "1" ] || \
                 ! command -v fio >/dev/null 2>&1 || \
                 ! command -v stress-ng >/dev/null 2>&1; then
                echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
                exit 1
              fi
              echo "[pkb] Installing memtier_benchmark from source..."
              # Pin a stable release tag — building from the moving default
              # branch (HEAD) intermittently broke (memtier_benchmark not found
              # → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the
              # version PKB's memtier package (memtier.MemtierResult.Parse) is
              # validated against and builds cleanly with the apt deps above.
              # Fall back to HEAD only if the tagged clone fails.
              if ! command -v memtier_benchmark >/dev/null 2>&1; then
                (cd /tmp && \
                 rm -rf memtier_benchmark && \
                 ( git clone --depth 1 --branch 2.2.1 \
                     https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \
                   git clone --depth 1 \
                     https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \
                 cd memtier_benchmark && \
                 autoreconf -ivf 2>&1 && \
                 ./configure 2>&1 && \
                 make -j$(nproc) 2>&1 && \
                 make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \
                  echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
              fi
              if command -v memtier_benchmark >/dev/null 2>&1; then
                echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
              fi
              echo "[pkb] Installing esrally (lightweight)..."
              # Opt out of PEP 668 "externally managed" protection through the
              # environment rather than the --break-system-packages flag: pip
              # releases that predate the flag reject it as an unknown option,
              # whereas an unknown PIP_* setting is simply ignored.
              export PIP_BREAK_SYSTEM_PACKAGES=1
              python3 -m pip install --upgrade pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
              pip3 install elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
                pip3 install esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
                echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
              if command -v esrally >/dev/null 2>&1; then
                echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
              else
                echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
              fi
              echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
              # Phase 3c needs a real search server on :9200. Nothing in apt
              # ships one and the pod has no systemd, so install the OpenSearch
              # bundle (ships its own JDK) and launch the binary directly in the
              # phase. All best-effort: if any step fails the phase probes the
              # endpoint and skips cleanly rather than recording fake timings.
              if [ ! -x /opt/opensearch/bin/opensearch ]; then
                OS_VER=2.15.0
                (cd /opt && \
                 wget -q --timeout=600 -O os.tgz \
                   "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \
                 tar -xzf os.tgz && rm -f os.tgz && \
                 mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \
                  echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
              fi
              if [ -x /opt/opensearch/bin/opensearch ]; then
                # pkbos owns and runs OpenSearch (it refuses to run as root).
                # Give it a home so HOME/temp paths are writable.
                id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
                printf 'discovery.type: single-node\nnetwork.host: 127.0.0.1\nplugins.security.disabled: true\n' \
                  > /opt/opensearch/config/opensearch.yml
                mkdir -p /opt/opensearch/config/jvm.options.d
                # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
                # On a 252 GB node this still leaves plenty of page cache to
                # pressure into swap during the phase.
                printf -- '-Xms2g\n-Xmx2g\n' \
                  > /opt/opensearch/config/jvm.options.d/pkb-heap.options
                sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
                # CRITICAL: never run the binary as root here (it bails and
                # leaves root-owned files in logs/ that block the pkbos server).
                # Clear any stale logs and chown everything to pkbos LAST.
                rm -f /opt/opensearch/logs/* 2>/dev/null || true
                chown -R pkbos /opt/opensearch 2>/dev/null || true
                echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
              fi
              echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
              PKB_KVER="{{ kernel_version }}"
              PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
              PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
              PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
              PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
              mkdir -p "$PKB_KROOT"
              if [ ! -f "$PKB_KTARBALL" ]; then
                # wget -O creates the output file even when the transfer
                # fails.  PKB_KROOT is on the host's stateful partition, so a
                # truncated tarball would survive pod restarts and make the
                # "-f" guard above skip every later download.  Remove it.
                wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || {
                  rm -f "$PKB_KTARBALL"
                  echo "[pkb] WARNING: kernel tarball download failed" >&2
                }
              fi
              if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
                echo "[pkb] Extracting kernel source (xz)..."
                # Same reasoning: a half-extracted tree would satisfy the
                # "-d" guard on the next start and never be repaired.
                tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || {
                  rm -rf "$PKB_KSRC"
                  echo "[pkb] WARNING: kernel source extraction failed" >&2
                }
              fi
              echo "[pkb] Unlocking container cgroup swap limits..."
              # GKE cgroup v2 sets memory.swap.max=0 per-container, which
              # prevents swap usage even when the node has a swap device and
              # vm.swappiness>0.  Stress-ng gets OOM-killed in ~15s because
              # the kernel can't page out to swap for this cgroup.
              #
              # NOTE: the old approach derived the cgroup path from
              # /proc/self/cgroup, but inside a cgroup namespace that reports
              # "0::/" — so the write targeted the host ROOT cgroup, silently
              # no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
              # /sys is the host cgroup tree (hostPath mount) and this pod is
              # privileged, so instead unlock swap across the entire kubepods
              # hierarchy, which is guaranteed to contain our own container.
              if [ -d /sys/fs/cgroup/kubepods.slice ] || \
                 [ -d /sys/fs/cgroup/kubepods ]; then
                # cgroup v2: write 'max' to every memory.swap.max under kubepods*.
                find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
                  2>/dev/null | while read -r _f; do
                  echo max > "$_f" 2>/dev/null || true
                done
              fi
              # Best-effort: our own namespaced path and the unified root.
              PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
                2>/dev/null)
              for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
                          /sys/fs/cgroup/memory.swap.max; do
                [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
              done
              # cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
              find /sys/fs/cgroup/memory -path '*kubepods*' \
                -name memory.memsw.limit_in_bytes 2>/dev/null \
                | while read -r _f; do
                echo -1 > "$_f" 2>/dev/null || true
              done
              # Verify and surface the result in the pod log. grep -L lists
              # files that do NOT contain 'max' on their first line, i.e. ones
              # still capping swap.
              PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
                -name memory.swap.max 2>/dev/null \
                | xargs -r grep -L '^max' 2>/dev/null | head -1)
              if [ -n "$PKB_STILL_CAPPED" ]; then
                echo "[pkb] WARNING: cgroup swap still capped at \
              $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
              OOM-killed before swap is exercised" >&2
              else
                echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
              fi
              echo "[pkb] Tools installed. Writing ready sentinel."
              touch /tmp/pkb_ready
              sleep infinity
          securityContext:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN", "IPC_LOCK"]
          resources:
            requests:
              memory: "512Mi"
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: dev
              mountPath: /dev
            - name: sys
              mountPath: /sys
            - name: run
              mountPath: /run
            - name: proc-host
              mountPath: /proc-host
              readOnly: true
            - name: stateful-partition
              mountPath: /mnt/stateful_partition
            - name: lib-modules
              mountPath: /lib/modules
              readOnly: true
      volumes:
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: run
          hostPath:
            path: /run
        - name: proc-host
          hostPath:
            path: /proc
        - name: stateful-partition
          hostPath:
            path: /mnt/stateful_partition
            type: DirectoryOrCreate
        - name: lib-modules
          hostPath:
            path: /lib/modules
            type: Directory
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark. + +Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws + +== Architecture == + +Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's +container_cluster abstraction, then deploys a privileged DaemonSet whose +pod has full host-device access (/dev, /sys, hostPID). All benchmark +phases execute inside this pod via kubectl exec, so measurements reflect +actual cluster-node behaviour including Kubernetes overhead (kubelet, +containerd cgroup hierarchy, etc.). + + GKE nodes ── dm-crypt with ephemeral key (go/node:swap-encryption) + swap device: /dev/mapper/swap_encrypted (over dedicated + hyperdisk or LSSD RAID-0 /dev/md0). + Single-disk fallback: plain loop device on + /mnt/stateful_partition — dm-crypt is blocked by COS + kernel namespace restrictions from inside a pod. + + EKS nodes ── NVMe Instance Store, Nitro hardware-offloaded encryption + swap device: /dev/nvme1n1 (or auto-detected) + +== Resource pattern == + +Infrastructure lifecycle lives in two BaseResource subclasses: + + _Create(): gcloud container node-pools create with linuxConfig.swapConfig + + sysctl via --system-config-from-file; waits for node Ready; + optionally creates and attaches a dedicated swap disk. + _Delete(): detach+delete disk; delete the nodepool. + DeleteDefaultPool(): remove the dummy e2-medium default pool after the + DaemonSet pod is Running (separate step to avoid API-server + contention during nodepool ops). 
+ + SwapDaemonSet (perfkitbenchmarker/resources/container_service/swap_daemonset.py) + _Create(): apply Jinja2 manifest; wait for Running + /tmp/pkb_ready. + _Delete(): in-pod swapoff / dmsetup / losetup teardown; kubectl delete. + PodExec(): kubectl exec wrapper with transient-reset retry, OOM-kill + detection (rc=137), and automatic pod recovery. + +Both resources are added to spec.resources in Prepare() and are auto-deleted +by the PKB framework in Cleanup(). + +== Benchmark Phases == + + Phase 1 – fio Microbenchmarks + Run fio directly on the swap block device (swapoff first) to measure + the hardware + encryption ceiling: random IOPS (4K), sequential + bandwidth (1M), and completion latency (iodepth=1). + + Phase 2a – CPU Overhead + stress-ng drives sustained swap I/O; vmstat and pidstat capture + swap-in/out rates and per-process CPU cost (kswapd, kcryptd, + dm-crypt threads on GKE; Nitro offload on EKS). + + Phase 2b – I/O Interference + Baseline fio on a scratch volume → re-run with concurrent swap + pressure. IOPS/latency delta = storage contention cost. + + Phase 3a – Redis Latency + Dataset loaded beyond container memory limit → GET/SET p99 latency + measured while kernel swaps pages. + + Phase 3b – Kernel Build + Linux compiled inside a memory-capped cgroup; slowdown ratio vs + unconstrained baseline. + + Phase 3c – OpenSearch + Bulk-index + search query under swap pressure (esrally or curl). 
+""" + +import json +import logging +import textwrap +import time +from typing import Any + +from absl import flags +from perfkitbenchmarker import benchmark_spec as bm_spec_lib +from perfkitbenchmarker import configs +from perfkitbenchmarker import errors +from perfkitbenchmarker import sample +from perfkitbenchmarker import vm_util +from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod + +FLAGS = flags.FLAGS + +_BenchmarkSpec = bm_spec_lib.BenchmarkSpec + +# --------------------------------------------------------------------------- +# Benchmark identity +# --------------------------------------------------------------------------- + +BENCHMARK_NAME = 'swap_encryption' + + +BENCHMARK_CONFIG = """ +swap_encryption: + description: > + fio microbenchmarks (Tier 1) on swap-encrypted GKE/EKS nodes. Swap-enabled 'benchmark' nodepool declared in BENCHMARK_CONFIG; + GKE cluster creation applies --system-config-from-file (dm-crypt swapConfig) + automatically via swap_config field on NodepoolSpec. + container_cluster: + cloud: GCP + type: Kubernetes + vm_count: 1 + vm_spec: + GCP: + machine_type: e2-medium + boot_disk_size: 20 + zone: us-central1-a + nodepools: + benchmark: + vm_count: 1 + vm_spec: + GCP: + machine_type: n4-highmem-32 + boot_disk_type: hyperdisk-balanced + boot_disk_size: 500 + zone: us-central1-a + swap_config: + enabled: true + swappiness: 100 + min_free_kbytes: 200 + watermark_scale_factor: 500 + boot_disk_iops: 160000 + boot_disk_throughput: 2400 +""" + + +_SWAP_DEVICE = flags.DEFINE_string( + 'swap_encryption_device', + '', + 'Explicit swap block-device path on the cluster node, e.g. ' + '/dev/nvme1n1 or /dev/dm-0. When empty the benchmark auto-detects ' + 'via /proc/swaps after setup.', +) + + +_SWAP_SIZE_GB = flags.DEFINE_integer( + 'swap_encryption_swap_size_gb', + 32, + 'Size in GB of the swap space to configure on the node. 
# ---------------------------------------------------------------------------
# Flags
#
# Integer flags whose values are interpolated into shell / gcloud commands
# carry a lower bound so that nonsensical (negative) values are rejected at
# flag-parse time rather than surfacing as an obscure in-pod failure.
# ---------------------------------------------------------------------------

_SWAP_SIZE_GB = flags.DEFINE_integer(
    'swap_encryption_swap_size_gb',
    32,
    'Size in GB of the swap space to configure on the node. '
    'Ignored when a ready swap device already exists.',
    lower_bound=1,
)


# The help text documents a closed set of values, so enforce it.
_SWAP_TYPE = flags.DEFINE_enum(
    'swap_encryption_swap_type',
    'hyperdisk',
    ['hyperdisk', 'lssd', 'boot_disk', 'instance_store', 'io2'],
    'Storage target for the swap device. One of: hyperdisk (default), '
    'lssd, boot_disk, instance_store, io2.',
)


_FIO_RUNTIME_SEC = flags.DEFINE_integer(
    'swap_encryption_fio_runtime_sec',
    60,
    'Wall-clock runtime in seconds for each individual fio job.',
    lower_bound=1,
)


# absl does not %-format help strings, so a literal percent sign is written
# once (a doubled "%%" would be shown verbatim in --help output).
_ENABLE_ZSWAP = flags.DEFINE_boolean(
    'swap_encryption_enable_zswap',
    False,
    'Enable zswap (lz4 compressor, 20% max pool) before running tests.',
)


_MIN_FREE_KBYTES = flags.DEFINE_integer(
    'swap_encryption_min_free_kbytes',
    65536,
    'Value written to /proc/sys/vm/min_free_kbytes to trigger earlier '
    'swapping. Set 0 to leave the kernel default unchanged.',
    lower_bound=0,
)


_DAEMONSET_IMAGE = flags.DEFINE_string(
    'swap_encryption_daemonset_image',
    'ubuntu:22.04',
    'Container image used for the privileged benchmark DaemonSet pod.',
)


_NODEPOOL = flags.DEFINE_string(
    'swap_encryption_nodepool',
    'benchmark',
    'Name of the node pool to deploy the benchmark DaemonSet on.',
)


_INSTANCE_SIZE_LABEL = flags.DEFINE_string(
    'swap_encryption_instance_size_label',
    '',
    'Human-readable label for the current instance size being tested, e.g. '
    '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that '
    'results from multiple PKB runs across different instance sizes can be '
    'collated and compared. Defaults to the value reported by the cloud '
    'metadata endpoint inside the pod.',
)


_COLLECT_COST = flags.DEFINE_boolean(
    'swap_encryption_collect_cost',
    False,
    'When True, emit a cost_estimate_usd sample using on-demand pricing '
    'for the instance type detected at runtime.',
)


_IO2_ENCRYPTED = flags.DEFINE_boolean(
    'swap_encryption_io2_encrypted',
    True,
    'When True (default), the dedicated io2 swap volume is created with EBS '
    'encryption (Nitro/KMS) -> matrix row "io2 + hardware encryption". '
    'Set False for the unencrypted io2 baseline row. Only applies when '
    '--swap_encryption_swap_type=io2 on AWS/EKS.',
)


_IO2_KMS_KEY_ID = flags.DEFINE_string(
    'swap_encryption_io2_kms_key_id',
    '',
    'Optional KMS key id/ARN for the encrypted io2 volume. Empty = the '
    'account default aws/ebs key. Ignored unless io2_encrypted is True.',
)


_FAIL_ON_DEGRADED = flags.DEFINE_boolean(
    'swap_encryption_fail_on_degraded',
    True,
    'When True (default), raise an error at the end of Run() if the run was '
    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
    'swap-pressure phase was OOM-killed before completing. This prevents PKB '
    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
    'empty or meaningless data. Set False to keep the legacy behaviour of '
    'always returning whatever partial samples were collected.',
)


_PHASES = flags.DEFINE_list(
    'swap_encryption_phases',
    ['all'],
    'Which Run() phases to execute, for fast iteration against an '
    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). '
    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
    '3b (kernel build), 3c (opensearch). Default "all" runs everything. '
    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
    'Phases not listed are skipped and do not affect the degraded-run gate '
    '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
)


_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
    'swap_encryption_benchmark_machine_type',
    'n4-highmem-32',
    'Machine type for the benchmark nodepool created in Prepare(). '
    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
    '(LSSD RAID-0). The matching swap setup is selected automatically.',
)


_BENCHMARK_LSSD = flags.DEFINE_boolean(
    'swap_encryption_lssd',
    False,
    'Force LSSD RAID-0 swap path even when the machine type name does not '
    'contain "lssd". Auto-detected from machine type when False.',
)


_LSSD_COUNT = flags.DEFINE_integer(
    'swap_encryption_lssd_count',
    1,
    'Number of local NVMe SSDs to attach as raw block devices '
    '(--local-nvme-ssd-block count=N). Must match the fixed local SSD '
    'count for the chosen machine type: c4-standard-8-lssd=1, '
    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). '
    'Default 1 covers most single-lssd machine types.',
    lower_bound=0,
)


_ENABLE_DMCRYPT = flags.DEFINE_boolean(
    'swap_encryption_enable_dmcrypt',
    True,
    'When True (default), configure dm-crypt on the swap device — the '
    '"encryption enabled" column of the test matrix. Set False to use '
    'plain swap (encryption disabled column).',
)


_NODE_IMAGE_TYPE = flags.DEFINE_string(
    'swap_encryption_node_image_type',
    'UBUNTU_CONTAINERD',
    'GKE node image type for the benchmark nodepool. '
    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
    'down device-mapper at the kernel LSM level and cryptsetup hangs '
    'indefinitely from any pod context (even privileged, even via nsenter '
    'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup '
    'from privileged pods without restriction. '
    'Use COS_CONTAINERD only when dm-crypt is disabled '
    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. '
    'AL2 on EKS.',
)


_BOOT_DISK_TYPE = flags.DEFINE_string(
    'swap_encryption_boot_disk_type',
    'hyperdisk-balanced',
    'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced '
    'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 '
    'dev/test machines, which do not support hyperdisk-balanced.',
)


_BOOT_DISK_IOPS = flags.DEFINE_integer(
    'swap_encryption_boot_disk_iops',
    80000,
    'Provisioned IOPS for the boot disk (hyperdisk-balanced only). '
    '80 000 is the COS max-IOPS target. Ignored for pd-ssd.',
    lower_bound=0,
)


_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
    'swap_encryption_boot_disk_throughput',
    1200,
    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
    'only). Must be set together with iops. 1200 MB/s pairs with 80 000 '
    'IOPS for production; use 140 (minimum) for dev/test. Ignored for '
    'pd-ssd.',
    lower_bound=0,
)


_BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
    'swap_encryption_boot_disk_size_gb',
    500,
    'Boot disk size in GiB for the benchmark nodepool. 500 GiB is '
    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
    '(see Engineer Assignments table in execution-plan.md). '
    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
    lower_bound=1,
)


_ADD_SWAP_DISK = flags.DEFINE_boolean(
    'swap_encryption_add_swap_disk',
    False,
    'Attach a dedicated second disk to the benchmark nodepool for use as '
    'the swap device. Required for dm-crypt measurement on single-boot-disk '
    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
    'from pod namespaces. The second disk is provisioned via '
    '--additional-node-disk using the same type/IOPS/throughput as the boot '
    'disk flags.',
)


_SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
    'swap_encryption_swap_disk_size_gb',
    500,
    'Size in GiB of the dedicated swap disk when '
    '--swap_encryption_add_swap_disk is True. Must satisfy the '
    'hyperdisk-balanced IOPS constraint: provisioned_iops <= size_gb * 80.',
    lower_bound=1,
)


_STRESS_VM_BYTES = flags.DEFINE_string(
    'swap_encryption_stress_vm_bytes',
    '28G',
    'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. '
    'Should exceed available node RAM to force sustained paging.',
)


_STRESS_VM_BYTES_LIST = flags.DEFINE_list(
    'swap_encryption_stress_vm_bytes_list',
    [],
    'Comma-separated list of --vm-bytes values to sweep in Phase 2a, '
    'e.g. "14G,28G,56G". Overrides --swap_encryption_stress_vm_bytes.',
)


_STRESS_TIMEOUT_SEC = flags.DEFINE_integer(
    'swap_encryption_stress_timeout_sec',
    300,
    'Maximum seconds to wait for the stress-ng swap-pressure phase.',
    lower_bound=1,
)

# DaemonSet constants used by both SwapDaemonSet construction and the EKS path.
_DS_NAME = 'pkb-swap-benchmark'
_DS_NAMESPACE = 'default'
_DS_LABEL = 'pkb-swap-benchmark'
_BENCHMARK_NODEPOOL = 'benchmark'

# fio job matrix.  Each entry is
#   (job name, fio --rw mode, block size, iodepth, human-readable description).
_FIO_JOBS = (
    ('rand_write_iops', 'randwrite', '4k', 256, 'Random write IOPS'),
    ('rand_read_iops', 'randread', '4k', 256, 'Random read IOPS'),
    ('rand_rw_mixed', 'randrw', '4k', 256, 'Mixed random R/W (50/50)'),
    ('seq_write_bw', 'write', '1m', 64, 'Sequential write bandwidth'),
    ('seq_read_bw', 'read', '1m', 64, 'Sequential read bandwidth'),
    ('lat_write', 'randwrite', '4k', 1, 'Random write latency'),
    ('lat_read', 'randread', '4k', 1, 'Random read latency'),
)

# Module-level stash for the io2 volume created in _ensure_io2_volume.
_IO2_VOLUME_ID = ''


def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
  """Returns the benchmark config merged with any user overrides.

  Args:
    user_config: User-supplied configuration overrides.

  Returns:
    The result of loading BENCHMARK_CONFIG for BENCHMARK_NAME with
    `user_config` applied.
  """
  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
def Prepare(spec: _BenchmarkSpec) -> None:
  """Deploys the privileged benchmark DaemonSet and configures node swap.

  The swap-enabled benchmark nodepool is expected to exist already (it is
  declared through swap_config in BENCHMARK_CONFIG and provisioned by cluster
  creation).  This function then:

    1. Creates the SwapDaemonSet on the nodepool named by
       --swap_encryption_nodepool and registers it in spec.resources.
    2. Deletes the default nodepool and waits for the benchmark pod again.
    3. Tunes vm.swappiness / vm.min_free_kbytes, attempts to lift the
       container's own cgroup swap limit, and optionally enables zswap.
    4. Detects the cloud and runs the matching swap setup (GKE, EKS, or a
       plain swap file when the cloud is unknown).

  Args:
    spec: The benchmark spec; its container_cluster is used and the created
      DaemonSet is appended to its resources list.
  """
  cluster = spec.container_cluster

  # Honour --swap_encryption_nodepool.  The flag defaults to the same value
  # as _BENCHMARK_NODEPOOL, which is kept as the fallback for an empty flag.
  nodepool = _NODEPOOL.value or _BENCHMARK_NODEPOOL

  # The swap-enabled nodepool is already provisioned by cluster creation
  # (swap_config declared in BENCHMARK_CONFIG).  Prepare() only deploys the
  # privileged DaemonSet + deletes the cheap e2-medium default pool that GKE
  # requires at cluster creation time.
  logging.info('[swap_encryption] Deploying privileged DaemonSet')
  daemonset = _ds_mod.SwapDaemonSet(
      name=_DS_NAME,
      namespace=_DS_NAMESPACE,
      label=_DS_LABEL,
      nodepool=nodepool,
      image=_DAEMONSET_IMAGE.value,
  )
  daemonset.Create()
  spec.resources.append(daemonset)
  logging.info('[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name)
  _delete_default_pool(cluster)
  # Re-resolve the pod after the nodepool deletion before issuing commands.
  daemonset.WaitForPod()
  logging.info(
      '[swap_encryption] Benchmark pod (post-deletion): %s', daemonset.pod_name
  )

  # Tune kernel swap aggressiveness.
  daemonset.PodExec('sysctl -w vm.swappiness=100', ignore_failure=True)
  if _MIN_FREE_KBYTES.value > 0:
    daemonset.PodExec(
        f'sysctl -w vm.min_free_kbytes={_MIN_FREE_KBYTES.value}'
    )

  # Unlock container cgroup swap (cgroup v2 memory.swap.max, then the
  # cgroup v1 memory.memsw.limit_in_bytes ceiling).  Best-effort: every write
  # is guarded and the whole command ignores failure.
  daemonset.PodExec(
      textwrap.dedent("""
          PKB_CG=$(awk -F: '/^0::/{print $3; exit}' /proc/self/cgroup 2>/dev/null)
          if [ -n "$PKB_CG" ] && [ -f "/sys/fs/cgroup${PKB_CG}/memory.swap.max" ]; then
            echo max > "/sys/fs/cgroup${PKB_CG}/memory.swap.max" 2>/dev/null || true
          fi
          PKB_CG1=$(awk -F: '/:memory:/{print $3; exit}' /proc/self/cgroup 2>/dev/null)
          if [ -n "$PKB_CG1" ] && \
             [ -f "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" ]; then
            echo -1 > "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" \
              2>/dev/null || true
          fi
      """),
      ignore_failure=True,
  )

  # Enable zswap if requested.
  if _ENABLE_ZSWAP.value:
    _enable_zswap(daemonset)

  # Configure cloud-specific swap.
  cloud = _detect_cloud(daemonset)
  logging.info('[swap_encryption] Detected cloud: %s', cloud)

  if cloud == 'gcp':
    _setup_gke_swap(daemonset)
  elif cloud == 'aws':
    _setup_eks_swap(daemonset)
  else:
    logging.warning(
        '[swap_encryption] Unknown cloud – falling back to plain swapfile'
    )
    _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value)
+ cloud = _detect_cloud(daemonset) + logging.info('[swap_encryption] Detected cloud: %s', cloud) + + if cloud == 'gcp': + _setup_gke_swap(daemonset) + elif cloud == 'aws': + _setup_eks_swap(daemonset) + else: + logging.warning( + '[swap_encryption] Unknown cloud – falling back to plain swapfile' + ) + _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value) + + +def _phase_selected(token: str) -> bool: + """Return True if phase `token` should run given --swap_encryption_phases. + + 'all' (the default) selects every phase. Otherwise only the comma-separated + tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. + """ + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ('all' in selected) or (token.lower() in selected) + + +def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: + """Execute all benchmark phases with gate logic. + + Execution is structured in three gated tiers matching the execution plan: + + Tier 1 (Gate 1) — fio microbenchmarks + Raw I/O ceiling of the swap device. Gate 1 fails if fio produces + zero samples (device not found, O_DIRECT error, etc.). + + Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference + Requires an active swap device (Gate 1 must pass). Gate 2 fails if + stress-ng does not complete within timeout. + + Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) + Independent of Tier 2 results; always attempted if Gate 1 passed. + Individual workload failures are logged but do not abort the others. + + If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring + application-level swap performance when the raw device is inaccessible. + """ + daemonset = _get_daemonset(spec) + + pod = daemonset.WaitForPod() + if pod is None: + raise errors.Benchmarks.RunError( + '[swap_encryption] Benchmark pod never became ready.' + ) + # Reset per-run accumulators before starting phases. 
def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
  """Runs the fio tier, the optional cost estimate and the degradation gate.

  Steps performed by this function:

    Tier 1 (Gate 1) — fio microbenchmarks on the detected swap device, when
      'fio' is selected by --swap_encryption_phases.
    Cost estimate — when --swap_encryption_collect_cost is set and Gate 1 did
      not raise.
    Degradation gate — collects reasons the run cannot be trusted (pod
      replaced, pod lost, OOM kills, Gate 1 error or no fio samples on a
      non-loop device), always appends a `swap_encryption_run_status` sample,
      and raises when degraded and --swap_encryption_fail_on_degraded is set.

  A Gate 1 exception is recorded as a degradation reason instead of returning
  early, so the status sample is still emitted and the fail-on-degraded flag
  is honoured.

  NOTE(review): the module docstring also describes Tier 2 (stress-ng, I/O
  interference) and Tier 3 (Redis, kernel build, OpenSearch) phases; this
  function does not dispatch them.

  Args:
    spec: The benchmark spec holding the SwapDaemonSet resource.

  Returns:
    The collected samples, ending with the run-status sample.

  Raises:
    errors.Benchmarks.RunError: If the benchmark pod never becomes ready, or
      if the run is degraded and --swap_encryption_fail_on_degraded is True.
  """
  daemonset = _get_daemonset(spec)

  pod = daemonset.WaitForPod()
  if pod is None:
    raise errors.Benchmarks.RunError(
        '[swap_encryption] Benchmark pod never became ready.'
    )
  # Reset per-run accumulators before starting phases.
  daemonset.oom_events.clear()
  daemonset.pod_lost.clear()
  original_pod = pod
  degraded_reasons: list[str] = []

  swap_dev = _detect_swap_device(daemonset)
  base_meta = _build_metadata(daemonset, swap_dev)
  results: list[sample.Sample] = []
  t_run_start = time.time()

  logging.info('[swap_encryption] swap device: %s', swap_dev)

  # ── Tier 1 / Gate 1: fio microbenchmarks ─────────────────────────────────
  tier1_results = []
  gate1_error = None
  fio_selected = _phase_selected('fio')
  if fio_selected:
    logging.info(
        '[swap_encryption] ── Tier 1 / Gate 1: fio microbenchmarks ──'
    )
    try:
      tier1_results = _phase1_fio(daemonset, swap_dev, base_meta)
      results += tier1_results
    except Exception as e:  # pylint: disable=broad-except
      logging.error(
          '[swap_encryption] Gate 1 FAILED — fio phase error: %s', e
      )
      logging.error(
          '[swap_encryption] Skipping Tiers 2 and 3 (no swap device)'
      )
      # Remember the failure; it is reported through the degradation gate
      # below rather than by silently returning an empty result list.
      gate1_error = e
    else:
      if not tier1_results:
        logging.warning(
            '[swap_encryption] Gate 1 produced no samples '
            '(loop-device skip or parse error) — '
            'continuing to Tier 2 with caution'
        )
  else:
    logging.info(
        '[swap_encryption] Skipping Tier 1 (fio) — not selected by '
        '--swap_encryption_phases=%s',
        ','.join(_PHASES.value),
    )

  # ── Cost estimate ─────────────────────────────────────────────────────────
  # Skipped after a Gate 1 error, matching the previous behaviour of emitting
  # no cost sample for such runs.
  if _COLLECT_COST.value and gate1_error is None:
    elapsed = time.time() - t_run_start
    results += _collect_cost_sample(daemonset, elapsed, base_meta)

  # ── Final degradation gate ────────────────────────────────────────────────
  if daemonset.pod_name and daemonset.pod_name != original_pod:
    degraded_reasons.append(
        f'benchmark pod was replaced during the run ({original_pod} →'
        f' {daemonset.pod_name}) — it was OOM-evicted under swap pressure;'
        ' phases executed after the eviction ran against a'
        ' freshly-initialised pod (empty /tmp, swap re-setup) and may be'
        ' invalid'
    )
  if daemonset.pod_lost:
    degraded_reasons.append(
        'benchmark pod(s) went NotFound during the run'
        f' ({", ".join(daemonset.pod_lost)}) — the pod died (node memory-pressure'
        ' eviction or container exit) and any phase running at or after'
        ' that point (e.g. kernel-build baseline, OpenSearch) produced'
        ' invalid data'
    )
  if daemonset.oom_events:
    degraded_reasons.append(
        f'OOM kill(s) (rc=137) occurred during the run on pod(s) '
        f'{", ".join(daemonset.oom_events)} — a phase exceeded memory and was'
        ' killed by the OOM killer (the container may have restarted in place),'
        ' so the affected phase(s) produced no or partial data'
    )

  if gate1_error is not None:
    degraded_reasons.append(
        f'Gate 1 (fio microbenchmarks) failed with an error ({gate1_error})'
        ' — the raw swap device was never characterised'
    )
  elif fio_selected and not tier1_results:
    if swap_dev.startswith('/dev/loop'):
      # Expected: COS blocks device-mapper from pod namespaces on single-disk
      # nodes.  Tier 2/3 results are still valid; do NOT mark the run as
      # degraded.
      logging.warning(
          '[swap_encryption] Gate 1 (fio) skipped — loop device %s has no'
          ' dm-crypt support from inside a pod. Tier 2/3 results are'
          ' valid. Use c4-*-lssd or --swap_encryption_add_swap_disk for'
          ' fio data.',
          swap_dev,
      )
    else:
      degraded_reasons.append(
          'Gate 1 (fio microbenchmarks) produced no samples — the raw'
          ' swap device was never characterised'
      )

  degraded = bool(degraded_reasons)
  results.append(
      sample.Sample(
          'swap_encryption_run_status',
          0.0 if degraded else 1.0,
          'status',
          dict(
              base_meta,
              degraded=degraded,
              degraded_reasons='; '.join(degraded_reasons) or 'none',
              # +1 counts the status sample being appended here.
              num_samples=len(results) + 1,
          ),
      )
  )

  if degraded:
    msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons)
    logging.error(msg)
    if _FAIL_ON_DEGRADED.value:
      raise errors.Benchmarks.RunError(msg)
  else:
    logging.info(
        '[swap_encryption] Run completed cleanly (%d samples)', len(results)
    )

  return results
def Cleanup(spec: _BenchmarkSpec) -> None:
  """Do nothing here; teardown is delegated to the resources themselves.

  Resources in spec.resources are auto-deleted by the PKB framework, so this
  hook deliberately has no body. Per the author's notes on the resource
  classes (defined outside this function):

  * SwapDaemonSet._Delete() runs in-pod teardown (swapoff, dmsetup remove,
    losetup cleanup, pkill fio/stress-ng) then deletes the DaemonSet.
  * SwapNodePool._Delete() detaches+deletes the swap disk (if any) then
    deletes the benchmark nodepool.

  Args:
    spec: The benchmark spec for this run; not used by this function.
  """
def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str:
  """Detect whether the benchmark pod is running on GCP or AWS.

  Probes are tried in order and the first hit wins:

    1. DMI strings (sys_vendor, product_name, bios_vendor) read from /sys.
       This needs no network access, so it is tried first.
    2. The GCP metadata server (instance/zone).
    3. AWS IMDS, using a token-based (IMDSv2) request for the instance id.

  Both HTTP probes pass `-f` to curl. Without it, an HTTP error response
  (for example another provider's metadata service answering 4xx on the
  link-local address) prints its error body, and any non-empty output would
  be mistaken for a positive detection.

  Args:
    daemonset: Resource whose PodExec(cmd, ignore_failure=...) runs a shell
      command in the benchmark pod and returns a (stdout, stderr) pair.

  Returns:
    'gcp', 'aws', or 'unknown' when no probe matched.
  """
  # Primary: DMI product name / vendor (available via /sys hostPath mount).
  dmi_out, _ = daemonset.PodExec(
      'cat /sys/class/dmi/id/sys_vendor /sys/class/dmi/id/product_name '
      '/sys/class/dmi/id/bios_vendor 2>/dev/null || echo ""',
      ignore_failure=True,
  )
  # Guard against a None stdout from a failed exec.
  dmi_text = (dmi_out or '').strip()
  dmi = dmi_text.lower()
  if 'google' in dmi:
    logging.info(
        '[swap_encryption] Cloud detected via DMI: gcp (%s)',
        dmi_text,
    )
    return 'gcp'
  if any(k in dmi for k in ('amazon', 'ec2', 'aws')):
    logging.info(
        '[swap_encryption] Cloud detected via DMI: aws (%s)',
        dmi_text,
    )
    return 'aws'

  # Secondary: GCP metadata endpoint. -f makes curl print nothing on an
  # HTTP >= 400 response, so only a successful lookup counts.
  gcp_out, _ = daemonset.PodExec(
      'curl -sf -m 3 '
      'http://metadata.google.internal/computeMetadata/v1/instance/zone '
      '-H Metadata-Flavor:Google 2>/dev/null || echo ""',
      ignore_failure=True,
  )
  if (gcp_out or '').strip():
    logging.info('[swap_encryption] Cloud detected via metadata: gcp')
    return 'gcp'

  # Tertiary: AWS IMDS (IMDSv2 token-based; IMDSv1 is often disabled).
  aws_out, _ = daemonset.PodExec(
      'T=$(curl -sf -m 3 -X PUT '
      'http://169.254.169.254/latest/api/token '
      '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null); '
      'curl -sf -m 3 -H "X-aws-ec2-metadata-token: $T" '
      'http://169.254.169.254/latest/meta-data/instance-id '
      '2>/dev/null || echo ""',
      ignore_failure=True,
  )
  if (aws_out or '').strip():
    logging.info('[swap_encryption] Cloud detected via IMDS: aws')
    return 'aws'

  logging.warning(
      '[swap_encryption] Could not detect cloud from DMI or metadata'
  )
  return 'unknown'
+ + Disk detection is split into two separate commands so that the boot-device + name is resolved first and then substituted as a literal string — nested + $() expansions inside a kubectl exec bash -c argument are unreliable. + + If no dedicated data disk is attached (single-disk node) dm-crypt is set up + over a loop device backed by a file on the boot hyperdisk, which still + exercises the full encryption path on the same storage tier. + """ + logging.info('[swap_encryption] GKE: setting up dm-crypt on hyperdisk') + + # Step 1: identify the boot device name (e.g. "nvme0n1", "sda") + boot_out, _ = daemonset.PodExec( + 'lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1', + ignore_failure=True, + ) + boot_base = boot_out.strip() or 'nvme0n1' + logging.info('[swap_encryption] GKE: boot device: %s', boot_base) + + # Step 2: find a non-boot disk using the literal name from step 1 + disk_out, _ = daemonset.PodExec( + "lsblk -d -o NAME,TYPE | awk '$2==\"disk\"{print $1}' " + f"| grep -v '^{boot_base}$' | head -1", + ignore_failure=True, + ) + disk_name = disk_out.strip() + + if not disk_name: + logging.info( + '[swap_encryption] No dedicated data disk found – ' + 'falling back to loop device on /mnt/stateful_partition ' + '(direct-io=on, dm-crypt=%s)', + _ENABLE_DMCRYPT.value, + ) + _setup_gke_loop_device_swap(daemonset) + return + + disk = f'/dev/{disk_name}' + logging.info( + '[swap_encryption] GKE: swap target disk: %s dmcrypt=%s', + disk, + _ENABLE_DMCRYPT.value, + ) + + # Clean up any stale mapping from a previous failed run. 
def _setup_gke_loop_device_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
  """Activate plain loop-device swap on a single-disk GKE node.

  This is the path taken when _setup_gke_hyperdisk_swap finds no dedicated
  second disk (e.g. n2-highmem-32 / n4-highmem-32 single-boot-disk nodes,
  regardless of image type).

  dm-crypt is not layered on this path, for two reasons given by the author:
    1. On COS (Container-Optimised OS) the device-mapper kernel subsystem is
       inaccessible from inside a Kubernetes pod (even privileged).
    2. On UBUNTU_CONTAINERD the loop device is created in the container
       namespace; its behaviour under nsenter is untested.

  The swap is therefore a plain loop device, and Phase 1 (fio) is skipped
  for it.

  Args:
    daemonset: Resource whose PodExec() runs shell commands in the pod.

  Raises:
    RuntimeError: if losetup does not report a /dev/loop* device.
  """
  backing_file = '/mnt/stateful_partition/pkb_swap_backing'
  swap_size_gb = _SWAP_SIZE_GB.value

  # Step 0: release any loop device still bound to the backing file by a
  # previous failed run, then drop the file itself. Best-effort.
  stale_cleanup = textwrap.dedent(f"""
    losetup -j {backing_file} 2>/dev/null | awk -F: '{{print $1}}' | \
    while read dev
    do
      swapoff "$dev" 2>/dev/null || true
      losetup -d "$dev" 2>/dev/null || true
    done
    rm -f {backing_file}
  """)
  daemonset.PodExec(stale_cleanup, ignore_failure=True)

  # Step 1: allocate the backing file on the stateful partition (ext4);
  # truncate is the fallback when fallocate is refused.
  logging.info(
      '[swap_encryption] GKE: creating %dG backing file on'
      ' stateful_partition',
      swap_size_gb,
  )
  allocate = textwrap.dedent(f"""
    fallocate -l {swap_size_gb}G {backing_file} 2>/dev/null || \\
    truncate -s {swap_size_gb}G {backing_file}
  """)
  daemonset.PodExec(allocate)

  # Step 2: bind the file to the first free loop device with direct I/O and
  # echo the device name back so it can be validated here.
  attach = textwrap.dedent(f"""
    LOOP=$(losetup -f) && \\
    losetup --direct-io=on "$LOOP" {backing_file} && \\
    echo "$LOOP"
  """)
  losetup_stdout, _ = daemonset.PodExec(attach)
  loop_device = losetup_stdout.strip()
  if not loop_device.startswith('/dev/loop'):
    raise RuntimeError(
        f'[swap_encryption] losetup failed – output: {losetup_stdout!r}'
    )
  logging.info(
      '[swap_encryption] GKE: loop device: %s  direct-io=on', loop_device
  )

  # Step 3: plain mkswap + swapon (dm-crypt skipped on loop devices). The two
  # tools are run as separate execs, in this order.
  for tool in ('mkswap', 'swapon'):
    daemonset.PodExec(f'{tool} {loop_device}')
  logging.warning(
      '[swap_encryption] GKE: plain loop swap active on %s '
      '(dm-crypt unavailable from COS pod — device-mapper is blocked by '
      'COS kernel namespace restrictions). '
      'Phase 1 (fio) will be skipped. '
      'Use a machine with LSSD (c4-*-lssd) or attach a dedicated second '
      'hyperdisk for dm-crypt measurement.',
      loop_device,
  )
+ + Creates a loop-backed swap file on /mnt/stateful_partition (the node's boot + disk, whose type — pd-balanced or hyperdisk-balanced — is chosen at + nodepool-creation time via --swap_encryption_boot_disk_type). dm-crypt is + layered on the loop device when --swap_encryption_enable_dmcrypt is set + (encryption-on rows 2/4); otherwise plain swap is used (encryption-off rows + 1/3). + """ + size_gb = _SWAP_SIZE_GB.value + backing = '/mnt/stateful_partition/pkb_swap_backing' + logging.info( + '[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)', + size_gb, + _ENABLE_DMCRYPT.value, + ) + + # Clean up any stale loop/mapping from a previous run. + daemonset.PodExec( + textwrap.dedent(f""" + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | while read d + do + swapoff "$d" 2>/dev/null || true + losetup -d "$d" 2>/dev/null || true + done + rm -f {backing} + """), + ignore_failure=True, + ) + + # Allocate the backing file on the boot-disk ext4 stateful partition. 
def _setup_gke_lssd_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
  """Configure swap on GKE Local SSDs, RAID-0 striped when there are several.

  Sequence (go/gke-swap-lssd):
    1. Tear down any swap / dm mapping left behind by a previous run.
    2. Log the block-device topology for diagnosis.
    3. Select candidates: non-rotational devices of TYPE "disk" that have no
       partitions and no mountpoint. The TYPE filter keeps loop devices and
       other non-disk block devices out of the array; without it a ROTA=0
       loop device whose mount is not visible from the pod would be wiped and
       striped.
    4. Dump diagnostics, release holders/mounts on the candidates and wipe
       their signatures.
    5. Re-check that the candidates are still unpartitioned.
    6. Build /dev/md0 (RAID-0) when more than one device remains; a single
       device is used directly.
    7. mkswap + swapon, on a dm-crypt mapping (ephemeral random key) when
       dm-crypt is enabled, otherwise on the block device itself.

  Falls back to _setup_gke_hyperdisk_swap() when step 3 finds nothing, and to
  _setup_gke_lssd_stateful_loop_swap() when step 5 rejects every device.

  Args:
    daemonset: Resource whose PodExec() runs shell commands in the pod.
  """
  logging.info('[swap_encryption] GKE: setting up LSSD RAID-0 swap')

  # Reused-node hygiene: tear down any prior PKB swap mapping FIRST.
  daemonset.PodExec(
      textwrap.dedent("""
        swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
        swapoff -a 2>/dev/null || true
        dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
      """),
      ignore_failure=True,
  )

  # Log the full block-device topology up front for diagnosis.
  topo, _ = daemonset.PodExec(
      'lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null',
      ignore_failure=True,
  )
  logging.info(
      '[swap_encryption] block device topology:\n%s', (topo or '').strip()
  )

  # Identify candidate swap devices = whole disks that are NOT the boot/OS
  # disk. TYPE=="disk" excludes loop/rom/other top-level devices that also
  # report ROTA=0.
  lssd_out, _ = daemonset.PodExec(
      textwrap.dedent("""
        for d in $(lsblk -dno NAME,ROTA,TYPE | awk '$2==0 && $3=="disk"{print $1}')
        do
          if lsblk -no TYPE "/dev/$d" 2>/dev/null | grep -q '^part$'; then
            continue  # has partitions -> boot/OS disk
          fi
          if lsblk -no MOUNTPOINT "/dev/$d" 2>/dev/null | grep -q '[^[:space:]]'; then
            continue  # mounted somewhere -> not a free swap device
          fi
          echo "/dev/$d"
        done
      """),
      ignore_failure=True,
  )
  devices = [
      d.strip() for d in (lssd_out or '').strip().splitlines() if d.strip()
  ]
  if not devices:
    logging.warning(
        '[swap_encryption] No clean (unpartitioned, unmounted) local SSD'
        ' found — falling back to hyperdisk swap path'
    )
    _setup_gke_hyperdisk_swap(daemonset)
    return

  device_list = ' '.join(devices)
  # Alternation pattern for grep -E over the host mount table.
  device_regex = '|'.join(devices)
  n = len(devices)
  logging.info(
      '[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): '
      '%s  dmcrypt=%s',
      n,
      device_list,
      _ENABLE_DMCRYPT.value,
  )

  # Clean up stale mappings, RAID arrays, and GKE-managed mounts. Everything
  # here is best-effort: diagnostics go to stderr, failures are ignored.
  daemonset.PodExec(
      textwrap.dedent(f"""
        echo "[pkb-lssd-cleanup] /proc/mdstat:" >&2
        cat /proc/mdstat 2>/dev/null || true
        echo "[pkb-lssd-cleanup] dmsetup ls:" >&2
        dmsetup ls 2>/dev/null || true
        echo "[pkb-lssd-cleanup] /proc/swaps:" >&2
        cat /proc/swaps 2>/dev/null || true
        echo "[pkb-lssd-cleanup] host mounts on {device_list}:" >&2
        grep -E '{device_regex}' /proc-host/mounts 2>/dev/null || true
        echo "[pkb-lssd-cleanup] sysfs holders:" >&2
        for dev in {device_list}
        do
          devname=$(basename "$dev")
          ls -1 /sys/block/$devname/holders/ 2>/dev/null | while read h
          do
            echo "[pkb-lssd-cleanup]   $dev held by $h" >&2
          done
        done
        echo "[pkb-lssd-cleanup] --- begin teardown ---" >&2
        for dev in {device_list}
        do
          test -b "$dev" || continue
          devname=$(basename "$dev")
          for holder in /sys/block/$devname/holders/*
          do
            test -e "$holder" || continue
            h=$(basename "$holder")
            echo "[pkb-lssd-cleanup] removing holder /dev/$h from $dev" >&2
            if echo "$h" | grep -q "^md"
            then
              mdadm --stop /dev/$h 2>/dev/null || true
            else
              dmsetup remove --force --noudevrules --noudevsync /dev/$h 2>/dev/null || true
            fi
          done
          mounts=$(awk -v d="$dev" '$1==d{{print $2}}' /proc-host/mounts 2>/dev/null || true)
          for mp in $mounts
          do
            echo "[pkb-lssd-cleanup] unmounting $mp from $dev" >&2
            umount -f "$mp" 2>/dev/null || true
          done
        done
        swapoff -a 2>/dev/null || true
        swapoff /dev/mapper/pkb_swap 2>/dev/null || true
        swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
        dmsetup remove --force --noudevrules --noudevsync pkb_swap 2>/dev/null || true
        dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
        mdadm --stop --scan 2>/dev/null || true
        mdadm --zero-superblock {device_list} 2>/dev/null || true
        wipefs -a {device_list} 2>/dev/null || true
        echo "[pkb-lssd-cleanup] lsblk after wipefs:" >&2
        lsblk {device_list} 2>/dev/null || true
        partx -u {device_list} 2>/dev/null || true
        losetup -D 2>/dev/null || true
        rm -f /mnt/stateful_partition/pkb_swap.img 2>/dev/null || true
        sleep 2
      """),
      ignore_failure=True,
  )

  # Verify the devices are truly raw (unpartitioned). Usable devices are
  # echoed on stdout; rejected ones are reported on stderr only.
  raw_check_out, _ = daemonset.PodExec(
      textwrap.dedent(f"""
        for dev in {device_list}
        do
          if lsblk -ln -o TYPE "$dev" 2>/dev/null | grep -q '^part$'
          then
            echo "[pkb-lssd] $dev is partitioned — cannot use as raw block device" >&2
          else
            echo "$dev"
          fi
        done
      """),
      ignore_failure=True,
  )
  raw_devices = [
      d.strip()
      for d in (raw_check_out or '').strip().splitlines()
      if d.strip()
  ]

  if not raw_devices:
    logging.info(
        '[swap_encryption] GKE: all LSSD devices are partitioned — '
        'falling back to loop device on /mnt/stateful_partition'
    )
    _setup_gke_lssd_stateful_loop_swap(daemonset)
    return

  # Use only raw (unpartitioned) devices going forward.
  devices = raw_devices
  device_list = ' '.join(devices)
  n = len(devices)
  logging.info(
      '[swap_encryption] GKE: using %d raw LSSD device(s): %s  dmcrypt=%s',
      n,
      device_list,
      _ENABLE_DMCRYPT.value,
  )

  # For N=1 LSSD, skip mdadm entirely and target the raw device directly.
  # For N>1 we stripe across multiple NVMe devices.
  if n > 1:
    # --run suppresses mdadm's "Continue creating array?" confirmation, which
    # cannot be answered from a non-interactive exec (the earlier wipe is
    # best-effort, so leftover signatures are possible).
    daemonset.PodExec(
        textwrap.dedent(f"""
          mdadm --create /dev/md0 --force --run \\
              --level=0 --raid-devices={n} \\
              {device_list}
          test -b /dev/md0 || {{ echo "mdadm: /dev/md0 not created" >&2; exit 1; }}
        """),
    )
    swap_block_dev = '/dev/md0'
  else:
    swap_block_dev = devices[0]
    logging.info(
        '[swap_encryption] GKE: single LSSD — skipping mdadm, '
        'using %s directly',
        swap_block_dev,
    )

  if _ENABLE_DMCRYPT.value:
    # Plain dm-crypt table with a throw-away random key; the udev exec queue
    # is paused around `dmsetup create` and resumed right after.
    daemonset.PodExec(
        textwrap.dedent(f"""
          grep -q dm_crypt /proc/modules 2>/dev/null || {{
            KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1)
            [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true
          }}
          udevadm control --stop-exec-queue 2>/dev/null || true
          KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n')
          SIZE=$(blockdev --getsz {swap_block_dev})
          printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{swap_block_dev}" | \\
            dmsetup create swap_encrypted --noudevrules --noudevsync
          udevadm control --start-exec-queue 2>/dev/null || true
          unset KEY
          dmsetup mknodes swap_encrypted 2>/dev/null || true
          mkswap /dev/mapper/swap_encrypted
          swapon /dev/mapper/swap_encrypted
        """),
    )
    logging.info(
        '[swap_encryption] GKE: LSSD dm-crypt swap active on %s',
        swap_block_dev,
    )
  else:
    daemonset.PodExec(
        textwrap.dedent(f"""
          mkswap {swap_block_dev}
          swapon {swap_block_dev}
        """),
    )
    logging.info(
        '[swap_encryption] GKE: LSSD plain swap active on %s',
        swap_block_dev,
    )
We create a large file + there and layer loop → dm-crypt → swap on top of it. + """ + img_path = '/mnt/stateful_partition/pkb_swap.img' + + # Clean up any previous run artifacts. + daemonset.PodExec( + textwrap.dedent(f""" + swapoff -a 2>/dev/null || true + dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + losetup -D 2>/dev/null || true + rm -f {img_path} 2>/dev/null || true + """), + ignore_failure=True, + ) + + # Determine file size: 80% of available space, at least 16 GB. + size_out, _ = daemonset.PodExec( + f"df -P /mnt/stateful_partition | awk 'NR==2{{print $4}}'", + ignore_failure=True, + ) + avail_kb = int(size_out.strip() or '0') + swap_gb = max(16, int(avail_kb * 0.8 / 1024 / 1024)) + logging.info( + '[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s', + swap_gb, + img_path, + ) + + # Allocate file (fallocate is instant on ext4; dd fallback for others). + daemonset.PodExec( + textwrap.dedent(f""" + fallocate -l {swap_gb}G {img_path} 2>/dev/null || \\ + dd if=/dev/zero of={img_path} bs=1G count={swap_gb} + chmod 600 {img_path} + losetup --direct-io=on -f {img_path} + """), + timeout=300, + ) + + loop_out, _ = daemonset.PodExec( + f"losetup -j {img_path} | awk -F: '{{print $1}}' | head -1", + ignore_failure=True, + ) + loop_dev = loop_out.strip() + if not loop_dev.startswith('/dev/loop'): + raise RuntimeError( + f'[swap_encryption] losetup failed for {img_path} — got:' + f' {loop_out!r}' + ) + logging.info( + '[swap_encryption] GKE: LSSD stateful-loop device: %s', loop_dev + ) + + if _ENABLE_DMCRYPT.value: + daemonset.PodExec( + textwrap.dedent(f""" + grep -q dm_crypt /proc/modules 2>/dev/null || {{ + KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1) + [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true + }} + udevadm control --stop-exec-queue 2>/dev/null || true + KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n') + SIZE=$(blockdev --getsz {loop_dev}) + 
def _ensure_io2_volume() -> None:
  """Create and attach a dedicated io2 EBS volume to the benchmark node.

  No-op unless --swap_encryption_swap_type=io2. Best-effort: every failure is
  logged and the function returns without raising.

  The instance id and availability zone are taken from the first node's
  providerID (expected shape: aws:///<az>/<instance-id>); the region is the
  zone without its trailing letter.

  Side effects:
    * Sets the module-level _IO2_VOLUME_ID as soon as the volume exists, so
      the id is not lost if a later step fails. _setup_eks_io2_swap uses it
      for serial-based device detection.
    * Sleeps 15 s after a successful attach so the NVMe device node can
      appear.

  A failed attach is reported as such; it is not logged as "Attached" and the
  settle sleep is skipped.
  """
  global _IO2_VOLUME_ID
  if _SWAP_TYPE.value != 'io2':
    return
  out, _, rc = kubectl.RunKubectlCommand(
      ['get', 'nodes', '-o', 'jsonpath={.items[0].spec.providerID}'],
      raise_on_failure=False,
  )
  provider = (out or '').strip()  # aws:///us-east-1a/i-0abc...
  parts = [p for p in provider.split('/') if p]
  # Require scheme + zone + instance id, otherwise the indexing below would
  # either raise or produce a bogus zone/region.
  if (
      rc != 0
      or 'aws://' not in provider
      or len(parts) < 3
      or not parts[-1].startswith('i-')
  ):
    logging.warning(
        '[swap_encryption] io2 attach skipped: could not resolve '
        'EC2 instance from providerID=%r',
        provider,
    )
    return
  instance_id, az = parts[-1], parts[-2]
  region = az[:-1]
  base = ['aws', 'ec2', '--region', region]
  try:
    create_args = [
        'create-volume',
        '--volume-type',
        'io2',
        '--size',
        '500',
        '--iops',
        '16000',
        '--availability-zone',
        az,
        '--tag-specifications',
        'ResourceType=volume,Tags=[{Key=pkb,Value=swap_encryption}]',
    ]
    if _IO2_ENCRYPTED.value:
      create_args.append('--encrypted')
      if _IO2_KMS_KEY_ID.value:
        create_args += ['--kms-key-id', _IO2_KMS_KEY_ID.value]
      logging.info(
          '[swap_encryption] io2 volume will be EBS-encrypted '
          '(row: hardware encryption)'
      )
    else:
      logging.info(
          '[swap_encryption] io2 volume UNENCRYPTED (baseline row)'
      )
    create_args += ['--query', 'VolumeId', '--output', 'text']
    vol_id, create_err, vrc = vm_util.IssueCommand(
        base + create_args, raise_on_failure=False
    )
    vol_id = (vol_id or '').strip()
    if vrc != 0 or not vol_id.startswith('vol-'):
      logging.warning(
          '[swap_encryption] io2 create-volume failed: %r (stderr: %s)',
          vol_id,
          (create_err or '').strip(),
      )
      return
    # Record the id immediately: the volume now exists whether or not the
    # attach below succeeds.
    _IO2_VOLUME_ID = vol_id
    vm_util.IssueCommand(
        base + ['wait', 'volume-available', '--volume-ids', vol_id],
        raise_on_failure=False,
    )
    _, attach_err, attach_rc = vm_util.IssueCommand(
        base
        + [
            'attach-volume',
            '--volume-id',
            vol_id,
            '--instance-id',
            instance_id,
            '--device',
            '/dev/sdf',
        ],
        raise_on_failure=False,
    )
    if attach_rc != 0:
      logging.warning(
          '[swap_encryption] io2 attach-volume failed for %s on %s'
          ' (rc=%s): %s',
          vol_id,
          instance_id,
          attach_rc,
          (attach_err or '').strip(),
      )
      return
    vm_util.IssueCommand(
        base + ['wait', 'volume-in-use', '--volume-ids', vol_id],
        raise_on_failure=False,
    )
    logging.info(
        '[swap_encryption] Attached io2 volume %s to %s as /dev/sdf',
        vol_id,
        instance_id,
    )
    time.sleep(15)  # allow the NVMe device node to appear
  except Exception as e:  # pylint: disable=broad-except
    # Deliberately broad: this step is best-effort and must not abort the run.
    logging.warning(
        '[swap_encryption] io2 attach error (continuing): %s', e
    )
def _setup_eks_instance_store_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
  """Set up swap on an AWS NVMe Instance Store device.

  Device discovery, first hit wins:
    1. `nvme list` entry whose model mentions "Instance Storage".
    2. `lsblk` fallback: first NVMe/instance device that is not nvme0*.
    3. A fixed list of common device paths, checked with `test -b`.

  Steps 1 and 2 run in one script that tests the *output* of step 1. The
  previous form chained them with `||` after a pipeline ending in `head -1`;
  a pipeline's status is that of its last command, so the `lsblk` branch
  could never run.

  When nothing is found, falls back to _setup_plain_swap_file(). No dm-crypt
  layer is added on this path (the author's note: Nitro encrypts Instance
  Store writes in hardware).

  Args:
    daemonset: Resource whose PodExec() runs shell commands in the pod.
  """
  logging.info('[swap_encryption] EKS: setting up Instance Store swap')

  # Find the Instance Store NVMe device (not the root EBS volume).
  nvme_out, _ = daemonset.PodExec(
      textwrap.dedent("""
        dev=$(nvme list 2>/dev/null | awk '/Instance Storage/{print $1}' | head -1)
        if [ -z "$dev" ]; then
          dev=$(lsblk -d -o NAME,MODEL | grep -i 'instance\\|nvme' | grep -v 'nvme0' | awk '{print "/dev/"$1}' | head -1)
        fi
        echo "$dev"
      """),
      ignore_failure=True,
  )
  device = (nvme_out or '').strip()
  if not device:
    # Common Instance Store device paths on AWS.
    for candidate in ['/dev/nvme1n1', '/dev/nvme2n1', '/dev/xvdb']:
      exists_out, _ = daemonset.PodExec(
          f'test -b {candidate} && echo yes || echo no',
          ignore_failure=True,
      )
      if (exists_out or '').strip() == 'yes':
        device = candidate
        break

  if not device:
    logging.warning(
        '[swap_encryption] No Instance Store NVMe found – creating swapfile'
    )
    _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value)
    return

  logging.info('[swap_encryption] EKS: Instance Store device: %s', device)

  # Nitro encrypts all Instance Store writes automatically.
  daemonset.PodExec(
      textwrap.dedent(f"""
        mkswap {device} && \\
        swapon {device}
      """),
  )
  logging.info(
      '[swap_encryption] EKS: Instance Store swap active on %s', device
  )
def _setup_plain_swap_file(
    daemonset: _ds_mod.SwapDaemonSet, size_gb: int
) -> None:
  """Last-resort swap: a file in /tmp exposed through a loop device.

  The kernel refuses (EINVAL) to swap onto a plain file that lives on
  overlayfs, which is what the container root is. Binding the file to a loop
  device gives the mm subsystem a real block device, and that is accepted.

  Args:
    daemonset: Resource whose PodExec() runs shell commands in the pod.
    size_gb: Size of the swap file, in GB.
  """
  logging.info('[swap_encryption] Creating %dGB loop-device swap', size_gb)
  swapfile = '/tmp/pkb_swapfile'
  # Each step only runs if the previous one succeeded; the steps are joined
  # into a single `&&` chain with shell line continuations.
  steps = (
      f'fallocate -l {size_gb}G {swapfile}',
      f'chmod 600 {swapfile}',
      'LOOP=$(losetup -f)',
      f'losetup "$LOOP" {swapfile}',
      'mkswap "$LOOP"',
      'swapon "$LOOP"',
      'echo "swap loop device: $LOOP"',
  )
  script = '\n' + ' && \\\n'.join(steps) + '\n'
  daemonset.PodExec(script)
Use c4-*-lssd, --swap_encryption_add_swap_disk,' + ' or --swap_encryption_swap_type=boot_disk for fio data.', + swap_dev, + ) + return [] + + results = [] + + daemonset.PodExec(f'swapoff {swap_dev}', ignore_failure=True) + + # Pre-fill device so read tests have real data. + # Cap at 20 GiB — enough to warm up the dm-crypt pipeline. + _PREFILL_GIB = 20 + prefill_timeout = _PREFILL_GIB * 1024 // 150 + 60 + prefill_timeout = max(prefill_timeout, 300) + logging.info( + '[swap_encryption] Pre-filling %d GiB of %s', _PREFILL_GIB, swap_dev + ) + daemonset.PodExec( + ( + f'fio --name=prefill --filename={swap_dev} --ioengine=libaio' + f' --direct=1 --rw=write --bs=1m --size={_PREFILL_GIB}g --verify=0' + ' --output=/tmp/pkb_fio_prefill.log' + ), + timeout=prefill_timeout, + ignore_failure=True, + ) + + # Each fio job: runtime + 90 s buffer (run + JSON write + file read). + fio_run_timeout = _FIO_RUNTIME_SEC.value + 90 + fio_read_timeout = 60 + + for name, rw, bs, depth, label in _FIO_JOBS: + logging.info('[swap_encryption] fio: %s', name) + out_file = f'/tmp/pkb_fio_{name}.json' + # Remove stale output first to avoid silently reusing a previous result. 
+ daemonset.PodExec( + f'rm -f {out_file}', + ignore_failure=True, + _retries=0, + timeout=15, + ) + run_cmd = ( + f'fio --name={name} --filename={swap_dev} ' + '--ioengine=libaio --direct=1 --verify=0 --randrepeat=0 ' + f'--bs={bs} --iodepth={depth} --rw={rw} ' + f'--time_based --runtime={_FIO_RUNTIME_SEC.value}s ' + f'--output-format=json --output={out_file}' + ) + _, err = daemonset.PodExec( + run_cmd, + timeout=fio_run_timeout, + ignore_failure=True, + _retries=0, + ) + if 'connection reset by peer' in err: + logging.warning( + '[swap_encryption] fio %s: kubectl exec connection ' + 'reset; result may be incomplete', + name, + ) + out, _ = daemonset.PodExec( + f'cat {out_file} 2>/dev/null || echo ""', + timeout=fio_read_timeout, + ignore_failure=True, + ) + results += _parse_fio_json(out, name, label, base_meta) + + # fio prefill overwrites the entire device, destroying the mkswap header. + # Re-stamp and re-enable before the remaining phases need active swap. + daemonset.PodExec( + f'mkswap {swap_dev} && swapon {swap_dev}', + ignore_failure=True, + timeout=120, + ) + return results + + +def _parse_fio_json( + stdout: str, job_name: str, label: str, base_meta: dict +) -> list[sample.Sample]: + """Parse fio JSON output into PKB Samples.""" + results = [] + try: + data = json.loads(stdout) + except (json.JSONDecodeError, ValueError): + logging.warning( + '[swap_encryption] fio JSON parse failed for %s', job_name + ) + return results + + meta = dict(base_meta, fio_job=job_name, fio_label=label) + for job in data.get('jobs', []): + for direction in ('read', 'write'): + d = job.get(direction, {}) + if not d or d.get('io_bytes', 0) == 0: + continue + iops = float(d.get('iops', 0)) + bw_kib = float(d.get('bw', 0)) + clat = d.get('clat_ns', {}) + pct = clat.get('percentile', {}) + lat_mean = float(clat.get('mean', 0)) / 1000.0 + lat_p50 = float(pct.get('50.000000', 0)) / 1000.0 + lat_p99 = float(pct.get('99.000000', 0)) / 1000.0 + lat_p999 = float(pct.get('99.900000', 
_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
    # GCP (on-demand, us-central1 unless noted)
    'c4-standard-8-lssd': 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
    'c4-standard-8': 0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
    'n4-highmem-32': 3.0256,  # 32 vCPU, 256 GB RAM
    'n2-highmem-32': 2.5216,  # 32 vCPU, 256 GB RAM
    'n2-standard-32': 1.5264,  # 32 vCPU, 120 GB RAM
    'z3-highmem-8': 2.7248,  # 8 vCPU + 4× LSSD
    # AWS
    'i4i.4xlarge': 1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
    'i4i.2xlarge': 0.7480,
    'm6id.4xlarge': 0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
    'm6i.4xlarge': 0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
    'r6i.4xlarge': 1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
}


def _collect_cost_sample(
    daemonset: _ds_mod.SwapDaemonSet,
    elapsed_sec: float,
    base_meta: dict,
) -> list[sample.Sample]:
  """Emit a cost_estimate_usd sample for the benchmark run.

  The instance type is resolved in this order, stopping at the first
  non-empty value:
    1. the explicit instance-size label flag (_INSTANCE_SIZE_LABEL),
    2. the GCP metadata server, queried from inside the pod,
    3. the AWS instance metadata service, queried from inside the pod,
    4. the benchmark machine-type flag (_BENCHMARK_MACHINE_TYPE).
  The price is looked up in _INSTANCE_PRICE_USD_PER_HR; if the type is
  unknown, the sample is omitted and a warning is logged.

  Args:
    daemonset: Active SwapDaemonSet resource.
    elapsed_sec: Wall-clock seconds the benchmark phases took.
    base_meta: Shared metadata dict.

  Returns:
    A list of zero or one sample.Sample.
  """
  # The override always wins, so check it first and skip the metadata
  # round-trips whose results it would discard.
  instance_type = _INSTANCE_SIZE_LABEL.value or ''

  if not instance_type:
    # GCP: machine type is the last segment of the metadata URL value
    gcp_type_out, _ = daemonset.PodExec(
        'curl -s -m 3 --fail'
        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
        ignore_failure=True,
    )
    if gcp_type_out.strip():
      instance_type = gcp_type_out.strip().split('/')[-1]

  if not instance_type:
    # AWS: instance-type is a plain string. Request an IMDSv2 session token
    # first; nodes that require IMDSv2 answer token-less requests with 401.
    # If no token is obtained, ${TOKEN:+...} expands to nothing and the
    # request is sent without the header (IMDSv1 style).
    aws_type_out, _ = daemonset.PodExec(
        'TOKEN=$(curl -s -m 3 --fail -X PUT'
        ' -H "X-aws-ec2-metadata-token-ttl-seconds: 60"'
        ' http://169.254.169.254/latest/api/token 2>/dev/null || true);'
        ' curl -s -m 3 --fail'
        ' ${TOKEN:+-H "X-aws-ec2-metadata-token: $TOKEN"}'
        ' http://169.254.169.254/latest/meta-data/instance-type'
        ' 2>/dev/null || echo ""',
        ignore_failure=True,
    )
    instance_type = aws_type_out.strip()

  # Last resort: fall back to the benchmark machine type flag.
  if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
    instance_type = _BENCHMARK_MACHINE_TYPE.value
    logging.info(
        '[swap_encryption] Instance type from metadata unavailable; using'
        ' --swap_encryption_benchmark_machine_type=%s for cost tracking',
        instance_type,
    )

  price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
  if price is None:
    logging.warning(
        '[swap_encryption] Unknown instance type "%s" — skipping cost'
        ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost'
        ' tracking.',
        instance_type,
    )
    return []

  hours = elapsed_sec / 3600.0
  cost = hours * price
  meta = dict(
      base_meta,
      instance_type=instance_type,
      price_usd_per_hr=price,
      benchmark_elapsed_sec=round(elapsed_sec, 1),
  )
  return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)]
def _detect_swap_device(daemonset: _ds_mod.SwapDaemonSet) -> str:
  """Return the path of the swap device the benchmark should target.

  The device flag (_SWAP_DEVICE), when set, is returned without probing.
  Otherwise the pod is asked for the first entry of /proc/swaps, falling back
  to /dev/mapper/swap_encrypted if no swap is active but that node exists.

  Args:
    daemonset: Benchmark DaemonSet whose pod runs the probe.

  Returns:
    The swap device path.

  Raises:
    ValueError: If the probe reports no device.
  """
  override = _SWAP_DEVICE.value
  if override:
    return override

  # /proc/swaps is the source of truth: it lists the swap device that is
  # ACTUALLY active. Checking only `test -e /dev/mapper/swap_encrypted` is not
  # enough, because a stale dm-crypt mapping from a previous run on a reused
  # node can still exist as a /dev node while being non-functional — so the
  # mapper path is consulted only when /proc/swaps has no entry.
  probe_script = textwrap.dedent("""
      ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
      if [ -n "$ACTIVE" ]
      then
        echo "$ACTIVE"
      elif test -e /dev/mapper/swap_encrypted
      then
        echo /dev/mapper/swap_encrypted
      fi
      """)
  probe_out, _ = daemonset.PodExec(probe_script, ignore_failure=True)

  # Only the last non-blank output line is taken as the device path.
  reported = probe_out.strip().splitlines()
  if reported:
    device = reported[-1].strip()
    if device:
      return device
  raise ValueError(
      'No active swap device found in the benchmark pod. '
      'Use --swap_encryption_device to specify one.'
  )
def _build_metadata(
    daemonset: _ds_mod.SwapDaemonSet, swap_dev: str
) -> dict[str, Any]:
  """Collect node environment, encryption type, and config into a dict.

  Values probed from inside the pod: kernel release (`uname -r`), MemTotal
  from /proc/meminfo, the summed size column of /proc/swaps, the dm table of
  a /dev/mapper swap device, and the cloud instance type. Everything else is
  copied from the benchmark flags.

  Args:
    daemonset: Benchmark DaemonSet whose pod runs the probes.
    swap_dev: Path of the swap device under test.

  Returns:
    Metadata dict shared by the benchmark's samples.
  """
  kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True)
  mem_out, _ = daemonset.PodExec(
      "awk '/MemTotal/{print $2}' /proc/meminfo",
      ignore_failure=True,
  )
  swap_out, _ = daemonset.PodExec(
      "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
      ignore_failure=True,
  )

  # Both probes report KiB; an unparsable value is recorded as 0 rather than
  # failing the run.
  try:
    mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
  except ValueError:
    mem_gb = 0
  try:
    swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
  except ValueError:
    swap_gb = 0

  # Encryption type — key off dm-crypt presence + the swap target, NOT the
  # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro-
  # encrypted; only the AWS targets (instance_store / io2) are.
  enc = 'unknown'
  if '/dev/mapper/' in swap_dev:
    table_out, _ = daemonset.PodExec(
        f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
        ignore_failure=True,
    )
    enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
  elif _SWAP_TYPE.value in ('instance_store', 'io2'):
    enc = 'nitro_hardware_offload'  # AWS: encrypted by the Nitro card
  elif not _ENABLE_DMCRYPT.value:
    enc = 'none'  # GKE plain swap (encryption OFF)

  cloud = _detect_cloud(daemonset)

  # Instance label: explicit flag first, then GCP metadata, then AWS metadata.
  instance_label = _INSTANCE_SIZE_LABEL.value
  if not instance_label:
    # GCP returns a resource path; the machine type is its last segment.
    gcp_type_out, _ = daemonset.PodExec(
        'curl -s -m 3 --fail'
        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
        ignore_failure=True,
    )
    if gcp_type_out.strip():
      instance_label = gcp_type_out.strip().split('/')[-1]
  if not instance_label:
    # AWS: request an IMDSv2 session token first; nodes that require IMDSv2
    # answer token-less requests with 401. If no token is obtained,
    # ${TOKEN:+...} expands to nothing and the request is sent without the
    # header (IMDSv1 style).
    aws_type_out, _ = daemonset.PodExec(
        'TOKEN=$(curl -s -m 3 --fail -X PUT'
        ' -H "X-aws-ec2-metadata-token-ttl-seconds: 60"'
        ' http://169.254.169.254/latest/api/token 2>/dev/null || true);'
        ' curl -s -m 3 --fail'
        ' ${TOKEN:+-H "X-aws-ec2-metadata-token: $TOKEN"}'
        ' http://169.254.169.254/latest/meta-data/instance-type'
        ' 2>/dev/null || echo ""',
        ignore_failure=True,
    )
    instance_label = aws_type_out.strip()

  return {
      'benchmark': BENCHMARK_NAME,
      'execution_mode': 'kubernetes_privileged_pod',
      'cloud': cloud,
      'instance_size': instance_label,
      'kernel_version': kernel_out.strip(),
      'host_memory_gb': mem_gb,
      'swap_device': swap_dev,
      'swap_size_gb': swap_gb,
      'swap_encryption': enc,
      # Test-matrix columns: storage target, encryption on/off, image, IOPS
      'storage_target': _SWAP_TYPE.value,
      'boot_disk_type': _BOOT_DISK_TYPE.value,
      'dmcrypt_enabled': _ENABLE_DMCRYPT.value,
      'node_image_type': _NODE_IMAGE_TYPE.value,
      'boot_disk_iops_target': _BOOT_DISK_IOPS.value,
      'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value,
      # Other config
      'zswap_enabled': _ENABLE_ZSWAP.value,
      'min_free_kbytes': _MIN_FREE_KBYTES.value,
      'fio_runtime_sec': _FIO_RUNTIME_SEC.value,
      # Requested config value only.
      'stress_vm_bytes_requested': _STRESS_VM_BYTES.value,
      'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value,
      'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
      'nodepool': _NODEPOOL.value,
  }