GoogleCloudPlatform · DevVegeta · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 24, 2026
diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
@@ -0,0 +1,266 @@
+# DaemonSet for the swap-encryption benchmark: one privileged pod per node of
+# the benchmark nodepool.  The pod installs the benchmark tooling, then idles.
+# Every templated scalar below is double-quoted so that an empty, numeric or
+# boolean-looking expansion still parses as a YAML string.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: "{{ ds_name }}"
+  namespace: "{{ ds_namespace }}"
+  labels:
+    app: "{{ ds_label }}"
+spec:
+  selector:
+    matchLabels:
+      app: "{{ ds_label }}"
+  template:
+    metadata:
+      labels:
+        app: "{{ ds_label }}"
+    spec:
+      # The pod shares the node's PID and network namespaces.
+      hostPID: true
+      hostNetwork: true
+      # Pin to the benchmark nodepool — never schedule on the dummy default pool.
+      nodeSelector:
+        pkb_nodepool: "{{ benchmark_nodepool }}"
+      # A key-less "Exists" toleration matches every taint.
+      tolerations:
+      - operator: Exists
+      containers:
+      - name: benchmark
+        image: "{{ image }}"
+        # The whole provisioning script is passed to bash as a single -c
+        # argument via the literal block scalar that follows.
+        command:
+        - bash
+        - -c
+        - |
+          echo "[pkb] Installing benchmark tools..."
+          # Retry apt-get up to 3 times — transient network failures are
+          # common on a freshly-started GKE node.  Critical tools (fio,
+          # stress-ng) must be present before we write the ready sentinel;
+          # a silent || true here would cause /tmp/pkb_ready to appear even
+          # when tools are missing, breaking all subsequent phases.
+          #
+          # Line continuations must be a SINGLE backslash: this text reaches
+          # bash verbatim (template data and YAML literal blocks do not
+          # unescape anything), and a doubled backslash is an escaped literal
+          # backslash followed by a real newline, which would split the
+          # install command and let the trailing "PKB_APT_OK=1" line succeed
+          # on its own.
+          PKB_APT_OK=0
+          for _attempt in 1 2 3; do
+            # A failed index refresh is tolerated; the install below decides
+            # whether this attempt counts as a success.
+            apt-get update -qq 2>&1 || true
+            DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
+              fio \
+              stress-ng \
+              sysstat \
+              cryptsetup \
+              mdadm \
+              redis-server \
+              redis-tools \
+              git \
+              wget \
+              curl \
+              make \
+              gcc \
+              bc \
+              flex \
+              bison \
+              libelf-dev \
+              libssl-dev \
+              cgroup-tools \
+              nvme-cli \
+              util-linux \
+              python3-pip \
+              libevent-dev \
+              libpcre3-dev \
+              zlib1g-dev \
+              build-essential \
+              autoconf \
+              automake \
+              libtool \
+              libtool-bin \
+              pkg-config \
+              python3-dev \
+              default-jre-headless \
+              2>&1 && PKB_APT_OK=1 && break
+            echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
+            sleep 15
+          done
+          # Hard gate: abort the container (no ready sentinel) unless apt
+          # reported success AND both critical binaries are on PATH.
+          if [ "$PKB_APT_OK" != "1" ] || \
+             ! command -v fio >/dev/null 2>&1 || \
+             ! command -v stress-ng >/dev/null 2>&1; then
+            echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
+            exit 1
+          fi
+          echo "[pkb] Installing memtier_benchmark from source..."
+          # Pin a stable release tag — building from the moving default
+          # branch (HEAD) intermittently broke (memtier_benchmark not found
+          # → Phase 3a lost its P50/P90/P99 latency).  2.2.1 matches the
+          # version PKB's memtier package (memtier.MemtierResult.Parse) is
+          # validated against and builds cleanly with the apt deps above.
+          # Fall back to HEAD only if the tagged clone fails.
+          #
+          # The build is best-effort: it runs in a subshell whose combined
+          # output goes to /tmp/pkb_memtier_build.log, and a failure only
+          # logs a warning.  Continuations are single backslashes because
+          # this text reaches bash verbatim; a doubled backslash would end
+          # the command at the newline and break the && chain.
+          if ! command -v memtier_benchmark >/dev/null 2>&1; then
+            (cd /tmp && \
+              rm -rf memtier_benchmark && \
+              ( git clone --depth 1 --branch 2.2.1 \
+                  https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \
+                git clone --depth 1 \
+                  https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \
+              cd memtier_benchmark && \
+              autoreconf -ivf 2>&1 && \
+              ./configure 2>&1 && \
+              make -j$(nproc) 2>&1 && \
+              make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \
+              echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used" >&2
+          fi
+          # Report the version actually found on PATH, if any.
+          if command -v memtier_benchmark >/dev/null 2>&1; then
+            echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
+          fi
+          echo "[pkb] Installing esrally (lightweight)..."
+          # Best-effort install; all pip output is collected in
+          # /tmp/pkb_esrally_build.log.  First try esrally together with
+          # elastic-transport, then esrally alone, and only warn if both
+          # fail.  The || chain relies on single-backslash continuations:
+          # this text reaches bash verbatim, so a doubled backslash would be
+          # a literal backslash plus a newline and cut the chain apart.
+          python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
+          pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
+            pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
+            echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used" >&2
+          # pip can succeed without putting the entry point on PATH, so
+          # check for the binary itself.
+          if command -v esrally >/dev/null 2>&1; then
+            echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
+          else
+            echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
+          fi
+          echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
+          # Phase 3c needs a real search server on :9200.  Nothing in apt
+          # ships one and the pod has no systemd, so install the OpenSearch
+          # bundle (ships its own JDK) and launch the binary directly in the
+          # phase.  All best-effort: if any step fails the phase probes the
+          # endpoint and skips cleanly rather than recording fake timings.
+          #
+          # Backslashes in this script reach bash verbatim, so continuations
+          # are a single backslash and printf formats use a single-backslash
+          # \n.  A doubled backslash would make printf emit the two
+          # characters backslash-n instead of a newline and collapse each
+          # config file onto one invalid line.
+          if [ ! -x /opt/opensearch/bin/opensearch ]; then
+            OS_VER=2.15.0
+            # Download, unpack and rename inside a subshell so the cd does
+            # not leak; output goes to /tmp/pkb_opensearch_build.log.
+            (cd /opt && \
+              wget -q --timeout=600 -O os.tgz \
+                "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \
+              tar -xzf os.tgz && rm -f os.tgz && \
+              mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \
+              echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
+          fi
+          if [ -x /opt/opensearch/bin/opensearch ]; then
+            # pkbos owns and runs OpenSearch (it refuses to run as root).
+            # Give it a home so HOME/temp paths are writable.
+            id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
+            # Three-line config: single-node discovery, loopback only,
+            # security plugin disabled.
+            printf 'discovery.type: single-node\nnetwork.host: 127.0.0.1\nplugins.security.disabled: true\n' \
+              > /opt/opensearch/config/opensearch.yml
+            mkdir -p /opt/opensearch/config/jvm.options.d
+            # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
+            # On a 252 GB node this still leaves plenty of page cache to
+            # pressure into swap during the phase.
+            # "--" stops printf from parsing the leading "-Xms" as an option.
+            printf -- '-Xms2g\n-Xmx2g\n' \
+              > /opt/opensearch/config/jvm.options.d/pkb-heap.options
+            sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
+            # CRITICAL: never run the binary as root here (it bails and
+            # leaves root-owned files in logs/ that block the pkbos server).
+            # Clear any stale logs and chown everything to pkbos LAST.
+            rm -f /opt/opensearch/logs/* 2>/dev/null || true
+            chown -R pkbos /opt/opensearch 2>/dev/null || true
+            echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
+          fi
+          echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
+          # Best-effort download and unpack of the kernel tree into the
+          # stateful partition, a hostPath that outlives this container.
+          # Because the directory persists, failed steps must not leave
+          # artifacts that a later run would mistake for finished work.
+          PKB_KVER="{{ kernel_version }}"
+          PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
+          PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
+          PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
+          # ${PKB_KVER%%.*} keeps only the text before the first dot, i.e.
+          # the major version used in the kernel.org directory name.
+          PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
+          mkdir -p "$PKB_KROOT"
+          # -s (exists and non-empty) rather than -f: an empty file left by
+          # an earlier failed download must trigger a fresh attempt.
+          if [ ! -s "$PKB_KTARBALL" ]; then
+            # wget -O creates its output file before any data arrives, so
+            # download to a side file and rename only on success.
+            if wget -q --timeout=300 -O "$PKB_KTARBALL.part" "$PKB_KURL" 2>&1; then
+              mv -f "$PKB_KTARBALL.part" "$PKB_KTARBALL"
+            else
+              rm -f "$PKB_KTARBALL.part"
+              echo "[pkb] WARNING: kernel tarball download failed" >&2
+            fi
+          fi
+          if [ -s "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
+            echo "[pkb] Extracting kernel source (xz)..."
+            if ! tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1; then
+              # Drop a half-extracted tree so the -d test above lets a later
+              # run retry instead of treating it as complete.
+              rm -rf "$PKB_KSRC"
+              echo "[pkb] WARNING: kernel source extraction failed" >&2
+            fi
+          fi
+          echo "[pkb] Unlocking container cgroup swap limits..."
+          # Why this exists: under cgroup v2 GKE sets memory.swap.max=0 on
+          # each container, so the kernel cannot page that cgroup out to
+          # swap even with a swap device present and vm.swappiness>0, and
+          # stress-ng is OOM-killed after ~15s.
+          #
+          # An earlier version located the cgroup via /proc/self/cgroup.
+          # Inside a cgroup namespace that file reports "0::/", so the write
+          # landed on the host ROOT cgroup, did nothing, and swap stayed
+          # locked.  /sys is the host's tree (hostPath mount) and the pod is
+          # privileged, so lift the limit on everything below kubepods
+          # instead; that subtree is guaranteed to include this container.
+          if test -d /sys/fs/cgroup/kubepods.slice || test -d /sys/fs/cgroup/kubepods; then
+            # cgroup v2: set every memory.swap.max under kubepods* to 'max'.
+            find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max 2>/dev/null \
+              | while read -r swap_max_file; do
+                  echo max > "$swap_max_file" 2>/dev/null || true
+                done
+          fi
+          # Best-effort extras: the path as seen from our own namespace (the
+          # entry with an empty controller field) and the unified root.
+          PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup 2>/dev/null)
+          for own_swap_max in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
+                              /sys/fs/cgroup/memory.swap.max; do
+            if [ -f "$own_swap_max" ]; then
+              echo max > "$own_swap_max" 2>/dev/null || true
+            fi
+          done
+          # cgroup v1 fallback: -1 removes the combined RAM+swap ceiling.
+          find /sys/fs/cgroup/memory -path '*kubepods*' -name memory.memsw.limit_in_bytes 2>/dev/null \
+            | while read -r memsw_file; do
+                echo -1 > "$memsw_file" 2>/dev/null || true
+              done
+          # Verification for the pod log: grep -L prints the files with no
+          # line starting with 'max', i.e. those still capping swap; only
+          # the first such file is kept.
+          PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max 2>/dev/null \
+            | xargs -r grep -L '^max' 2>/dev/null | head -1)
+          if [ -z "$PKB_STILL_CAPPED" ]; then
+            echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
+          else
+            echo "[pkb] WARNING: cgroup swap still capped at \
+            $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
+            OOM-killed before swap is exercised" >&2
+          fi
+          # Final step of the provisioning script: create the ready sentinel
+          # file, then block forever so the container keeps running.
+          echo "[pkb] Tools installed. Writing ready sentinel."
+          # Reached only if the critical-tool check did not exit 1 earlier.
+          touch /tmp/pkb_ready
+          # Keep the script (and therefore the container) alive indefinitely.
+          sleep infinity
+        # Runs privileged, with SYS_ADMIN and IPC_LOCK also listed
+        # explicitly.
+        securityContext:
+          privileged: true
+          capabilities:
+            add:
+            - SYS_ADMIN
+            - IPC_LOCK
+        # Only a memory request is declared; no limits are set.
+        resources:
+          requests:
+            memory: 512Mi
+        # Expose the scheduling node's name to the script via the downward API.
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        # Host paths mounted into the container; /proc-host and /lib/modules
+        # are read-only, the rest are writable.
+        volumeMounts:
+        - mountPath: /dev
+          name: dev
+        - mountPath: /sys
+          name: sys
+        - mountPath: /run
+          name: run
+        - mountPath: /proc-host
+          name: proc-host
+          readOnly: true
+        - mountPath: /mnt/stateful_partition
+          name: stateful-partition
+        - mountPath: /lib/modules
+          name: lib-modules
+          readOnly: true
+      # hostPath sources backing the mounts above.
+      volumes:
+      - name: dev
+        hostPath:
+          path: /dev
+      - name: sys
+        hostPath:
+          path: /sys
+      - name: run
+        hostPath:
+          path: /run
+      - name: proc-host
+        hostPath:
+          path: /proc
+      # Created on the node if it does not exist yet.
+      - name: stateful-partition
+        hostPath:
+          type: DirectoryOrCreate
+          path: /mnt/stateful_partition
+      # Must already exist on the node as a directory.
+      - name: lib-modules
+        hostPath:
+          type: Directory
+          path: /lib/modules