Skip to content
266 changes: 266 additions & 0 deletions perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
# DaemonSet template (rendered with Jinja2) that runs a "benchmark" container
# on every node matched by the nodeSelector below. The container's bash
# script installs the benchmark tooling and touches /tmp/pkb_ready once setup
# has finished.
apiVersion: apps/v1
kind: DaemonSet
metadata:
# Name, namespace and label value are all supplied by the template caller.
name: {{ ds_name }}
namespace: {{ ds_namespace }}
labels:
app: {{ ds_label }}
spec:
# The selector must match the pod template labels declared further down.
selector:
matchLabels:
app: {{ ds_label }}
template:
metadata:
labels:
app: {{ ds_label }}
spec:
# Share the node's PID and network namespaces with the pod.
hostPID: true
hostNetwork: true
# Pin to the benchmark nodepool — never schedule on the dummy default pool.
nodeSelector:
pkb_nodepool: {{ benchmark_nodepool }}
# A toleration with only "operator: Exists" matches every taint, so node
# taints do not keep the pod off the benchmark nodes.
tolerations:
- operator: Exists
containers:
- name: benchmark
image: {{ image }}
# Everything after the "|" marker is one literal bash script passed to
# "bash -c"; YAML performs no escape processing inside it.
command:
- bash
- -c
- |
echo "[pkb] Installing benchmark tools..."
# Install every apt package the later phases rely on, retrying up to 3 times
# because transient network failures are common on a freshly-started GKE
# node. Critical tools (fio, stress-ng) must be present before the ready
# sentinel is written; silently ignoring a failed install would let
# /tmp/pkb_ready appear with tools missing and break all subsequent phases.
#
# The package list is a bash array instead of a backslash-continued argument
# list. Inside a YAML literal block a doubled backslash reaches bash as an
# escaped literal backslash, so the newline after it ENDS the command rather
# than continuing it; an array needs no continuation characters at all.
PKB_APT_PKGS=(
  fio
  stress-ng
  sysstat
  cryptsetup
  mdadm
  redis-server
  redis-tools
  git
  wget
  curl
  make
  gcc
  bc
  flex
  bison
  libelf-dev
  libssl-dev
  cgroup-tools
  nvme-cli
  util-linux
  python3-pip
  libevent-dev
  libpcre3-dev
  zlib1g-dev
  build-essential
  autoconf
  automake
  libtool
  libtool-bin
  pkg-config
  python3-dev
  default-jre-headless
)
PKB_APT_OK=0
for _attempt in 1 2 3; do
  # Index refresh is best-effort; a stale index surfaces as an install
  # failure below and is retried together with it.
  apt-get update -qq 2>&1 || true
  # Only a successful install of the whole list marks the attempt as good.
  if DEBIAN_FRONTEND=noninteractive apt-get install -y -qq "${PKB_APT_PKGS[@]}" 2>&1; then
    PKB_APT_OK=1
    break
  fi
  echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
  sleep 15
done
# Belt and braces: even if apt reported success, refuse to continue unless
# the two critical binaries are really on PATH. A trailing "||" continues
# the condition onto the next line without any backslash.
if [ "$PKB_APT_OK" != "1" ] ||
  ! command -v fio >/dev/null 2>&1 ||
  ! command -v stress-ng >/dev/null 2>&1; then
  echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
  exit 1
fi
echo "[pkb] Installing memtier_benchmark from source..."
# Pin a stable release tag — building from the moving default branch (HEAD)
# intermittently broke (memtier_benchmark not found, so Phase 3a lost its
# P50/P90/P99 latency). 2.2.1 matches the version PKB's memtier package
# (memtier.MemtierResult.Parse) is validated against and builds cleanly with
# the apt dependencies installed earlier. HEAD is only a fallback for when
# the tagged clone fails.
#
# The build chain uses trailing "&&" / "||" operators rather than backslash
# continuations: a doubled backslash inside a YAML literal block is a literal
# backslash to bash and would cut the chain at every line end.
if ! command -v memtier_benchmark >/dev/null 2>&1; then
  PKB_MEMTIER_REPO="https://github.com/RedisLabs/memtier_benchmark.git"
  # The subshell keeps the directory changes local, and its combined output
  # goes to the build log so the pod log stays readable.
  if ! (
    cd /tmp &&
      rm -rf memtier_benchmark &&
      {
        git clone --depth 1 --branch 2.2.1 "$PKB_MEMTIER_REPO" ||
          git clone --depth 1 "$PKB_MEMTIER_REPO"
      } &&
      cd memtier_benchmark &&
      autoreconf -ivf &&
      ./configure &&
      make -j"$(nproc)" &&
      make install
  ) > /tmp/pkb_memtier_build.log 2>&1; then
    # Best-effort: a failed build is reported but never aborts the setup.
    echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
  fi
fi
# Log the version actually on PATH, whether freshly built or pre-existing.
if command -v memtier_benchmark >/dev/null 2>&1; then
  echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
fi
echo "[pkb] Installing esrally (lightweight)..."
# Best-effort pip install of esrally; all pip output is collected in
# /tmp/pkb_esrally_build.log. Upgrading pip/setuptools/wheel first is
# optional, so its failure is ignored.
python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
# First try installing esrally together with elastic-transport, then esrally
# alone; warn only when both attempts fail. The trailing "&&" continues the
# condition without a backslash (a doubled backslash inside a YAML literal
# block is a literal backslash to bash and would break the fallback chain).
if ! pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 &&
  ! pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1; then
  echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
fi
# pip can succeed without putting the entry point on PATH, so check the
# binary itself and report which path later phases will have to take.
if command -v esrally >/dev/null 2>&1; then
  echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
else
  echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
fi
echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
# Phase 3c needs a real search server on :9200. Nothing in apt ships one and
# the pod has no systemd, so install the OpenSearch bundle (it ships its own
# JDK) and let the phase launch the binary directly. Everything here is
# best-effort: if any step fails the phase probes the endpoint and skips
# cleanly rather than recording fake timings.
if [ ! -x /opt/opensearch/bin/opensearch ]; then
  OS_VER=2.15.0
  OS_URL="https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz"
  # Download, unpack and rename inside a subshell so the cd stays local.
  # Trailing "&&" operators chain the steps without backslash continuations
  # (a doubled backslash inside a YAML literal block is a literal backslash
  # to bash and would cut the chain at every line end).
  if ! (
    cd /opt &&
      wget -q --timeout=600 -O os.tgz "$OS_URL" &&
      tar -xzf os.tgz &&
      rm -f os.tgz &&
      mv "opensearch-$OS_VER" opensearch
  ) > /tmp/pkb_opensearch_build.log 2>&1; then
    echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
  fi
fi
if [ -x /opt/opensearch/bin/opensearch ]; then
  # pkbos owns and runs OpenSearch (it refuses to run as root).
  # Give it a home so HOME/temp paths are writable.
  id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
  # One printf argument per config line: the '%s\n' format emits a real
  # newline after each, so the file never contains a literal backslash-n.
  printf '%s\n' 'discovery.type: single-node' 'network.host: 127.0.0.1' 'plugins.security.disabled: true' > /opt/opensearch/config/opensearch.yml
  mkdir -p /opt/opensearch/config/jvm.options.d
  # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
  # On a 252 GB node this still leaves plenty of page cache to
  # pressure into swap during the phase.
  printf '%s\n' '-Xms2g' '-Xmx2g' > /opt/opensearch/config/jvm.options.d/pkb-heap.options
  # Raise the mmap-count sysctl for OpenSearch; failure is tolerated.
  sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
  # CRITICAL: never run the binary as root here (it bails and
  # leaves root-owned files in logs/ that block the pkbos server).
  # Clear any stale logs and chown everything to pkbos LAST.
  rm -f /opt/opensearch/logs/* 2>/dev/null || true
  chown -R pkbos /opt/opensearch 2>/dev/null || true
  echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
fi
echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
# Download and unpack the kernel source tree used by the Phase 3b build
# workload. Everything lives under /mnt/stateful_partition, which is a host
# path, so whatever is left there is seen again by later runs of this script
# on the same node. Both steps are best-effort and only warn on failure.
PKB_KVER="{{ kernel_version }}"
PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
# The kernel.org directory is named after the major version (v6.x for 6.1.55).
PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
mkdir -p "$PKB_KROOT"
if [ ! -f "$PKB_KTARBALL" ]; then
  # wget -O creates its output file even when the transfer fails, so fetch
  # into a ".part" file and rename only on success. Otherwise a truncated
  # tarball would pass the existence check above on every later run and the
  # download would never be retried.
  if wget -q --timeout=300 -O "$PKB_KTARBALL.part" "$PKB_KURL" 2>&1; then
    mv -f "$PKB_KTARBALL.part" "$PKB_KTARBALL"
  else
    rm -f "$PKB_KTARBALL.part"
    echo "[pkb] WARNING: kernel tarball download failed" >&2
  fi
fi
if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
  echo "[pkb] Extracting kernel source (xz)..."
  if ! tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1; then
    # Drop a half-extracted tree so the directory check above does not
    # mistake it for a complete one next time. ":?" aborts the expansion if
    # the variable is ever empty, guarding the rm -rf.
    rm -rf "${PKB_KSRC:?}"
    echo "[pkb] WARNING: kernel source extraction failed" >&2
  fi
fi
echo "[pkb] Unlocking container cgroup swap limits..."
# Why this is needed: GKE with cgroup v2 sets memory.swap.max=0 for every
# container, so the kernel cannot page this cgroup out to swap even when the
# node has a swap device and vm.swappiness>0. The visible symptom is
# stress-ng being OOM-killed after ~15s.
#
# History: an earlier version derived the cgroup path from /proc/self/cgroup.
# Inside a cgroup namespace that file reports "0::/", so the write hit the
# host ROOT cgroup, silently did nothing, and swap stayed locked. /sys here
# is the host cgroup tree (hostPath mount) and the pod is privileged, so the
# limit is lifted across the whole kubepods hierarchy instead, which is
# guaranteed to include this container.
if [ -d /sys/fs/cgroup/kubepods.slice ] || [ -d /sys/fs/cgroup/kubepods ]; then
  # cgroup v2: set every memory.swap.max below a kubepods path to 'max'.
  find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max 2>/dev/null |
    while read -r _swap_file; do
      echo max > "$_swap_file" 2>/dev/null || true
    done
fi
# Best-effort extras: the path reported for our own (namespaced) cgroup and
# the unified root. The awk filter picks the cgroup v2 entry, i.e. the line
# whose controller field is empty.
PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup 2>/dev/null)
for _own_file in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" /sys/fs/cgroup/memory.swap.max; do
  if [ -f "$_own_file" ]; then
    echo max > "$_own_file" 2>/dev/null || true
  fi
done
# cgroup v1 fallback: -1 removes the combined RAM+swap hard ceiling.
find /sys/fs/cgroup/memory -path '*kubepods*' -name memory.memsw.limit_in_bytes 2>/dev/null |
  while read -r _memsw_file; do
    echo -1 > "$_memsw_file" 2>/dev/null || true
  done
# Verification for the pod log: grep -L prints the files with no line
# starting with 'max', i.e. the ones still capping swap; only the first such
# file is reported.
PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max 2>/dev/null | xargs -r grep -L '^max' 2>/dev/null | head -1)
if [ -z "$PKB_STILL_CAPPED" ]; then
  echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
else
  echo "[pkb] WARNING: cgroup swap still capped at $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be OOM-killed before swap is exercised" >&2
fi
# Final step of the setup script: announce completion, create the ready
# sentinel, then park the container.
echo "[pkb] Tools installed. Writing ready sentinel."
# /tmp/pkb_ready marks setup as finished; the apt step exits before reaching
# this point when the critical tools are missing.
# NOTE(review): presumably the PKB harness polls for this file — confirm.
touch /tmp/pkb_ready
# Block forever so the container keeps running after setup.
sleep infinity
# Container security settings. The setup script relies on elevated access:
# it writes memory.swap.max files in the host cgroup tree under /sys and
# sets the vm.max_map_count sysctl.
securityContext:
privileged: true
capabilities:
# NOTE(review): a privileged container already holds every capability, so
# this explicit list looks redundant — confirm before removing.
add: ["SYS_ADMIN", "IPC_LOCK"]
resources:
# Only a memory request is declared; this spec sets no limits.
requests:
memory: "512Mi"
env:
# Expose the name of the node the pod runs on (downward API field ref).
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
# Host directories made visible inside the container; each name refers to an
# entry in the "volumes" list below.
volumeMounts:
- name: dev
mountPath: /dev
# Host /sys: the setup script edits cgroup files under /sys/fs/cgroup.
- name: sys
mountPath: /sys
- name: run
mountPath: /run
# Host /proc, read-only, kept apart from the container's own /proc.
# NOTE(review): the setup script in this file never reads /proc-host;
# presumably later benchmark phases do — confirm.
- name: proc-host
mountPath: /proc-host
readOnly: true
# The kernel source prefetch stores its tarball and extracted tree under
# /mnt/stateful_partition/pkb_kernel.
- name: stateful-partition
mountPath: /mnt/stateful_partition
- name: lib-modules
mountPath: /lib/modules
readOnly: true
# hostPath volumes backing the mounts above.
volumes:
- name: dev
hostPath:
path: /dev
- name: sys
hostPath:
path: /sys
- name: run
hostPath:
path: /run
- name: proc-host
hostPath:
path: /proc
- name: stateful-partition
hostPath:
path: /mnt/stateful_partition
# Created as an empty directory on the node if it does not exist yet.
type: DirectoryOrCreate
- name: lib-modules
hostPath:
path: /lib/modules
# Must already exist on the node.
type: Directory
Loading