Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 4 additions & 18 deletions roles/regluit_prod/tasks/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,7 @@
path: /etc/cron.d/cleanup-apache-logs
state: absent

# DOAB OAI harvest — runs nightly on production only.
# Matches doab-check's cadence (nightly load_doab) per Gluejar/regluit#1129
# and Eric's direction (2026-04-23 meeting).
#
# Window: 3 days back -> open-ended. The 3-day overlap is intentional —
# load_doab dedupes by record id, so re-fetching is cheap and resilient
# to occasional missed nights (server reboot, transient OAI 5xx, etc.).
# Output (incl. stderr) appended to /var/log/regluit/doab-harvest.log,
# which is rotated by the existing "regluit-log-cleanup" task above.
- name: DOAB harvest (nightly)
  when: deploy_type | default('') == 'prod'
  become: true
  ansible.builtin.cron:
    name: "regluit-doab-harvest"
    user: "{{ user_name }}"
    hour: "4"
    minute: "30"
    # Folded scalar: the lines below are joined with single spaces into one
    # crontab command. `%` is written as `\%` because cron treats a bare `%`
    # in the command field as a newline.
    job: >-
      cd {{ project_path }} &&
      DJANGO_SETTINGS_MODULE={{ django_settings_module }}
      {{ project_path }}/{{ virtualenv_name }}/bin/django-admin load_doab
      "$(date -u -d '3 days ago' +\%Y-\%m-\%d)" --max=20000
      >> /var/log/regluit/doab-harvest.log 2>&1
# The DOAB OAI harvest + backfill cron entries, the tasks that install their
# wrapper scripts, and the shared flock that serialises them are all owned by
# doab.yml (imported from main.yml). Kept together because the nightly harvest
# and the one-off backfill must share a single host lock — see doab.yml.
76 changes: 76 additions & 0 deletions roles/regluit_prod/tasks/doab.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
---
# DOAB OAI orchestration — nightly harvest + one-off backfill.
#
# Both hit the same DOAB OAI endpoint. They must NEVER run concurrently
# (slow-and-gentle: never double the request rate). Coordination is a single
# host lock /var/lock/doab-oai.lock, acquired non-blocking (flock -n) inside
# BOTH wrapper scripts: whichever is running, the other skips its tick.
#
# - Harvest (Gluejar/regluit#1129): nightly, 3-day rolling window.
# - Backfill (Gluejar/regluit#1151): one-off ~20.6k catch-up, bounded per
# tick, resumable, halt-aware via .done/.halted markers. Drains over days.
#
# Production only (deploy_type == 'prod').
#
# CROSS-HOST INVARIANT: this flock is host-local. doab-check (separate DO
# host) also drains a DOAB backfill against the same OAI endpoint. Only ONE
# may be armed at a time. regluit is the active runner (auto via discovery);
# doab-check's runner is inert until its operator explicitly sets IDS_FILE.
# Do not arm doab-check's backfill until this one's .done marker is present.

# Directory used by the backfill runner for its state file and marker files.
# Mode 02775 = setgid + rwxrwxr-x, so entries created inside inherit the
# directory's group.
- name: Create DOAB backfill state directory
  when: deploy_type | default('') == 'prod'
  become: true
  ansible.builtin.file:
    state: directory
    path: /var/lib/regluit/doab-backfill
    mode: "02775"
    owner: "{{ user_name }}"
    group: www-data

# /var/log/regluit is created in main.yml; doab-harvest.log /
# doab-backfill.log land there and are rotated by the existing
# "regluit-log-cleanup" task in cron.yml (30-day retention).

# Renders the harvest wrapper into the service user's home directory; the
# nightly harvest cron entry invokes it from that path.
- name: Install DOAB harvest wrapper script
  when: deploy_type | default('') == 'prod'
  become: true
  ansible.builtin.template:
    dest: "/home/{{ user_name }}/doab-harvest.sh"
    src: doab-harvest.sh.j2
    mode: "0755"
    owner: "{{ user_name }}"

# Renders the backfill runner into the service user's home directory; the
# backfill cron entry invokes it from that path.
- name: Install DOAB backfill runner script
  when: deploy_type | default('') == 'prod'
  become: true
  ansible.builtin.template:
    dest: "/home/{{ user_name }}/doab-backfill.sh"
    src: doab-backfill.sh.j2
    mode: "0755"
    owner: "{{ user_name }}"

# Crontab entry for the service user: runs the harvest wrapper once a day at
# 04:30. Locking and logging are handled by the wrapper script itself.
- name: DOAB harvest (nightly, flock-serialised)
  when: deploy_type | default('') == 'prod'
  become: true
  ansible.builtin.cron:
    name: "regluit-doab-harvest"
    user: "{{ user_name }}"
    hour: "4"
    minute: "30"
    job: "/home/{{ user_name }}/doab-harvest.sh"

# Backfill ticks every 30 min at :15 and :45, deliberately offset from the
# 04:30 harvest so the two jobs never start in the same minute and race for
# the lock. (A ":00,:30" schedule would fire at exactly 04:30 as well.)
# ~20.6k records / ~500 per pass ≈ 40+ ticks ≈ ~1 day at this cadence —
# the intended slow drip. The offset only removes the same-minute race: a
# backfill pass still running at 04:30 keeps the lock, and the flock
# guarantees the harvest simply skips that night (and vice-versa).
# Self-disables via the .done marker once drained; freezes via .halted on a
# circuit-breaker.
- name: DOAB backfill (bounded pass every 30 min, flock-serialised)
  become: yes
  ansible.builtin.cron:
    # Same cron `name` as before, so the existing crontab entry is updated
    # in place rather than duplicated.
    name: "regluit-doab-backfill"
    minute: "15,45"
    hour: "*"
    user: "{{ user_name }}"
    job: "/home/{{ user_name }}/doab-backfill.sh"
  when: deploy_type | default('') == 'prod'
3 changes: 3 additions & 0 deletions roles/regluit_prod/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@
# General cron entries for the role (cron.yml).
- name: Run cron tasks
  import_tasks: cron.yml

# DOAB OAI harvest + backfill: wrapper scripts and their cron entries
# (doab.yml).
- name: Run DOAB orchestration tasks (harvest + backfill)
  import_tasks: doab.yml

# Log management tasks (log_management.yml).
- name: Run log management tasks
  import_tasks: log_management.yml

Expand Down
76 changes: 76 additions & 0 deletions roles/regluit_prod/templates/doab-backfill.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash
# DOAB backfill runner — one bounded pass per cron tick.
#
# Managed by Ansible (roles/regluit_prod/templates/doab-backfill.sh.j2).
# Drains the ~20.6k-record DOAB catch-up worklist (Gluejar/regluit#1151) by
# invoking `backfill_doab` repeatedly, slowly, never concurrently with the
# nightly DOAB harvest (shared flock), honoring the in-command circuit
# breakers ACROSS ticks via marker files.
#
# Exit-code contract (set by the management command):
#   0  drained / nothing-to-do -> write .done, stop running
#   3  benign checkpoint       -> do nothing, cron re-fires next tick
#   4  circuit-breaker halt    -> write .halted, freeze until operator clears
#   *  unexpected              -> fail safe: write .halted
#
# Marker writes are checked: if a marker cannot be created the log says so
# explicitly instead of claiming it was written, because a missing .halted
# means the next tick WILL run again.
#
# NOTE: deliberately NOT `set -e` — we must inspect the command's rc.
set -uo pipefail

STATE_DIR=/var/lib/regluit/doab-backfill
LOG=/var/log/regluit/doab-backfill.log
LOCK=/var/lock/doab-oai.lock
DONE="$STATE_DIR/.done"
HALTED="$STATE_DIR/.halted"
STATE="$STATE_DIR/state.json"

# ts: current UTC time in ISO-8601 form, used as the log-line prefix.
ts() { date -u +%Y-%m-%dT%H:%M:%SZ; }
# log MESSAGE...: append one timestamped line to the backfill log.
log() { echo "$(ts) [doab-backfill] $*" >> "$LOG"; }

# write_marker PATH: create the marker file at PATH.
# Returns 0 on success. On failure, touch's own error text is appended to the
# log, an ERROR line is logged, and 1 is returned so the caller can report
# the outcome truthfully.
write_marker() {
    if touch "$1" 2>> "$LOG"; then
        return 0
    fi
    log "ERROR: could not create marker $1 (check permissions on $STATE_DIR)"
    return 1
}

# Best-effort: the directory is normally provisioned by Ansible.
mkdir -p "$STATE_DIR" 2>/dev/null || true

# Marker checks come before the lock so a finished or halted backfill never
# contends with the harvest.
if [ -f "$DONE" ]; then
    log ".done present — worklist drained; nothing to do"
    exit 0
fi
if [ -f "$HALTED" ]; then
    log ".halted present — circuit broken; awaiting operator review of $STATE"
    exit 0
fi

# Single DOAB-OAI client per host: this lock is ALSO held by the nightly
# harvest wrapper. -n => if held, skip this tick (cron fires again later).
# fd 9 stays open for the life of the script, so the lock is released
# automatically when the script exits.
exec 9>"$LOCK" || { log "cannot open lock $LOCK; skipping tick"; exit 0; }
if ! flock -n 9; then
    log "DOAB-OAI lock held (harvest or prior backfill running); skipping tick"
    exit 0
fi

log "starting bounded backfill pass"
cd {{ project_path }} || { log "cd {{ project_path }} failed; skipping tick"; exit 0; }

DJANGO_SETTINGS_MODULE={{ django_settings_module }} \
    {{ project_path }}/{{ virtualenv_name }}/bin/django-admin backfill_doab \
    --state-file "$STATE" >> "$LOG" 2>&1
# Must be captured immediately: any later command (including the $(ts)
# substitution inside log) overwrites $?.
rc=$?

case "$rc" in
    0)
        if write_marker "$DONE"; then
            log "DRAINED (rc=0) -> wrote .done; backfill complete"
        else
            log "DRAINED (rc=0) but .done NOT written; next tick will run again"
        fi
        ;;
    3)
        log "checkpoint (rc=3) -> cron will re-fire next tick"
        ;;
    4)
        if write_marker "$HALTED"; then
            log "HALT (rc=4) -> wrote .halted; OPERATOR REVIEW NEEDED ($STATE)"
        else
            log "HALT (rc=4) but .halted NOT written; next tick will run again; OPERATOR REVIEW NEEDED ($STATE)"
        fi
        ;;
    *)
        if write_marker "$HALTED"; then
            log "UNEXPECTED rc=$rc -> wrote .halted (fail-safe); OPERATOR REVIEW NEEDED"
        else
            log "UNEXPECTED rc=$rc but .halted NOT written; next tick will run again; OPERATOR REVIEW NEEDED"
        fi
        ;;
esac

# Never exit non-zero: a failing cron job mails root, and Eric treats
# error-email volume as a signal. Failure is expressed via .halted + log.
exit 0
42 changes: 42 additions & 0 deletions roles/regluit_prod/templates/doab-harvest.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# Nightly DOAB OAI harvest (Gluejar/regluit#1129) — wrapped so it shares the
# DOAB-OAI host lock with the backfill runner.
#
# Managed by Ansible (roles/regluit_prod/templates/doab-harvest.sh.j2).
# Behaviour is unchanged from the original inline #31 cron except that it
# now runs under `flock -n /var/lock/doab-oai.lock`, guaranteeing at most
# one DOAB-OAI client process per host (slow-and-gentle: never double the
# request rate by running harvest + backfill simultaneously).
#
# Window: 3 days back -> open-ended. load_doab dedupes by record id, so the
# overlap is cheap and resilient to an occasional skipped night.
#
# Retry-After: this wrapper does NOT re-check the shared sentinel — the
# `load_doab` management command reads it natively at the top of handle()
# (read_block_deadline -> "SKIP: DOAB OAI rate-limited ..." -> return) using
# the exact same race-free helpers as backfill. Keeping the wrapper thin
# makes the command the single source of sentinel truth; duplicating the
# check here would risk the two drifting.
#
# Always exits 0; the outcome of load_doab is recorded in the log only.
# NOTE: deliberately NOT `set -e` — the command's rc is captured and logged.
set -uo pipefail

LOG=/var/log/regluit/doab-harvest.log
LOCK=/var/lock/doab-oai.lock

# ts: current UTC time in ISO-8601 form, used as the log-line prefix.
ts() { date -u +%Y-%m-%dT%H:%M:%SZ; }
# log MESSAGE...: append one timestamped line to the harvest log.
log() { echo "$(ts) [doab-harvest] $*" >> "$LOG"; }

# fd 9 stays open for the life of the script, so the lock is released
# automatically when the script exits.
exec 9>"$LOCK" || { log "cannot open lock $LOCK"; exit 0; }
if ! flock -n 9; then
    # Backfill (or a prior harvest) is using the OAI endpoint. Skip tonight;
    # the 3-day rolling window self-heals the gap on the next clear night.
    log "DOAB-OAI lock held; skipping this run"
    exit 0
fi

cd {{ project_path }} || { log "cd failed"; exit 0; }
FROM="$(date -u -d '3 days ago' +%Y-%m-%d)"
log "starting load_doab from $FROM"
DJANGO_SETTINGS_MODULE={{ django_settings_module }} \
    {{ project_path }}/{{ virtualenv_name }}/bin/django-admin load_doab \
    "$FROM" --max=20000 >> "$LOG" 2>&1
# Capture the status immediately. Expanding "$(ts) ... rc=$?" in one string
# runs the $(ts) command substitution first, which resets $? to date's exit
# status (0), so the real load_doab rc would never be logged.
rc=$?
log "load_doab exited rc=$rc"
exit 0