diff --git a/roles/regluit_prod/tasks/cron.yml b/roles/regluit_prod/tasks/cron.yml
index 5be926d..3642a82 100644
--- a/roles/regluit_prod/tasks/cron.yml
+++ b/roles/regluit_prod/tasks/cron.yml
@@ -81,21 +81,7 @@
     path: /etc/cron.d/cleanup-apache-logs
     state: absent
 
-# DOAB OAI harvest — runs nightly on production only.
-# Matches doab-check's cadence (nightly load_doab) per Gluejar/regluit#1129
-# and Eric's direction (2026-04-23 meeting).
-#
-# Window: 3 days back -> open-ended. The 3-day overlap is intentional —
-# load_doab dedupes by record id, so re-fetching is cheap and resilient
-# to occasional missed nights (server reboot, transient OAI 5xx, etc.).
-# Output (incl. stderr) appended to /var/log/regluit/doab-harvest.log,
-# which is rotated by the existing "regluit-log-cleanup" task above.
-- name: DOAB harvest (nightly)
-  become: yes
-  ansible.builtin.cron:
-    name: "regluit-doab-harvest"
-    minute: "30"
-    hour: "4"
-    user: "{{ user_name }}"
-    job: 'cd {{ project_path }} && DJANGO_SETTINGS_MODULE={{ django_settings_module }} {{ project_path }}/{{ virtualenv_name }}/bin/django-admin load_doab "$(date -u -d ''3 days ago'' +\%Y-\%m-\%d)" --max=20000 >> /var/log/regluit/doab-harvest.log 2>&1'
-  when: deploy_type | default('') == 'prod'
+# The DOAB OAI harvest + backfill cron entries, the tasks that install their
+# wrapper scripts, and the shared flock that serialises them are all managed
+# from doab.yml (imported from main.yml). Kept together because the nightly
+# harvest and the one-off backfill must share a single host lock — see doab.yml.
diff --git a/roles/regluit_prod/tasks/doab.yml b/roles/regluit_prod/tasks/doab.yml
new file mode 100644
index 0000000..d272121
--- /dev/null
+++ b/roles/regluit_prod/tasks/doab.yml
@@ -0,0 +1,76 @@
+---
+# DOAB OAI orchestration — nightly harvest + one-off backfill.
+#
+# Both hit the same DOAB OAI endpoint. They must NEVER run concurrently
+# (slow-and-gentle: never double the request rate). Coordination is a single
+# host lock /var/lock/doab-oai.lock, acquired non-blocking (flock -n) inside
+# BOTH wrapper scripts: whichever is running, the other skips its tick.
+#
+# - Harvest (Gluejar/regluit#1129): nightly, 3-day rolling window.
+# - Backfill (Gluejar/regluit#1151): one-off ~20.6k catch-up, bounded per
+#   tick, resumable, halt-aware via .done/.halted markers. Drains over days.
+#
+# Production only (deploy_type == 'prod').
+#
+# CROSS-HOST INVARIANT: this flock is host-local. doab-check (separate DO
+# host) also drains a DOAB backfill against the same OAI endpoint. Only ONE
+# may be armed at a time. regluit is the active runner (auto via discovery);
+# doab-check's runner is inert until its operator explicitly sets IDS_FILE.
+# Do not arm doab-check's backfill until this one's .done marker is present.
+
+- name: Create DOAB backfill state directory
+  become: yes
+  ansible.builtin.file:
+    path: /var/lib/regluit/doab-backfill  # STATE_DIR in doab-backfill.sh.j2
+    state: directory
+    owner: "{{ user_name }}"
+    group: "www-data"
+    mode: "02775"  # leading 2 = setgid: entries created inside keep group www-data
+  when: deploy_type | default('') == 'prod'
+
+# /var/log/regluit is created in main.yml; doab-harvest.log /
+# doab-backfill.log land there and are rotated by the existing
+# "regluit-log-cleanup" task in cron.yml (30-day retention).
+
+- name: Install DOAB harvest wrapper script
+  become: yes
+  ansible.builtin.template:
+    src: doab-harvest.sh.j2
+    dest: "/home/{{ user_name }}/doab-harvest.sh"  # path used by the harvest cron job below
+    owner: "{{ user_name }}"
+    mode: "0755"
+  when: deploy_type | default('') == 'prod'
+
+- name: Install DOAB backfill runner script
+  become: yes
+  ansible.builtin.template:
+    src: doab-backfill.sh.j2
+    dest: "/home/{{ user_name }}/doab-backfill.sh"  # path used by the backfill cron job below
+    owner: "{{ user_name }}"
+    mode: "0755"
+  when: deploy_type | default('') == 'prod'
+
+- name: DOAB harvest (nightly, flock-serialised)
+  become: yes
+  ansible.builtin.cron:
+    name: "regluit-doab-harvest"  # same cron entry name as the inline job removed from cron.yml
+    minute: "30"
+    hour: "4"
+    user: "{{ user_name }}"
+    job: "/home/{{ user_name }}/doab-harvest.sh"  # the wrapper takes the DOAB-OAI flock itself
+  when: deploy_type | default('') == 'prod'
+
+# Backfill ticks every 30 min, deliberately offset from the 04:30 harvest.
+# ~20.6k records / ~500 per pass ≈ 40+ ticks ≈ ~1 day at this cadence —
+# the intended slow drip. The flock guarantees a tick that overlaps the
+# nightly harvest simply skips (and vice-versa). Self-disables via the
+# .done marker once drained; freezes via .halted on a circuit-breaker.
+- name: DOAB backfill (bounded pass every 30 min, flock-serialised)
+  become: yes
+  ansible.builtin.cron:
+    name: "regluit-doab-backfill"
+    minute: "15,45"  # 30-min cadence kept off :00/:30 so no tick starts with the 04:30 harvest
+    hour: "*"
+    user: "{{ user_name }}"
+    job: "/home/{{ user_name }}/doab-backfill.sh"  # the runner takes the DOAB-OAI flock itself
+  when: deploy_type | default('') == 'prod'
diff --git a/roles/regluit_prod/tasks/main.yml b/roles/regluit_prod/tasks/main.yml
index 3bd0954..f24dfe0 100644
--- a/roles/regluit_prod/tasks/main.yml
+++ b/roles/regluit_prod/tasks/main.yml
@@ -130,6 +130,9 @@
 - name: Run cron tasks
   import_tasks: cron.yml
 
+- name: Run DOAB orchestration tasks (harvest + backfill)
+  import_tasks: doab.yml
+
 - name: Run log management tasks
   import_tasks: log_management.yml
 
diff --git a/roles/regluit_prod/templates/doab-backfill.sh.j2 b/roles/regluit_prod/templates/doab-backfill.sh.j2
new file mode 100644
index 0000000..82cf7c0
--- /dev/null
+++ b/roles/regluit_prod/templates/doab-backfill.sh.j2
@@ -0,0 +1,76 @@
+#!/bin/bash
+# DOAB backfill runner — one bounded pass per cron tick.
+#
+# Managed by Ansible (roles/regluit_prod/templates/doab-backfill.sh.j2).
+# Drains the ~20.6k-record DOAB catch-up worklist (Gluejar/regluit#1151) by
+# invoking `backfill_doab` repeatedly, slowly, never concurrently with the
+# nightly DOAB harvest (shared flock), honoring the in-command circuit
+# breakers ACROSS ticks via marker files.
+#
+# Exit-code contract (set by the management command):
+#   0  drained / nothing-to-do -> write .done, stop running
+#   3  benign checkpoint       -> do nothing, cron re-fires next tick
+#   4  circuit-breaker halt    -> write .halted, freeze until operator clears
+#   *  unexpected              -> fail safe: write .halted
+#
+# NOTE: deliberately NOT `set -e` — we must inspect the command's rc.
+set -uo pipefail
+
+STATE_DIR=/var/lib/regluit/doab-backfill
+LOG=/var/log/regluit/doab-backfill.log
+LOCK=/var/lock/doab-oai.lock  # same path as LOCK in doab-harvest.sh.j2
+DONE="$STATE_DIR/.done"  # marker: worklist drained, later ticks exit immediately
+HALTED="$STATE_DIR/.halted"  # marker: halted, later ticks exit until an operator removes it
+STATE="$STATE_DIR/state.json"  # passed to backfill_doab via --state-file
+
+ts() { date -u +%Y-%m-%dT%H:%M:%SZ; }
+log() { echo "$(ts) [doab-backfill] $*" >> "$LOG"; }
+
+mkdir -p "$STATE_DIR" 2>/dev/null || true  # best-effort; errors are deliberately ignored
+
+if [ -f "$DONE" ]; then
+  log ".done present — worklist drained; nothing to do"
+  exit 0
+fi
+if [ -f "$HALTED" ]; then
+  log ".halted present — circuit broken; awaiting operator review of $STATE"
+  exit 0
+fi
+
+# Single DOAB-OAI client per host: this lock is ALSO held by the nightly
+# harvest wrapper. -n => if held, skip this tick (cron fires again later).
+exec 9>"$LOCK" || { log "cannot open lock $LOCK; skipping tick"; exit 0; }  # fd 9 stays open for the rest of the script
+if ! flock -n 9; then
+  log "DOAB-OAI lock held (harvest or prior backfill running); skipping tick"
+  exit 0
+fi
+
+log "starting bounded backfill pass"
+cd {{ project_path }} || { log "cd {{ project_path }} failed; skipping tick"; exit 0; }
+
+DJANGO_SETTINGS_MODULE={{ django_settings_module }} \
+  {{ project_path }}/{{ virtualenv_name }}/bin/django-admin backfill_doab \
+  --state-file "$STATE" >> "$LOG" 2>&1
+rc=$?  # must directly follow the command: any command in between would overwrite $?
+
+case "$rc" in
+  0)
+    touch "$DONE"
+    log "DRAINED (rc=0) -> wrote .done; backfill complete"
+    ;;
+  3)
+    log "checkpoint (rc=3) -> cron will re-fire next tick"
+    ;;
+  4)
+    touch "$HALTED"
+    log "HALT (rc=4) -> wrote .halted; OPERATOR REVIEW NEEDED ($STATE)"
+    ;;
+  *)
+    touch "$HALTED"
+    log "UNEXPECTED rc=$rc -> wrote .halted (fail-safe); OPERATOR REVIEW NEEDED"
+    ;;
+esac
+
+# Never exit non-zero: a failing cron job mails root, and Eric treats
+# error-email volume as a signal. Failure is expressed via .halted + log.
+exit 0
diff --git a/roles/regluit_prod/templates/doab-harvest.sh.j2 b/roles/regluit_prod/templates/doab-harvest.sh.j2
new file mode 100644
index 0000000..6ad936e
--- /dev/null
+++ b/roles/regluit_prod/templates/doab-harvest.sh.j2
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Nightly DOAB OAI harvest (Gluejar/regluit#1129) — wrapped so it shares the
+# DOAB-OAI host lock with the backfill runner.
+#
+# Managed by Ansible (roles/regluit_prod/templates/doab-harvest.sh.j2).
+# Behaviour is unchanged from the original inline #31 cron except that it
+# now runs under `flock -n /var/lock/doab-oai.lock`, guaranteeing at most
+# one DOAB-OAI client process per host (slow-and-gentle: never double the
+# request rate by running harvest + backfill simultaneously).
+#
+# Window: 3 days back -> open-ended. load_doab dedupes by record id, so the
+# overlap is cheap and resilient to an occasional skipped night.
+#
+# Retry-After: this wrapper does NOT re-check the shared sentinel — the
+# `load_doab` management command reads it natively at the top of handle()
+# (read_block_deadline -> "SKIP: DOAB OAI rate-limited ..." -> return) using
+# the exact same race-free helpers as backfill. Keeping the wrapper thin
+# makes the command the single source of sentinel truth; duplicating the
+# check here would risk the two drifting.
+set -uo pipefail  # no -e: every failure path below logs and exits 0 explicitly
+
+LOG=/var/log/regluit/doab-harvest.log
+LOCK=/var/lock/doab-oai.lock  # same path as LOCK in doab-backfill.sh.j2
+
+ts() { date -u +%Y-%m-%dT%H:%M:%SZ; }
+
+exec 9>"$LOCK" || { echo "$(ts) [doab-harvest] cannot open lock $LOCK" >> "$LOG"; exit 0; }  # fd 9 stays open for the rest of the script
+if ! flock -n 9; then
+  # Backfill (or a prior harvest) is using the OAI endpoint. Skip tonight;
+  # the 3-day rolling window self-heals the gap on the next clear night.
+  echo "$(ts) [doab-harvest] DOAB-OAI lock held; skipping this run" >> "$LOG"
+  exit 0
+fi
+
+cd {{ project_path }} || { echo "$(ts) [doab-harvest] cd failed" >> "$LOG"; exit 0; }
+FROM="$(date -u -d '3 days ago' +%Y-%m-%d)"  # start of the rolling window, as a UTC date
+echo "$(ts) [doab-harvest] starting load_doab from $FROM" >> "$LOG"
+DJANGO_SETTINGS_MODULE={{ django_settings_module }} \
+  {{ project_path }}/{{ virtualenv_name }}/bin/django-admin load_doab "$FROM" --max=20000 >> "$LOG" 2>&1
+rc=$?  # capture now: the "$(ts)" substitution on the next line would overwrite $?
+echo "$(ts) [doab-harvest] load_doab exited rc=$rc" >> "$LOG"
+exit 0