From 8317d67a8e57fcd86063e9ade2f40261af1d7ea2 Mon Sep 17 00:00:00 2001 From: sysangels | Michal Koeckeis-Fresel Date: Sat, 14 Mar 2026 12:34:55 +0100 Subject: [PATCH] LE renewal: on config file errors reissue the cert #3324 LE renewal: on config file errors reissue the cert #3324 --- .../core/letsencrypt/jobs/certbot-new.py | 143 ++++++++++++----- .../core/letsencrypt/jobs/certbot-renew.py | 24 ++- .../letsencrypt/jobs/letsencrypt_utils.py | 144 ++++++++++++++++++ 3 files changed, 276 insertions(+), 35 deletions(-) diff --git a/src/common/core/letsencrypt/jobs/certbot-new.py b/src/common/core/letsencrypt/jobs/certbot-new.py index 006ad1c3aa..07499a2408 100644 --- a/src/common/core/letsencrypt/jobs/certbot-new.py +++ b/src/common/core/letsencrypt/jobs/certbot-new.py @@ -80,6 +80,8 @@ build_certbot_env, get_expected_acme_directory, prepare_logs_dir, + remove_empty_renewal_configs, + repair_live_symlinks, resolve_certbot_entrypoint, ) @@ -93,6 +95,11 @@ RUNNING_CERTBOT = 0 MONITOR_STOP = Event() MONITOR_THREAD: Optional[Thread] = None +# NOTE: certbot-new is executed as a script in normal operation, but certbot-renew +# imports it (via importlib) to reuse issuance logic when it needs to reissue after +# repairing a broken renewal state. To keep imports side-effect free, script execution +# is gated under __main__ and helpers accept an optional job instance. +JOB = None # Set by main(); reissue_for_removed_configs passes job explicitly when called from certbot-renew def _monitor_certbot_progress() -> None: @@ -631,6 +638,61 @@ def _determine_wildcard_bases(labels_list: List[List[str]]) -> Set[str]: return bases +def load_services() -> Dict[str, Dict[str, Union[str, bool, int, Dict[str, str]]]]: + """Build the services dict from SERVER_NAME and per-service env. Used by main() and reissue_for_removed_configs().""" + server_names = getenv("SERVER_NAME", "www.example.com").strip() + if not server_names: + return {} + services: Dict[str, Dict[str, Union[str, bool, int, Dict[str, str]]]] = {} + for service in server_names.split(): + if not service.strip(): + continue + for cert_name, config in build_service_entries(service).items(): + if cert_name in services: + if certificate_fingerprint(services[cert_name]) == certificate_fingerprint(config): + merged = normalize_server_names(services[cert_name]["server_names"]) | normalize_server_names(config["server_names"]) + services[cert_name]["server_names"] = format_server_names(merged) + continue + LOGGER.warning(f"[Service: {cert_name}] Certificate configuration conflict detected, keeping first occurrence.") + continue + services[cert_name] = config + return services + + +def reissue_for_removed_configs( + removed_cert_names: List[str], + job, + cmd_env: Dict[str, str], + logger, +) -> bool: + """ + Reissue certificates for services whose renewal config was removed (empty or broken). + Uses service definitions from env (load_services), not the failing config file. + Called from certbot-renew when certbot-new runs only once at init. + """ + services = load_services() + if not services: + return True + to_reissue: List[Tuple[str, Dict[str, Union[str, bool, int, Dict[str, str]]]]] = [] + for cert_name in removed_cert_names: + # Renewal filenames can include certbot profile suffixes (-ecdsa/-rsa). We resolve both + # the stripped and unstripped variants to find the correct service key to reissue. + service_name = ( + cert_name[:-6] if cert_name.endswith("-ecdsa") else (cert_name[:-4] if cert_name.endswith("-rsa") else cert_name) + ) + key = service_name if service_name in services else (cert_name if cert_name in services else None) + if key is not None and services[key].get("activated"): + to_reissue.append((key, services[key])) + if not to_reissue: + return True + for service_name, config in to_reissue: + config["force_renew"] = True + logger.info(f"Reissuing certificate for {service_name} (broken renewal config was removed, using service definition).") + if not generate_certificate(service_name, config, cmd_env, job=job): + return False + return True + + def certbot_delete(service: str, cmd_env: Dict[str, str] = None) -> int: # * Building the certbot command command = [ @@ -671,6 +733,7 @@ def certbot_new( config: Dict[str, Union[str, bool, int, Dict[str, str]]], cmd_env: Dict[str, str], paths: CertbotPaths, + job=None, ) -> int: certbot_entrypoint = resolve_certbot_entrypoint( str(config.get("acme_server") or "letsencrypt"), @@ -745,7 +808,7 @@ def certbot_new( credentials_file = f"credentials_{bytes_hash(credentials, algorithm='sha256')[:12]}.{config['provider'].get_file_type()}" credentials_path = CACHE_PATH.joinpath(credentials_file) if not credentials_path.is_file() or config["force_renew"]: - JOB.cache_file(credentials_file, credentials) + (job or JOB).cache_file(credentials_file, credentials) credentials_path.chmod(0o600) # Set permissions to read/write for owner only # * Adding the credentials to the command @@ -793,7 +856,12 @@ def certbot_new( return process.returncode -def generate_certificate(service: str, config: Dict[str, Union[str, bool, int, Dict[str, str]]], cmd_env: Dict[str, str]) -> bool: +def generate_certificate( + service: str, + config: Dict[str, Union[str, bool, int, Dict[str, str]]], + cmd_env: Dict[str, str], + job=None, +) -> bool: LOGGER.info( f"Asking{' wildcard' if config['wildcard'] else ''} certificates for domain(s) : {config['server_names']} (email = {config['email'] or 'not provided'}){' using staging' if config['staging'] else ''} with {config['challenge']} challenge, using {config['profile']!r} profile on {config['acme_server']}..." ) @@ -819,7 +887,7 @@ def generate_certificate(service: str, config: Dict[str, Union[str, bool, int, D global RUNNING_CERTBOT RUNNING_CERTBOT += 1 try: - ret = certbot_new(service, config, cmd_env.copy(), paths) + ret = certbot_new(service, config, cmd_env.copy(), paths, job=job) finally: with RUNNING_LOCK: RUNNING_CERTBOT -= 1 @@ -835,37 +903,28 @@ def generate_certificate(service: str, config: Dict[str, Union[str, bool, int, D return False -try: - # ? Load services configuration - server_names = getenv("SERVER_NAME", "www.example.com").strip() - - if not server_names: +def main() -> int: + status = 0 + services = load_services() + if not services: LOGGER.warning("No services found, exiting...") - sys_exit(0) - - services = {} - for service in server_names.split(): - if not service.strip(): - continue - - for cert_name, config in build_service_entries(service).items(): - if cert_name in services: - if certificate_fingerprint(services[cert_name]) == certificate_fingerprint(config): - merged = normalize_server_names(services[cert_name]["server_names"]) | normalize_server_names(config["server_names"]) - services[cert_name]["server_names"] = format_server_names(merged) - continue - LOGGER.warning(f"[Service: {cert_name}] Certificate configuration conflict detected, keeping first occurrence.") - continue - services[cert_name] = config - + return 0 if not any(service["activated"] for service in services.values()): LOGGER.info("No services uses Let's Encrypt, skipping generation of new certificates...") - sys_exit(0) + return 0 prepare_logs_dir(LOGS_DIR, LOGGER) + global JOB JOB = Job(LOGGER, __file__.replace("new", "renew")) + # Repair live/ symlinks first, then remove broken renewal configs (empty, unparseable, or missing required file reference). + # Reissue uses service definitions from env (load_services), not the failing config file. + removed_empty_configs: List[str] = [] + if DATA_PATH.is_dir(): + repair_live_symlinks(DATA_PATH, LOGGER) + removed_empty_configs = remove_empty_renewal_configs(DATA_PATH, LOGGER) + # ? Fetch existing certificates cmd_env = build_certbot_env(JOB, DEPS_PATH) @@ -948,6 +1007,16 @@ def generate_certificate(service: str, config: Dict[str, Union[str, bool, int, D zerossl_api_key_hashes = load_zerossl_api_key_hashes() LOGGER_CERTBOT.debug(f"Stored ZeroSSL API key hashes: {list(zerossl_api_key_hashes.keys())}") + # If we removed broken renewal configs, force reissue for the affected services from service definition (cert_name can have -ecdsa/-rsa suffix) + for cert_name in removed_empty_configs: + service_name = ( + cert_name[:-6] if cert_name.endswith("-ecdsa") else (cert_name[:-4] if cert_name.endswith("-rsa") else cert_name) + ) + key = service_name if service_name in services else (cert_name if cert_name in services else None) + if key is not None and services[key].get("activated"): + services[key]["force_renew"] = True + LOGGER.info(f"Service {key}: broken renewal config was removed, will reissue certificate from service definition.") + # ? Check if the services' certificates already exist for server_name, config in services.items(): if config["force_renew"] or not config["activated"] or server_name not in existing_certificates: @@ -1107,11 +1176,17 @@ def generate_certificate(service: str, config: Dict[str, Union[str, bool, int, D LOGGER.error(f"Error while saving data to db cache : {err}") else: LOGGER.info("Successfully saved data to db cache") -except SystemExit as e: - status = e.code -except BaseException as e: - status = 2 - LOGGER.debug(format_exc()) - LOGGER.error(f"Exception while running certbot-new.py :\n{e}") - -sys_exit(status) + return status + + +if __name__ == "__main__": + status = 0 + try: + status = main() + except SystemExit as e: + status = e.code if e.code is not None else 0 + except BaseException as e: + status = 2 + LOGGER.debug(format_exc()) + LOGGER.error(f"Exception while running certbot-new.py :\n{e}") + sys_exit(status) diff --git a/src/common/core/letsencrypt/jobs/certbot-renew.py b/src/common/core/letsencrypt/jobs/certbot-renew.py index e4430953fc..57db804570 100644 --- a/src/common/core/letsencrypt/jobs/certbot-renew.py +++ b/src/common/core/letsencrypt/jobs/certbot-renew.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 +import importlib.util from os import getenv, sep -from os.path import join +from os.path import dirname, join +from pathlib import Path from subprocess import DEVNULL, PIPE, Popen from sys import exit as sys_exit, path as sys_path from traceback import format_exc @@ -22,6 +24,8 @@ build_certbot_env, is_zerossl_used_in_env, prepare_logs_dir, + remove_empty_renewal_configs, + repair_live_symlinks, resolve_certbot_entrypoint, ) @@ -50,7 +54,25 @@ JOB = Job(LOGGER, __file__) + # Repair live/ symlinks first (avoids certbot "expected to be a symlink"), then remove broken renewal configs. + # Broken = empty, unparseable, or missing required file reference. Reissue uses service definitions from env. + removed_empty_configs: list = [] + if DATA_PATH.is_dir(): + repair_live_symlinks(DATA_PATH, LOGGER) + removed_empty_configs = remove_empty_renewal_configs(DATA_PATH, LOGGER) + cmd_env = build_certbot_env(JOB, DEPS_PATH) + + # certbot-new is scheduled as "every: once" (init-only), while certbot-renew is periodic. + # If we removed a broken renewal config, trigger reissue now using service definitions from env (not the bad file). + if removed_empty_configs: + certbot_new_path = Path(dirname(__file__)).resolve() / "certbot-new.py" + spec = importlib.util.spec_from_file_location("certbot_new", certbot_new_path) + certbot_new_mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(certbot_new_mod) + if not certbot_new_mod.reissue_for_removed_configs(removed_empty_configs, JOB, cmd_env, LOGGER): + status = 2 + LOGGER.error("Reissue after removing broken renewal config(s) failed") acme_server = "zerossl" if is_zerossl_used_in_env() else "letsencrypt" certbot_entrypoint = resolve_certbot_entrypoint( acme_server, diff --git a/src/common/core/letsencrypt/jobs/letsencrypt_utils.py b/src/common/core/letsencrypt/jobs/letsencrypt_utils.py index a0ffaa0c45..cd999c9add 100644 --- a/src/common/core/letsencrypt/jobs/letsencrypt_utils.py +++ b/src/common/core/letsencrypt/jobs/letsencrypt_utils.py @@ -1,3 +1,4 @@ +from configparser import ConfigParser from os import W_OK, X_OK, access, environ, getenv, sep, umask from os.path import join from pathlib import Path @@ -129,3 +130,146 @@ def get_expected_acme_directory(server: str, staging: bool) -> str: if staging: return LETSENCRYPT_STAGING_DIRECTORY return LETSENCRYPT_PRODUCTION_DIRECTORY + + +def _renewal_config_is_broken(conf: Path) -> bool: + """ + Return True if the renewal config is empty, unparseable, or missing required file references + so certbot would fail on it (parsefail / "missing a required file reference"). + Reissue does not use this file; callers use service definitions from env (load_services) instead. + """ + try: + if conf.stat().st_size == 0: + return True + parser = ConfigParser(interpolation=None) # paths may contain %; avoid InterpolationError + read_ok = parser.read(str(conf), encoding="utf-8") + if not read_ok or len(parser.sections()) == 0: + return True + # Certbot requires at least one section with cert/fullchain/privkey pointing into archive/ + has_required_ref = False + for section in parser.sections(): + for key in ("cert", "fullchain", "privkey"): + if parser.has_option(section, key): + val = parser.get(section, key).strip() + if val and "archive" in val: + has_required_ref = True + break + if has_required_ref: + break + if not has_required_ref: + return True + # Certbot also fails with "expected .../live//cert.pem to be a symlink" when live/ files are regular files. + # Treat as broken so we remove the config and reissue from service definition (repair may have skipped if archive missing). + etc_path = conf.parent.parent # renewal/ -> etc/ + live_lineage = etc_path.joinpath("live", conf.stem) + for name in ("cert.pem", "fullchain.pem", "privkey.pem"): + f = live_lineage.joinpath(name) + if f.exists() and not f.is_symlink(): + return True + return False + except Exception: + # Any read/parse error (e.g. InterpolationError, bad INI) = broken; treat as such and let caller remove/reissue. + return True + + +def remove_empty_renewal_configs(etc_path: Path, logger) -> List[str]: + """ + Remove broken (0-byte or unparseable) renewal config files so certbot renew does not fail. + Returns the list of removed cert names (filename without .conf) so callers can trigger reissue. + Reissue uses service definitions from env (load_services), not the failing config file. + Cert names can have an -ecdsa or -rsa suffix (e.g. www.example.com-ecdsa). + + Why this exists: after a DB restore or corruption, renewal configs can be empty or invalid INI. + Certbot aborts the whole renewal run on one bad file. We remove broken configs and reissue + using the service configuration so the certificate is recreated without relying on the bad file. + """ + removed: List[str] = [] + renewal_dir = etc_path.joinpath("renewal") + if not renewal_dir.is_dir(): + return removed + for conf in renewal_dir.glob("*.conf"): + try: + try: + is_broken = _renewal_config_is_broken(conf) + except Exception as e: + logger.debug(f"Error checking renewal config {conf.name}: {e}, treating as broken.") + is_broken = True + if not is_broken: + continue + cert_name = conf.stem + conf.unlink() + removed.append(cert_name) + logger.info(f"Removed broken renewal config {conf.name}; will reissue from service definition.") + except OSError as e: + logger.debug(f"Could not remove renewal config {conf}: {e}") + return removed + + +def repair_live_symlinks(etc_path: Path, logger) -> None: + """ + Ensure live//cert.pem, fullchain.pem, privkey.pem (and chain.pem) are symlinks + into archive/. After restore from DB, tar extraction may have created regular files + instead of symlinks, which breaks certbot renew. + can have an -ecdsa or -rsa suffix (e.g. www.example.com-ecdsa). + + Why this exists: when Let's Encrypt state is restored from a tar archive, symlinks may be + materialized as regular files. Certbot expects live/ files to be symlinks into archive/. + """ + live_dir = etc_path.joinpath("live") + archive_dir = etc_path.joinpath("archive") + renewal_dir = etc_path.joinpath("renewal") + if not live_dir.is_dir() or not archive_dir.is_dir(): + return + for live_domain in live_dir.iterdir(): + if not live_domain.is_dir(): + continue + domain = live_domain.name + archive_domain = archive_dir.joinpath(domain) + if not archive_domain.is_dir(): + continue + # Resolve target filenames (cert1.pem, fullchain1.pem, etc.) from renewal config or default to 1 + targets: Dict[str, str] = {} + renewal_conf = renewal_dir.joinpath(f"{domain}.conf") + if renewal_conf.is_file(): + try: + parser = ConfigParser(interpolation=None) # paths may contain % + parser.read(str(renewal_conf), encoding="utf-8") + for section in parser.sections(): + for key in ("cert", "privkey", "fullchain", "chain"): + if parser.has_option(section, key): + val = parser.get(section, key).strip() + if val and "archive" in val: + # path is relative to config file (renewal/), e.g. ../../archive/domain/cert1.pem + targets[key] = val + break + if targets: + break + except Exception as e: + logger.debug(f"Could not parse renewal config {renewal_conf}: {e}") + # Fallback: use cert1.pem, fullchain1.pem, privkey1.pem, chain1.pem + for key, default in (("cert", "cert1.pem"), ("privkey", "privkey1.pem"), ("fullchain", "fullchain1.pem"), ("chain", "chain1.pem")): + if key not in targets: + if key == "chain" and not archive_domain.joinpath("chain1.pem").exists(): + continue + targets[key] = f"../../archive/{domain}/{default}" + for live_name, archive_rel in (("cert.pem", targets.get("cert")), ("fullchain.pem", targets.get("fullchain")), ("privkey.pem", targets.get("privkey")), ("chain.pem", targets.get("chain"))): + if not archive_rel: + continue + live_file = live_domain.joinpath(live_name) + archive_basename = Path(archive_rel).name + archive_file = archive_domain.joinpath(archive_basename) + if not archive_file.exists(): + continue + # Symlink from live/domain/ to archive/domain/: use relative path ../../archive/domain/basename + link_target = f"../../archive/{domain}/{archive_basename}" + try: + if live_file.exists(): + if live_file.is_symlink(): + if live_file.resolve() == archive_file.resolve(): + continue + live_file.unlink() + else: + live_file.unlink() + live_file.symlink_to(link_target) + except OSError as e: + logger.debug(f"Could not repair symlink {live_file} -> {link_target}: {e}")