diff --git a/networking-calico/devstack/plugin.sh b/networking-calico/devstack/plugin.sh index a6049c18ef2..2b8a0b011de 100644 --- a/networking-calico/devstack/plugin.sh +++ b/networking-calico/devstack/plugin.sh @@ -104,6 +104,14 @@ EOF iniset $NEUTRON_CONF calico etcd_host $SERVICE_HOST iniset $NEUTRON_CONF calico etcd_port $ETCD_PORT + # Turn on the connection-fairy GC diagnostics in the + # mech driver so any in-hub fairy finalisation in CI + # logs a WARNING with the originating-checkout stack. + # Cheap enough to leave on for benchmark runs and + # essential for catching the deadlock pattern that + # has hung resync_scale_test in the past. + iniset $NEUTRON_CONF calico fairy_gc_diagnostics True + # If CALICO_ETCD_COMPACTION_PERIOD_MINS is # defined, set that as the value of the # etcd_compaction_period_mins setting. diff --git a/networking-calico/networking_calico/plugins/ml2/drivers/calico/fairy_gc_diagnostics.py b/networking-calico/networking_calico/plugins/ml2/drivers/calico/fairy_gc_diagnostics.py new file mode 100644 index 00000000000..a6a9e09d80b --- /dev/null +++ b/networking-calico/networking_calico/plugins/ml2/drivers/calico/fairy_gc_diagnostics.py @@ -0,0 +1,225 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2026 Tigera, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. + +"""Diagnostics for the SQLAlchemy-fairy GC-in-eventlet-hub race. + +The race +-------- +We have sometimes been seeing this exception in journalctl.txt / neutron-server.log: + +Exception ignored in: . + at 0x72810792e290> +Traceback (most recent call last): + File ".../site-packages/sqlalchemy/pool/base.py", line 509, in + and _finalize_fairy( + File ".../site-packages/sqlalchemy/pool/base.py", line 800, in _finalize_fairy + connection_record.checkin() + File ".../site-packages/sqlalchemy/pool/base.py", line 544, in checkin + pool.dispatch.checkin(connection, self) + File ".../site-packages/sqlalchemy/event/attr.py", line 346, in __call__ + fn(*args, **kw) + File ".../site-packages/oslo_db/sqlalchemy/engines.py", line 52, in _thread_yield + time.sleep(0) + File ".../site-packages/eventlet/greenthread.py", line 37, in sleep + hub.switch() + File ".../site-packages/eventlet/hubs/hub.py", line 310, in switch + return self.greenlet.switch() +TimeoutError: timed out + +This occurs when a Neutron DB context uses a session (for some reason) and then leaks +it, and GC kicks in on eventlet's "hub" greenlet. An sqlalchemy connection fairy that +is GC'd while the eventlet hub greenlet is the current greenlet triggers oslo.db's +``_thread_yield`` checkin listener to call ``time.sleep(0)`` -> ``hub.switch()``, which +deadlocks because the hub cannot switch to itself. The ``TimeoutError`` that eventually +fires is silently swallowed by Python's "Exception ignored in" finalizer-exception +mechanism, but each occurrence wedges the hub for ~10s and leaves the connection +record's pool state indeterminate. + +I _think_ #13015 fixes the primary root cause of this, by adding a transaction wrapper +around raw `context.session.query` calls. The wrapper properly closes the session after +those calls, instead of leaking it to GC. However, in case there are any remaining +cases, e.g. because a Neutron-framework path outside our control drops a session +unclosed, or because a future change reintroduces a code path that bypasses the +``using`` pattern -- we want a log line that points at the leaking code path rather than +just the in-hub finalizer stack that the existing "Exception ignored in" trace gives us. + +Low-level detail +---------------- +An SQLAlchemy connection ``_ConnectionFairy`` is the user-facing wrapper returned by +``Pool.connect()`` (or, transitively, by ``Session.connection()`` when a session +executes its first SQL). Every fairy has a weakref finalizer (the ```` +registered inside ``_ConnectionRecord.checkout``) that, if the fairy is GC'd without +explicit checkin, calls ``connection_record.checkin()`` -- which dispatches the +SQLAlchemy ``checkin`` event. oslo.db registers ``_thread_yield`` on that event, which +calls ``time.sleep(0)``. + +Under eventlet, ``time.sleep(0)`` is monkey-patched to ``hub.switch()``. If we're +already executing in the hub greenlet at the moment of GC (possible whenever GC fires +during the hub's own bookkeeping), ``hub.switch()`` from the hub to itself fails to find +a target and times out as ``TimeoutError``. Python's "Exception ignored in" mechanism +swallows the error, but the hub has been wedged for ~10s and the connection record's +state is indeterminate. + +In current code (after PR 13015), all of our DB access is inside +``db_api.CONTEXT_*.using`` blocks, which close the session deterministically at block +exit (oslo.db's ``_TransactionContext._session`` finally-block calls +``session.close()``) and so never leave a fairy for GC. In principle that should be +enough. This module is the safety net for the case where it isn't -- e.g. if a +Neutron-framework path outside our control still drops a session unclosed, or if we add +a code path later that bypasses the ``using`` pattern. If the race fires again the +diagnostic output identifies which code path checked out the leaking connection, so a +targeted fix can follow. + +What this module does +--------------------- +Two SQLAlchemy event listeners, installed on ``sqlalchemy.pool.Pool`` (so all engines in +the process inherit them): + +1. A ``checkout`` listener that captures a stack trace at the point the connection was + checked out, stashing it on ``connection_record.info["calico_checkout_stack"]``. + This is how we attribute a later GC to a specific code path: when the fairy is + eventually GC'd, the same connection record is used for checkin, and we can read the + stored stack to see *who* originally asked for this connection. + +2. A ``checkin`` listener with ``insert=True`` (so it runs *before* oslo.db's + ``_thread_yield``) that: + + * Checks whether we're currently running in the eventlet hub greenlet. + + * If yes -- i.e. we're about to hit the deadlock -- logs a WARNING containing both + the checkout-time stack and the current finalizer stack. + + It does *not* attempt to suppress oslo.db's listener; the ``TimeoutError`` will still + fire and be silently caught. This keeps the module purely diagnostic (no behavioural + change), so it can be left on indefinitely. + +Cost +---- +Capturing a stack trace per pool checkout is not free. At rough benchmark depth (~30 +frames, plain Neutron handler chain) it adds ~50-100us per checkout. Under a heavy +scale workload this adds up to a small number of seconds of total overhead -- noticeable +but not crippling. The module is therefore default-off; enable via: + + [calico] + fairy_gc_diagnostics = True + +in neutron.conf when running workloads where the race is suspected. +""" + +import traceback + +from oslo_log import log +from sqlalchemy import event +from sqlalchemy.pool import Pool + + +LOG = log.getLogger(__name__) + + +# Module-level flag so install() runs at most once per process: subsequent calls are +# no-ops regardless of whether the first call actually registered listeners or bailed +# out early. The flag means "decision made", not "listeners active" -- without this, +# repeat calls would re-emit the early-return WARNING each time, contradicting the +# docstring's "single WARNING" promise. +_INSTALL_ATTEMPTED = False + + +def install(): + """Idempotently install the fairy-GC diagnostic listeners. + + Safe to call multiple times (subsequent calls in the same process are no-ops). + Calling install() in the parent before workers are forked is sufficient: the + SQLAlchemy event listeners attach to the ``Pool`` class, so each forked worker + inherits them as part of its post-fork memory image and does not need to + re-install. + + If eventlet is not importable -- e.g. after the planned eventlet removal in + neutron-server -- this is a no-op with a single WARNING log line. The race the + listeners detect only fires under eventlet, so there is nothing useful to install in + that case, and we deliberately do not want a per-checkin ImportError-driven log + storm. + """ + global _INSTALL_ATTEMPTED + if _INSTALL_ATTEMPTED: + return + # Commit to the decision now so any early return below still counts as + # "already attempted" and we do not re-log the early-return WARNING on + # subsequent calls. + _INSTALL_ATTEMPTED = True + + try: + import eventlet.greenthread + import eventlet.hubs + import eventlet.patcher + except ImportError: + LOG.warning( + "Calico fairy-GC diagnostics: eventlet not importable, not " + "installing listeners. The race this module detects only " + "fires under eventlet." + ) + return + + if not eventlet.patcher.is_monkey_patched("time"): + LOG.warning( + "Calico fairy-GC diagnostics: eventlet is importable but time is not " + "monkey-patched; not installing listeners. The race this module detects " + "requires eventlet monkey-patching." + ) + return + + @event.listens_for(Pool, "checkout") + def _calico_track_checkout(dbapi_conn, connection_record, connection_proxy): + # Capture the call stack at the moment of checkout, so a later finalizer-fired + # checkin can report where the connection was originally given out. Format + # eagerly (rather than storing a ``sys._getframe()`` reference) because the + # frames need to outlive the current call -- the connection may sit in the pool + # for a long time before being GC'd, and lazy formatting would race the frames + # being collected themselves. + # + # Bounded at the most-recent 50 frames: a Neutron handler chain is typically + # ~30 frames, so 50 covers the full path through the driver in normal cases + # and caps the worst case (e.g. a pathological deep-recursion middleware) so + # we do not burn unbounded CPU per checkout or stash unbounded strings on + # long-lived ``connection_record.info`` entries. + connection_record.info["calico_checkout_stack"] = "".join( + traceback.format_stack(limit=50) + ) + + @event.listens_for(Pool, "checkin", insert=True) + def _calico_detect_in_hub_checkin(dbapi_conn, connection_record): + # ``insert=True`` prepends this listener so it runs before oslo.db's + # ``_thread_yield``. If we're in the hub greenlet, ``_thread_yield`` is about + # to call ``time.sleep(0)`` -> ``hub.switch()`` and deadlock. Log enough + # context to identify the originating code path before that happens. + creating_stack = connection_record.info.pop( + "calico_checkout_stack", "" + ) + hub = eventlet.hubs.get_hub() + if eventlet.greenthread.getcurrent() is hub.greenlet: + LOG.warning( + "Calico fairy-GC diagnostic: connection checkin firing " + "in eventlet hub greenlet -- oslo.db's _thread_yield is " + "about to call time.sleep(0), which will deadlock as " + "TimeoutError.\n" + " Connection was checked out at:\n%s\n" + " Current finalizer stack:\n%s", + creating_stack, + # Finalizer stack is typically shallower than the checkout-time + # stack (just the GC / weakref-finalizer path into us), so 30 + # frames is plenty. + "".join(traceback.format_stack(limit=30)), + ) + + LOG.info("Calico fairy-GC diagnostics installed") diff --git a/networking-calico/networking_calico/plugins/ml2/drivers/calico/mech_calico.py b/networking-calico/networking_calico/plugins/ml2/drivers/calico/mech_calico.py index 5560568aee2..33a2219a39f 100644 --- a/networking-calico/networking_calico/plugins/ml2/drivers/calico/mech_calico.py +++ b/networking-calico/networking_calico/plugins/ml2/drivers/calico/mech_calico.py @@ -163,6 +163,29 @@ ), help="Deprecated and unused. Retained to avoid neutron.conf errors.", ), + cfg.BoolOpt( + "fairy_gc_diagnostics", + default=False, + help=( + "DIAGNOSTIC: install SQLAlchemy event listeners that " + "capture a stack trace at every connection-pool checkout " + "and detect when a connection-checkin (typically fired by " + "GC of a session) is happening in the eventlet hub " + "greenlet -- a failure mode in which oslo.db's " + "_thread_yield listener calls time.sleep(0) -> " + "hub.switch() and deadlocks because the hub greenlet " + "cannot switch to itself. When the in-hub case is " + "detected, the originating-checkout stack is logged at " + "WARNING so the leaking code path can be identified. See " + "the module docstring in " + "networking_calico/plugins/ml2/drivers/calico/" + "fairy_gc_diagnostics.py for the full failure-mode " + "explanation. Default off because the per-checkout stack " + "capture adds non-trivial overhead at high " + "connection-churn rates; enable when investigating a " + "suspected occurrence." + ), + ), cfg.IntOpt( "startup_resync_inject_per_item_delay_ms", default=0, @@ -401,6 +424,16 @@ def initialize(self): """ super(CalicoMechanismDriver, self).initialize() _check_mysql_driver() + if cfg.CONF.calico.fairy_gc_diagnostics: + # Install once in the parent process before workers are forked. + # The listeners attach to the SQLAlchemy Pool class; each forked + # worker inherits them as part of its post-fork memory image, so + # we do not need to re-install in each child. + from networking_calico.plugins.ml2.drivers.calico import ( + fairy_gc_diagnostics, + ) + + fairy_gc_diagnostics.install() registry.subscribe( self.post_fork_initialize, resources.PROCESS,