I found the airflow-analytics-test scheduler in a broken state (in which it had been for ~24h), with the following logs around the time of failure:
[2024-11-30T18:52:44.119+0000] {scheduler_job_runner.py:1004} ERROR - Exception when executing SchedulerJob._run_scheduler_loop
Traceback (most recent call last):
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3371, in _wrap_pool_connect
return fn()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 327, in connect
return _ConnectionFairy._checkout(self)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 894, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 493, in checkout
rec = pool._do_get()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 256, in _do_get
return self._create_connection()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 273, in _create_connection
return _ConnectionRecord(self)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 388, in __init__
self.__connect()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 690, in __connect
with util.safe_reraise():
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 70, in __exit__
compat.raise_(
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/util/compat.py", line 211, in raise_
raise exception
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 686, in __connect
self.dbapi_connection = connection = pool._invoke_creator(self)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/create.py", line 574, in connect
return dialect.connect(*cargs, **cparams)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 598, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/psycopg2/__init__.py", line 122, in connect
conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
psycopg2.OperationalError: could not translate host name "postgresql-airflow-analytics-test-pooler-rw.airflow-analytics-test" to address: Name or service not known
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/jobs/scheduler_job_runner.py", line 987, in _execute
self._run_scheduler_loop()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/jobs/scheduler_job_runner.py", line 1160, in _run_scheduler_loop
next_event = timers.run(blocking=False)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/sched.py", line 151, in run
action(*argument, **kwargs)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/utils/event_scheduler.py", line 40, in repeat
action(*args, **kwargs)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/utils/session.py", line 97, in wrapper
return func(*args, session=session, **kwargs)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/jobs/scheduler_job_runner.py", line 1823, in _emit_pool_metrics
pools = Pool.slots_stats(session=session)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/utils/session.py", line 94, in wrapper
return func(*args, **kwargs)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/airflow/models/pool.py", line 178, in slots_stats
pool_rows = session.execute(query)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 1716, in execute
conn = self._connection_for_bind(bind)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 1555, in _connection_for_bind
return self._transaction._connection_for_bind(
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/orm/session.py", line 750, in _connection_for_bind
conn = bind.connect()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/future/engine.py", line 412, in connect
return super(Engine, self).connect()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3325, in connect
return self._connection_cls(self, close_with_result=close_with_result)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 96, in __init__
else engine.raw_connection()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3404, in raw_connection
return self._wrap_pool_connect(self.pool.connect, _connection)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3374, in _wrap_pool_connect
Connection._handle_dbapi_exception_noconnection(
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 2208, in _handle_dbapi_exception_noconnection
util.raise_(
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/util/compat.py", line 211, in raise_
raise exception
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 3371, in _wrap_pool_connect
return fn()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 327, in connect
return _ConnectionFairy._checkout(self)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 894, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 493, in checkout
rec = pool._do_get()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/impl.py", line 256, in _do_get
return self._create_connection()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 273, in _create_connection
return _ConnectionRecord(self)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 388, in __init__
self.__connect()
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 690, in __connect
with util.safe_reraise():
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/util/langhelpers.py", line 70, in __exit__
compat.raise_(
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/util/compat.py", line 211, in raise_
raise exception
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/pool/base.py", line 686, in __connect
self.dbapi_connection = connection = pool._invoke_creator(self)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/create.py", line 574, in connect
return dialect.connect(*cargs, **cparams)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 598, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/tmp/pyenv/versions/3.10.15/lib/python3.10/site-packages/psycopg2/__init__.py", line 122, in connect
conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) could not translate host name "postgresql-airflow-analytics-test-pooler-rw.airflow-analytics-test" to address: Name or service not known
(Background on this error at: https://sqlalche.me/e/14/e3q8)
[2024-11-30T18:52:44.123+0000] {kubernetes_executor.py:760} INFO - Shutting down Kubernetes executor
10.64.136.7 - - [30/Nov/2024 18:52:45] "GET /health HTTP/1.1" 200 -
[2024-11-30T18:52:45.913+0000] {kubernetes_executor_utils.py:291} INFO - Event: refine-to-hive-hourly-test-refine-hive-dataset-wait-fo-fsc8bfkr is Running, annotations: <omitted>
[2024-11-30T18:52:47.081+0000] {kubernetes_executor_utils.py:291} INFO - Event: refine-to-hive-hourly-test-refine-hive-dataset-wait-fo-fsc8bfkr is Running, annotations: <omitted>
[2024-11-30T18:52:47.948+0000] {kubernetes_executor_utils.py:276} INFO - Event: refine-to-hive-hourly-test-refine-hive-dataset-wait-fo-fsc8bfkr Succeeded, annotations: <omitted>
10.64.136.7 - - [30/Nov/2024 18:52:55] "GET /health HTTP/1.1" 200 -
10.64.136.7 - - [30/Nov/2024 18:53:05] code 503, message Service Unavailable
10.64.136.7 - - [30/Nov/2024 18:53:05] "GET /health HTTP/1.1" 503 -
10.64.136.7 - - [30/Nov/2024 18:53:15] code 503, message Service Unavailable
10.64.136.7 - - [30/Nov/2024 18:53:15] "GET /health HTTP/1.1" 503 -
[2024-11-30T18:53:17.973+0000] {kubernetes_executor_utils.py:93} INFO - Kubernetes watch timed out waiting for events. Restarting watch.
[2024-11-30T18:53:18.975+0000] {kubernetes_executor_utils.py:132} INFO - Event: and now my watch begins starting at resource_version: 0
[2024-11-30T18:53:19.026+0000] {kubernetes_executor_utils.py:276} INFO - Event: refine-to-hive-hourly-test-refine-hive-dataset-wait-fo-fsc8bfkr Succeeded, annotations: <omitted>

The /health endpoint proceeded to respond with HTTP 503 for a day, and the pod was not restarted.
After a brief investigation, it turns out that our liveness check uses a tcpSocket probe, which means an HTTP 200 and an HTTP 503 from the /health endpoint are indistinguishable to it: the container is considered live as long as a TCP connection can be established, regardless of the HTTP status returned.