fix: ChromaDB unreachable blocks app startup for 30-60s (#326) (#476)

* fix: fail fast when ChromaDB is unreachable instead of blocking startup

* fix: only cache the ChromaDB client after a successful heartbeat

* test: cover ChromaDB fast-fail preflight and no-cache-on-failure
This commit is contained in:
Afonso Coutinho
2026-06-01 14:22:41 +01:00
committed by GitHub
parent 171c29dcf3
commit 1eff46579a
2 changed files with 80 additions and 3 deletions

View File

@@ -6,12 +6,27 @@ Connects to a ChromaDB instance running as a standalone service.
""" """
import os import os
import socket
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
_client = None _client = None
# A short connect probe so an unreachable ChromaDB fails fast instead of
# blocking on the OS connection timeout (~30-60s, WinError 10060 on Windows),
# which otherwise stalls app startup. Tunable via CHROMADB_CONNECT_TIMEOUT.
_DEFAULT_CONNECT_TIMEOUT = 2.0


def _read_connect_timeout() -> float:
    """Return the probe timeout (seconds) taken from CHROMADB_CONNECT_TIMEOUT.

    Falls back to ``_DEFAULT_CONNECT_TIMEOUT`` when the variable is unset,
    cannot be parsed as a float, or is not a finite positive number. A bad
    value must not raise here: this runs at import time, and an exception
    would take down the startup path this timeout exists to protect.
    """
    raw = os.getenv("CHROMADB_CONNECT_TIMEOUT")
    if raw is None:
        return _DEFAULT_CONNECT_TIMEOUT
    try:
        seconds = float(raw)
    except ValueError:
        seconds = None
    # The chained comparison is False for NaN and infinity as well as for
    # zero/negative values, none of which are usable socket timeouts.
    if seconds is None or not 0 < seconds < float("inf"):
        logging.getLogger(__name__).warning(
            "Ignoring invalid CHROMADB_CONNECT_TIMEOUT=%r; using %.1fs",
            raw,
            _DEFAULT_CONNECT_TIMEOUT,
        )
        return _DEFAULT_CONNECT_TIMEOUT
    return seconds


_CONNECT_TIMEOUT = _read_connect_timeout()
def _port_open(host: str, port: int, timeout: "float | None" = None) -> bool:
    """Return True if a TCP connection to host:port succeeds within timeout.

    Args:
        host: Hostname or IP address to probe.
        port: TCP port to probe.
        timeout: Seconds to wait for the connection. ``None`` or any
            non-positive value falls back to ``_CONNECT_TIMEOUT``.

    Returns:
        True when the connection was established (it is closed again right
        away), False when the attempt raised ``OSError`` (refused, timed
        out, unresolvable host, ...).
    """
    # A negative timeout would make socket.settimeout() raise ValueError,
    # which is not an OSError and would escape the handler below; zero would
    # put the socket in non-blocking mode so the probe could never succeed.
    if timeout is None or timeout <= 0:
        timeout = _CONNECT_TIMEOUT
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False
def get_chroma_client(): def get_chroma_client():
"""Get or create the singleton ChromaDB HTTP client. """Get or create the singleton ChromaDB HTTP client.
@@ -34,10 +49,20 @@ def get_chroma_client():
host = os.getenv("CHROMADB_HOST", "localhost") host = os.getenv("CHROMADB_HOST", "localhost")
port = int(os.getenv("CHROMADB_PORT", "8100")) port = int(os.getenv("CHROMADB_PORT", "8100"))
_client = chromadb.HttpClient(host=host, port=port) if not _port_open(host, port):
raise RuntimeError(
f"ChromaDB is not reachable at {host}:{port}. Start the ChromaDB "
f"service (e.g. `docker compose up chromadb`) or set CHROMADB_HOST / "
f"CHROMADB_PORT to point at a running instance."
)
# Health check client = chromadb.HttpClient(host=host, port=port)
_client.heartbeat()
# Health check before caching — if the port is open but the service isn't
# healthy yet (e.g. still starting), don't poison the singleton with a dead
# client; leave _client unset so the next call retries.
client.heartbeat()
_client = client
logger.info(f"ChromaDB connected: {host}:{port}") logger.info(f"ChromaDB connected: {host}:{port}")
return _client return _client

View File

@@ -0,0 +1,52 @@
"""Regression tests for the ChromaDB singleton client (issue #326).
Covers the fast-fail preflight (so an unreachable ChromaDB doesn't block
startup for the full OS connection timeout) and the rule that a failed
connection must not poison the cached singleton.
"""
import socket
import time
import pytest
import src.chroma_client as cc
def _free_port() -> int:
    """Bind to port 0, grab the assigned port, release it — nothing listens.

    Returns:
        A loopback TCP port number that was free at the time of the call.
        The socket never calls ``listen()``, and it is closed before
        returning, so a connection attempt to the port is expected to fail.
    """
    # The context manager closes the socket even if bind()/getsockname()
    # raises, so a failing call cannot leak a file descriptor.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.bind(("127.0.0.1", 0))
        return probe.getsockname()[1]
def test_port_open_false_for_closed_port_and_is_fast():
    """A port with no listener is reported closed, and the answer is quick."""
    closed_port = _free_port()
    started = time.monotonic()
    reachable = cc._port_open("127.0.0.1", closed_port, timeout=1.0)
    elapsed = time.monotonic() - started
    assert reachable is False
    # The whole point: we fail fast, nowhere near the 30-60s OS timeout.
    assert elapsed < 5.0
def test_port_open_true_for_listening_socket():
    """A port with a live listener on loopback is reported open."""
    # The with-block closes the listener on exit, pass or fail.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as listener:
        listener.bind(("127.0.0.1", 0))
        listener.listen(1)
        bound_host, bound_port = listener.getsockname()
        assert cc._port_open(bound_host, bound_port, timeout=1.0) is True
def test_get_chroma_client_does_not_cache_when_unreachable(monkeypatch):
    """A failed connection attempt must not leave a cached client behind."""
    pytest.importorskip("chromadb")
    # Start from a clean slate; reset_client() is assumed to clear the
    # module-level singleton (its definition is outside this test file).
    cc.reset_client()

    # Point the client at a loopback port that nothing is listening on.
    unused_port = _free_port()
    monkeypatch.setenv("CHROMADB_HOST", "127.0.0.1")
    monkeypatch.setenv("CHROMADB_PORT", str(unused_port))

    with pytest.raises(RuntimeError):
        cc.get_chroma_client()

    # A failed connection must leave the singleton unset so a later call
    # (once ChromaDB is up) can succeed.
    assert cc._client is None