Add native Windows compatibility layer

This commit is contained in:
pewdiepie-archdaemon
2026-06-01 15:09:47 +09:00
parent ead7c01822
commit 0888a3b3e6
54 changed files with 1104 additions and 267 deletions

View File

@@ -13,6 +13,17 @@ Set EMBEDDING_URL in .env, e.g.:
"""
import os
# Windows: force HuggingFace/fastembed to COPY model files rather than symlink
# them. On a network-share/UNC cache dir Windows can't follow HF's symlinks
# ([WinError 1463] "symbolic link cannot be followed"), so ONNX fails to load the
# model and semantic memory dies. huggingface_hub reads this flag at import time,
# so it must be set before huggingface_hub is first imported — hence module-top.
# (app.py sets the same guard for the server entrypoint.)
if os.name == "nt":
os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS", "1")
os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")
import logging
import numpy as np
import httpx
@@ -109,6 +120,35 @@ class FastEmbedClient:
"data", "fastembed_cache",
)
os.makedirs(cache_dir, exist_ok=True)
# Windows self-heal: the HuggingFace-hub cache stores model files as
# symlinks (snapshots/<rev>/model.onnx -> ../../blobs/<hash>). On a
# network-share / UNC data dir Windows refuses to follow them
# ([WinError 1463] "symbolic link cannot be followed because its type is
# disabled"), and a cache copied between machines can carry dead symlinks
# too. Either way fastembed tries to load a broken symlink and fails
# *without* re-downloading, leaving semantic memory degraded. Detect a
# broken-symlink model in the cache and drop the contaminated hub dir so
# fastembed re-fetches (it falls back to its CDN tarball of real files,
# which load fine). Best-effort; only ever removes a verifiably dead link.
if os.name == "nt":
try:
import glob, shutil
for _onnx in glob.glob(os.path.join(cache_dir, "**", "*.onnx"), recursive=True):
if os.path.islink(_onnx) and not os.path.exists(_onnx):
_root = _onnx
while os.path.basename(_root) and not os.path.basename(_root).startswith("models--"):
_parent = os.path.dirname(_root)
if _parent == _root:
break
_root = _parent
if os.path.basename(_root).startswith("models--"):
logger.warning(
"Embedding cache has a broken symlink (%s); clearing %s "
"so fastembed re-downloads real files", _onnx, _root,
)
shutil.rmtree(_root, ignore_errors=True)
except Exception as _e:
logger.debug("embedding cache symlink-heal skipped: %s", _e)
kwargs = {"model_name": self.model, "cache_dir": cache_dir}
self._embedding = TextEmbedding(**kwargs)
self._dim: Optional[int] = None
@@ -152,7 +192,7 @@ def _load_persisted_endpoint() -> dict:
)
if os.path.exists(endpoint_file):
import json
data = json.loads(open(endpoint_file).read())
data = json.loads(open(endpoint_file, encoding="utf-8").read())
if data.get("url"):
return data
except Exception: