From 14e8cffa414af8bca1d12842c42d5225ee63c279 Mon Sep 17 00:00:00 2001 From: Fernando Lazzarin Date: Mon, 1 Jun 2026 03:20:29 -0300 Subject: [PATCH] Fail closed on untrusted teacher draft confidence Follow-up to #275. get_relevant_skills() treats a missing/unparseable confidence as 1.0, so it always clears the injection threshold. For teacher-escalation drafts -- auto-written from a possibly untrusted trace and then injected as authoritative guidance -- that means a draft can be auto-injected regardless of the configured confidence bar. Require teacher-escalation drafts to carry an explicit, parseable confidence that meets min_confidence; fail closed otherwise. Hand-authored legacy drafts keep the lenient "unset -> keep" behavior so they don't silently vanish, and published skills are unaffected. Ran: python -m py_compile services/memory/skills.py + a get_relevant_skills unit check (teacher drafts with None/garbage/0.8 excluded at min=0.85; 0.9 included; legacy + published unaffected; gate-off control unchanged). Co-authored-by: Fernando Lazzarin <263019791+waitdeadai@users.noreply.github.com> Co-authored-by: Claude Opus 4.8 (1M context) --- services/memory/skills.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/services/memory/skills.py b/services/memory/skills.py index 74a3917..68eb400 100644 --- a/services/memory/skills.py +++ b/services/memory/skills.py @@ -577,6 +577,17 @@ class SkillsManager: def _passes(s): if s.get("status") == "published": return True + # Teacher-escalation drafts are auto-written from a (possibly + # untrusted) trace and injected as authoritative guidance, so they + # must EARN injection with an explicit, parseable confidence that + # clears the bar — fail closed on a missing/garbage value instead + # of treating it as 1.0. Hand-authored legacy drafts keep the + # lenient "unset → keep" behavior so they don't silently vanish. + if s.get("source") == "teacher-escalation": + c = s.get("confidence") + if c is None: + return False + return _to_float(c, 0.0) >= min_confidence # unparseable → fail closed c = s.get("confidence") if c is None: return True # unset → don't filter (legacy)