# odysseus/services/memory/skill_format.py

"""SKILL.md parser & writer.

Reads/writes a single skill from a `SKILL.md` file with YAML frontmatter
and a structured markdown body. Inspired by Hermes' skills format
(https://hermes-agent.nousresearch.com/docs/user-guide/features/skills).

Frontmatter shape (YAML):

    ---
    name: open-pr-from-branch
    description: One-line summary surfaced in the skills index.
    version: 1.0.0
    category: dev
    tags: [git, github]
    platforms: [linux, macos]            # optional
    requires_toolsets: []                # optional
    fallback_for_toolsets: []            # optional
    status: published                    # draft | published
    confidence: 0.8                      # 0..1
    source: learned                      # learned | taught | imported
    teacher_model: claude-opus-4-7       # optional
    created: 2026-05-09T21:43:00Z
    ---

Body sections (any subset; rendered as headings):

    ## When to Use
    Trigger conditions in plain English.

    ## Procedure
    1. First step
    2. Second step

    ## Pitfalls
    - Common failure mode + how to recover

    ## Verification
    - How to confirm success

    Anything else (text before the first heading, or under a heading that is
    not one of the above) is kept in `body_extra` and written back at the end
    of the body on save.

Usage counters (`uses`, `last_used`) live in a sidecar `_usage.json` keyed
by skill name, so the SKILL.md file doesn't churn on every retrieval.
"""

from __future__ import annotations

import json
import logging
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Slugify
# ---------------------------------------------------------------------------

# Matches every run of characters that may not appear in a slug.
_SLUG_RE = re.compile(r"[^a-z0-9]+")


def slugify(text: str, fallback: str = "skill") -> str:
    """Convert a free-form title to a kebab-case slug suitable for a directory
    name.

    The text is lower-cased, every run of characters outside ``a-z0-9`` becomes
    a single dash, and leading/trailing dashes are trimmed. The slug is capped
    at 60 characters and never ends in a dash, so slugifying a slug returns it
    unchanged. When nothing usable remains, `fallback` (also capped at 60
    characters) is returned instead.
    """
    s = str(text or "").strip().lower()
    s = _SLUG_RE.sub("-", s).strip("-")
    # The cut at 60 can land right after a separator; trim again so the result
    # keeps the "no trailing dash" guarantee and stays stable when re-slugified.
    s = s[:60].rstrip("-")
    return s or fallback[:60]


# ---------------------------------------------------------------------------
# Frontmatter (minimal YAML — we don't pull in PyYAML for one feature)
# ---------------------------------------------------------------------------

# We accept a tiny subset of YAML: scalar `key: value`, inline lists `[a, b]`,
# and block lists with `-`. That covers everything in our schema and avoids
# a new dependency.

_FM_KEY_RE = re.compile(r"^([a-z_][a-z0-9_]*):\s*(.*)$", re.IGNORECASE)
_FM_BLOCK_LIST_RE = re.compile(r"^\s*-\s*(.*)$")


def _parse_scalar(raw: str) -> Any:
    raw = raw.strip()
    if raw == "":
        return ""
    if raw.startswith("[") and raw.endswith("]"):
        inner = raw[1:-1].strip()
        if not inner:
            return []
        return [_parse_scalar(p) for p in _split_top_level(inner, ",")]
    if raw.lower() in ("true", "yes"):
        return True
    if raw.lower() in ("false", "no"):
        return False
    if raw.lower() in ("null", "none", "~"):
        return None
    if (raw[0] == raw[-1]) and raw[0] in ("'", '"'):
        return raw[1:-1]
    # Try number
    try:
        if "." in raw:
            return float(raw)
        return int(raw)
    except ValueError:
        pass
    return raw


def _split_top_level(s: str, sep: str) -> List[str]:
    """Split `s` on `sep` ignoring separators inside [] or quotes."""
    out, buf, depth, quote = [], [], 0, None
    for ch in s:
        if quote:
            buf.append(ch)
            if ch == quote:
                quote = None
            continue
        if ch in ("'", '"'):
            quote = ch
            buf.append(ch)
            continue
        if ch == "[":
            depth += 1
        elif ch == "]":
            depth = max(0, depth - 1)
        if ch == sep and depth == 0:
            out.append("".join(buf).strip())
            buf = []
            continue
        buf.append(ch)
    if buf:
        out.append("".join(buf).strip())
    return out


def _strip_inline_comment(value: str) -> str:
    """Remove a trailing ``# comment`` from a frontmatter value.

    A ``#`` starts a comment only when it is the first character of `value` or
    follows a space/tab, and only outside '...' / "..." quotes (backslash
    escapes are honoured inside double quotes). So ``C#`` and ``url#frag`` are
    kept, while ``published  # note`` becomes ``published``.
    """
    quote: Optional[str] = None
    escaped = False
    for i, ch in enumerate(value):
        if quote:
            if escaped:
                escaped = False
            elif ch == "\\" and quote == '"':
                escaped = True
            elif ch == quote:
                quote = None
            continue
        if ch in ("'", '"'):
            quote = ch
        elif ch == "#" and (i == 0 or value[i - 1] in " \t"):
            return value[:i].rstrip()
    return value


def parse_frontmatter(text: str) -> tuple[Dict[str, Any], str]:
    """Pull the YAML frontmatter out of a SKILL.md and return (fm, body).

    The frontmatter is the text between a leading ``---`` and the next line
    that starts with ``---``. When either delimiter is missing the result is
    ``({}, text)`` with `text` untouched. Inside the frontmatter:

    - blank lines and lines whose first non-blank character is ``#`` are skipped;
    - ``key: value`` stores the parsed scalar or inline list under ``key``;
    - ``key:`` with no value stores an empty list and lets following
      ``- item`` lines append to it;
    - a trailing ``# comment`` after a value or list item is ignored;
    - any other line is ignored.

    `body` is everything after the closing delimiter with leading line breaks
    removed.
    """
    if not text.startswith("---"):
        return {}, text
    end = text.find("\n---", 3)
    if end < 0:
        return {}, text
    fm_text = text[3:end].lstrip("\n")
    # Also drop "\r" so a CRLF file does not leave a stray line break in front
    # of the body.
    body = text[end + 4:].lstrip("\r\n")
    fm: Dict[str, Any] = {}
    pending_key: Optional[str] = None  # key currently collecting "- item" lines
    for line in fm_text.splitlines():
        if not line.strip() or line.lstrip().startswith("#"):
            continue
        m = _FM_KEY_RE.match(line)
        if m:
            key = m.group(1)
            # Strip the comment before testing for emptiness so that
            # "tags:   # optional" still opens a block list.
            val = _strip_inline_comment(m.group(2)).strip()
            if val == "":
                pending_key = key
                fm[key] = []
            else:
                fm[key] = _parse_scalar(val)
                pending_key = None
            continue
        m2 = _FM_BLOCK_LIST_RE.match(line)
        if m2 and pending_key:
            # fm[pending_key] is always the list created when the key was seen.
            fm[pending_key].append(_parse_scalar(_strip_inline_comment(m2.group(1))))
    return fm, body


def _emit_scalar(v: Any) -> str:
    if v is None:
        return "null"
    if isinstance(v, bool):
        return "true" if v else "false"
    if isinstance(v, (int, float)):
        return str(v)
    if isinstance(v, list):
        return "[" + ", ".join(_emit_scalar(x) for x in v) + "]"
    s = str(v)
    if any(c in s for c in (":", "#", "\n", "[", "]", "{", "}", ",", "&", "*", "!", "|", ">", "'", '"', "%", "@")):
        return json.dumps(s)
    return s


def _as_list(v: Any) -> List[str]:
    if v is None:
        return []
    if isinstance(v, list):
        return [str(x) for x in v if x not in (None, "")]
    return [str(v)]


def _as_float(v: Any, default: float = 0.8) -> float:
    try:
        return float(v)
    except (TypeError, ValueError):
        return default


def emit_frontmatter(fm: Dict[str, Any]) -> str:
    """Render `fm` as ``key: value`` lines joined by newlines.

    Keys are written in the mapping's own order. Entries whose value is
    ``None``, an empty list or an empty string are left out. The surrounding
    ``---`` delimiters are not included.
    """
    return "\n".join(
        f"{key}: {_emit_scalar(value)}"
        for key, value in fm.items()
        if not (value is None or value == [] or value == "")
    )


# ---------------------------------------------------------------------------
# Skill body sections
# ---------------------------------------------------------------------------

_KNOWN_SECTIONS = ("when_to_use", "procedure", "pitfalls", "verification")
_HEADING_TO_KEY = {
    "when to use": "when_to_use",
    "procedure": "procedure",
    "steps": "procedure",
    "pitfalls": "pitfalls",
    "verification": "verification",
}
_KEY_TO_HEADING = {
    "when_to_use": "When to Use",
    "procedure": "Procedure",
    "pitfalls": "Pitfalls",
    "verification": "Verification",
}

# A level-2 markdown heading line; group 1 is the heading text.
_SECTION_HEADING_RE = re.compile(r"^##\s+(.*?)\s*$")


def parse_body(body: str) -> Dict[str, Any]:
    """Split a SKILL.md body into known sections.

    The body is cut at every ``## Heading`` line. Headings are matched
    case-insensitively against the known names ("Steps" is an alias for
    "Procedure"). Text before the first heading and whole sections under
    unrecognized headings (heading line included) are collected into
    `body_extra`, separated by blank lines. If a known section appears more
    than once, its occurrences are merged in order.

    Returns:
        {
            "when_to_use": str,
            "procedure":   list[str],   # numbered/bulleted lines
            "pitfalls":    list[str],
            "verification": list[str],
            "body_extra":  str,         # anything not under a known heading
        }
    """
    out: Dict[str, Any] = {k: ([] if k != "when_to_use" else "") for k in _KNOWN_SECTIONS}
    out["body_extra"] = ""
    if not body or not body.strip():
        return out

    # (section key or None, lines); the first entry holds any text that comes
    # before the first heading.
    sections: List[tuple[Optional[str], List[str]]] = [(None, [])]
    for line in body.splitlines():
        m = _SECTION_HEADING_RE.match(line)
        if m:
            key = _HEADING_TO_KEY.get(m.group(1).strip().lower())
            # Keep the heading line of an unrecognized section so the section
            # is written back intact instead of losing its title.
            sections.append((key, [line] if key is None else []))
            continue
        sections[-1][1].append(line)

    extras: List[str] = []
    for key, lines in sections:
        text = "\n".join(lines).strip("\n")
        if key is None:
            chunk = text.strip()
            if chunk:
                extras.append(chunk)
        elif key == "when_to_use":
            chunk = text.strip()
            if chunk:
                # Append rather than overwrite when the heading is repeated.
                out[key] = (out[key] + "\n\n" + chunk).strip()
        else:
            out[key].extend(_parse_list_lines(text))
    out["body_extra"] = "\n\n".join(extras)
    return out


def _parse_list_lines(text: str) -> List[str]:
    """Pull bullet/numbered lines out of a section body. Plain paragraphs are
    treated as a single entry."""
    items: List[str] = []
    for line in (text or "").splitlines():
        s = line.strip()
        if not s:
            continue
        m = re.match(r"^(?:[-*]|\d+[.)])\s+(.*)$", s)
        if m:
            items.append(m.group(1).strip())
        elif items:
            # continuation of previous bullet
            items[-1] = items[-1] + " " + s
        else:
            items.append(s)
    return items


def emit_body(sections: Dict[str, Any]) -> str:
    """Render body sections as markdown.

    Output order: "When to Use" (as a paragraph), "Procedure" (numbered from
    1), "Pitfalls" and "Verification" (dash bullets), then `body_extra`
    verbatim. Missing or empty sections are skipped. Chunks are separated by
    one blank line and the result ends with a newline, or is "" when there is
    nothing to write.
    """
    chunks: List[str] = []

    trigger = (sections.get("when_to_use") or "").strip()
    if trigger:
        chunks.append(f"## {_KEY_TO_HEADING['when_to_use']}\n\n{trigger}")

    for key in ("procedure", "pitfalls", "verification"):
        entries = sections.get(key) or []
        if entries:
            title = _KEY_TO_HEADING[key]
            if key == "procedure":
                rendered = [f"{n}. {entry}" for n, entry in enumerate(entries, start=1)]
            else:
                rendered = [f"- {entry}" for entry in entries]
            listing = "\n".join(rendered)
            chunks.append(f"## {title}\n\n{listing}")

    trailing = (sections.get("body_extra") or "").strip()
    if trailing:
        chunks.append(trailing)

    if not chunks:
        return ""
    return "\n\n".join(chunks) + "\n"


# ---------------------------------------------------------------------------
# Skill record
# ---------------------------------------------------------------------------


@dataclass
class Skill:
    """A single skill record, convertible to and from SKILL.md text.

    Frontmatter values and body sections are plain attributes. `uses`,
    `last_used` and `path` travel with the record (and appear in `to_dict`)
    but are not written by `to_frontmatter` or `to_markdown`.
    """

    # Frontmatter
    name: str  # slug; doubles as the directory name
    description: str = ""
    version: str = "1.0.0"
    category: str = "general"
    tags: List[str] = field(default_factory=list)
    platforms: List[str] = field(default_factory=list)
    requires_toolsets: List[str] = field(default_factory=list)
    fallback_for_toolsets: List[str] = field(default_factory=list)
    status: str = "draft"  # draft | published
    confidence: float = 0.8
    source: str = "learned"
    teacher_model: Optional[str] = None
    owner: Optional[str] = None
    created: str = ""  # ISO 8601 timestamp
    # Body sections
    when_to_use: str = ""
    procedure: List[str] = field(default_factory=list)
    pitfalls: List[str] = field(default_factory=list)
    verification: List[str] = field(default_factory=list)
    body_extra: str = ""
    # Sidecar usage counters (not persisted in SKILL.md)
    uses: int = 0
    last_used: Optional[int] = None
    # File path on disk (set when read)
    path: Optional[str] = None

    # ----------------------------------------------------------------------
    # Serialization
    # ----------------------------------------------------------------------

    def to_frontmatter(self) -> Dict[str, Any]:
        """Build the frontmatter mapping in the order it is written to disk.

        List fields and `teacher_model` / `owner` are included only when they
        are non-empty. `confidence` is rounded to three decimals. A blank
        `created` is replaced by the current timestamp in the returned mapping
        only; the record itself is not modified.
        """
        fm: Dict[str, Any] = dict(
            name=self.name,
            description=self.description,
            version=self.version,
            category=self.category,
        )
        for list_field in ("tags", "platforms", "requires_toolsets", "fallback_for_toolsets"):
            values = getattr(self, list_field)
            if values:
                fm[list_field] = list(values)
        fm.update(
            status=self.status,
            confidence=round(float(self.confidence), 3),
            source=self.source,
        )
        for optional_field in ("teacher_model", "owner"):
            value = getattr(self, optional_field)
            if value:
                fm[optional_field] = value
        fm["created"] = self.created or _now_iso()
        return fm

    def to_dict(self) -> Dict[str, Any]:
        """Return every field as a plain dict, plus legacy alias keys.

        Lists are copied. The aliases are `title` (description, or the name
        with dashes turned into spaces and title-cased), `problem`
        (`when_to_use`), `solution` (`body_extra` if set, otherwise the first
        procedure step, otherwise "") and `steps` (the procedure).
        """
        identity = {
            "id": self.name,  # slug doubles as id
            "name": self.name,
            "description": self.description,
            "version": self.version,
            "category": self.category,
        }
        targeting = {
            "tags": [*self.tags],
            "platforms": [*self.platforms],
            "requires_toolsets": [*self.requires_toolsets],
            "fallback_for_toolsets": [*self.fallback_for_toolsets],
        }
        provenance = {
            "status": self.status,
            "confidence": round(float(self.confidence), 3),
            "source": self.source,
            "teacher_model": self.teacher_model,
            "owner": self.owner,
            "created": self.created,
        }
        content = {
            "when_to_use": self.when_to_use,
            "procedure": [*self.procedure],
            "pitfalls": [*self.pitfalls],
            "verification": [*self.verification],
            "body_extra": self.body_extra,
        }
        bookkeeping = {
            "uses": int(self.uses or 0),
            "last_used": self.last_used,
            "path": self.path,
        }

        if self.body_extra:
            solution = self.body_extra
        elif self.procedure:
            solution = self.procedure[0]
        else:
            solution = ""
        # Back-compat aliases for the old API/UI
        legacy = {
            "title": self.description or self.name.replace("-", " ").title(),
            "problem": self.when_to_use,
            "solution": solution,
            "steps": [*self.procedure],
        }
        return {**identity, **targeting, **provenance, **content, **bookkeeping, **legacy}

    @classmethod
    def from_markdown(cls, text: str, *, path: Optional[str] = None) -> "Skill":
        """Build a Skill from the full text of a SKILL.md file.

        The name is the slug of the frontmatter `name`; when that is missing
        or an empty string, the slug of `description` is used, and "skill" if
        neither yields anything. Missing or empty frontmatter values fall back
        to the field defaults, and a missing `created` is set to the current
        timestamp. `path` is stored on the record as given.
        """
        fm, body = parse_frontmatter(text)
        sections = parse_body(body)

        def text_or(key: str, default: str) -> str:
            return str(fm.get(key, default) or default)

        def optional_text(key: str) -> Optional[str]:
            value = fm.get(key)
            return str(value) if value else None

        raw_name = fm.get("name")
        if raw_name in (None, ""):
            slug_source = fm.get("description", "")
        else:
            slug_source = raw_name

        return cls(
            name=slugify(slug_source, fallback="skill"),
            description=text_or("description", ""),
            version=text_or("version", "1.0.0"),
            category=text_or("category", "general"),
            tags=_as_list(fm.get("tags")),
            platforms=_as_list(fm.get("platforms")),
            requires_toolsets=_as_list(fm.get("requires_toolsets")),
            fallback_for_toolsets=_as_list(fm.get("fallback_for_toolsets")),
            status=text_or("status", "draft"),
            confidence=_as_float(fm.get("confidence", 0.8), 0.8),
            source=text_or("source", "learned"),
            teacher_model=optional_text("teacher_model"),
            owner=optional_text("owner"),
            created=str(fm.get("created") or _now_iso()),
            when_to_use=sections["when_to_use"],
            procedure=list(sections["procedure"]),
            pitfalls=list(sections["pitfalls"]),
            verification=list(sections["verification"]),
            body_extra=sections["body_extra"],
            path=path,
        )

    def to_markdown(self) -> str:
        """Render the record as SKILL.md text: frontmatter between ``---``
        lines, a blank line, then the body sections."""
        header = emit_frontmatter(self.to_frontmatter())
        section_names = ("when_to_use", "procedure", "pitfalls", "verification", "body_extra")
        content = emit_body({section: getattr(self, section) for section in section_names})
        return "---\n" + header + "\n---\n\n" + content


def _now_iso() -> str:
    return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")