From 2d94e38d23818a1b8f8a95bae06428f9cb2035ab Mon Sep 17 00:00:00 2001
From: Afonso Coutinho <afonso@omelhorsite.pt>
Date: Wed, 3 Jun 2026 00:59:55 +0100
Subject: [PATCH] fix: document_actions title/content helpers crash on
 non-string input (#1621)

---
 src/document_actions.py                  |  8 +++++---
 tests/test_document_actions_nonstring.py | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_document_actions_nonstring.py

diff --git a/src/document_actions.py b/src/document_actions.py
index dfae1e2..cd74b4b 100644
--- a/src/document_actions.py
+++ b/src/document_actions.py
@@ -21,7 +21,8 @@ _JUNK_TITLES = {
 
 def _norm_title(t: str) -> str:
     """Normalize a title for grouping: trim, collapse whitespace, lowercase."""
-    return re.sub(r"\s+", " ", (t or "").strip()).lower()
+    t = t if isinstance(t, str) else ""
+    return re.sub(r"\s+", " ", t.strip()).lower()
 
 
 def _content_fingerprint(content: str) -> str:
@@ -32,7 +33,7 @@ def _content_fingerprint(content: str) -> str:
     that N imports of the same file collapse to one fingerprint. Whitespace is
     collapsed and the result lowercased.
     """
-    c = content or ""
+    c = content if isinstance(content, str) else ""
     c = re.sub(r'upload_id="[^"]*"', "upload_id", c)          # pdf_source re-imports
     c = re.sub(r"\bid=ann-[A-Za-z0-9_-]+", "id=ann", c)        # annotation ids
     c = re.sub(r"\s+", " ", c).strip().lower()
@@ -41,7 +42,8 @@ def _content_fingerprint(content: str) -> str:
 
 def _real_len(content: str) -> int:
     """Length of content with markdown noise stripped — a 'completeness' proxy."""
-    stripped = re.sub(r"^#{1,6}\s+", "", content or "", flags=re.MULTILINE)
+    content = content if isinstance(content, str) else ""
+    stripped = re.sub(r"^#{1,6}\s+", "", content, flags=re.MULTILINE)
     stripped = re.sub(r"[*_`>\-=]+", "", stripped)
     stripped = re.sub(r"\s+", " ", stripped).strip()
     return len(stripped)
diff --git a/tests/test_document_actions_nonstring.py b/tests/test_document_actions_nonstring.py
new file mode 100644
index 0000000..9a0d01e
--- /dev/null
+++ b/tests/test_document_actions_nonstring.py
@@ -0,0 +1,18 @@
+"""Regression: document_actions title/content helpers must tolerate non-strings.
+
+_norm_title/_content_fingerprint/_real_len used `(x or "")`, which only guards
+falsy values; a truthy non-string (e.g. 123 or ["x"]) slipped through, so
+`.strip()`/`re.sub(..., x)` raised. They now replace any non-string with "".
+"""
+from src.document_actions import _norm_title, _content_fingerprint, _real_len
+
+
+def test_non_string_inputs_do_not_crash():
+    assert _norm_title(123) == ""
+    assert _content_fingerprint(123) == ""
+    assert _real_len(["x"]) == 0
+
+
+def test_valid_inputs_unchanged():
+    assert _norm_title("  Hello   World ") == "hello world"
+    assert _real_len("# Title") == len("Title")