From 2d94e38d23818a1b8f8a95bae06428f9cb2035ab Mon Sep 17 00:00:00 2001 From: Afonso Coutinho Date: Wed, 3 Jun 2026 00:59:55 +0100 Subject: [PATCH] fix: document_actions title/content helpers crash on non-string input (#1621) ---
# Review notes (free text after the three-dash line; not part of the commit message or the diff):
# - Purpose: _norm_title, _content_fingerprint and _real_len previously guarded their argument
#   with `x or ""`, which only replaces falsy values; a truthy non-string then reached
#   `.strip()` / `re.sub()` and raised. Each helper now uses `x if isinstance(x, str) else ""`,
#   and a regression test file (tests/test_document_actions_nonstring.py) is added.
# - Results for str, None and other falsy arguments are unchanged by these hunks; only truthy
#   non-strings behave differently (they used to raise, now they are treated as "").
# - NOTE(review): non-strings are discarded, not converted -- bytes, numbers and containers all
#   become "", so every such input yields the same result as an empty string. _norm_title is
#   documented as a grouping key; confirm callers do not group unrelated records under "".
# - NOTE(review): the `t: str` / `content: str` annotations are left as-is although non-str
#   arguments are now accepted; consider widening them in a follow-up.
# - NOTE(review): the new test asserts `_content_fingerprint(123) == ""`, but that function's
#   return statement is outside these hunks -- confirm it returns the normalized text unchanged.
# - NOTE(review): test_valid_inputs_unchanged does not exercise _content_fingerprint with a
#   real string.
src/document_actions.py | 8 +++++--- tests/test_document_actions_nonstring.py | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 tests/test_document_actions_nonstring.py diff --git a/src/document_actions.py b/src/document_actions.py index dfae1e2..cd74b4b 100644 --- a/src/document_actions.py +++ b/src/document_actions.py @@ -21,7 +21,8 @@ _JUNK_TITLES = { def _norm_title(t: str) -> str: """Normalize a title for grouping: trim, collapse whitespace, lowercase.""" - return re.sub(r"\s+", " ", (t or "").strip()).lower() + t = t if isinstance(t, str) else "" + return re.sub(r"\s+", " ", t.strip()).lower() def _content_fingerprint(content: str) -> str: @@ -32,7 +33,7 @@ def _content_fingerprint(content: str) -> str: that N imports of the same file collapse to one fingerprint. Whitespace is collapsed and the result lowercased. 
""" - c = content or "" + c = content if isinstance(content, str) else "" c = re.sub(r'upload_id="[^"]*"', "upload_id", c) # pdf_source re-imports c = re.sub(r"\bid=ann-[A-Za-z0-9_-]+", "id=ann", c) # annotation ids c = re.sub(r"\s+", " ", c).strip().lower() @@ -41,7 +42,8 @@ def _content_fingerprint(content: str) -> str: def _real_len(content: str) -> int: """Length of content with markdown noise stripped — a 'completeness' proxy.""" - stripped = re.sub(r"^#{1,6}\s+", "", content or "", flags=re.MULTILINE) + content = content if isinstance(content, str) else "" + stripped = re.sub(r"^#{1,6}\s+", "", content, flags=re.MULTILINE) stripped = re.sub(r"[*_`>\-=]+", "", stripped) stripped = re.sub(r"\s+", " ", stripped).strip() return len(stripped) diff --git a/tests/test_document_actions_nonstring.py b/tests/test_document_actions_nonstring.py new file mode 100644 index 0000000..9a0d01e --- /dev/null +++ b/tests/test_document_actions_nonstring.py @@ -0,0 +1,18 @@ +"""Regression: document_actions title/content helpers must tolerate non-strings. + +_norm_title/_content_fingerprint/_real_len used `(x or "")`, which only guards +falsy; a non-string (e.g. an int) is truthy, so `.strip()`/`re.sub(..., x)` +raised. They now coerce non-strings to "". +""" +from src.document_actions import _norm_title, _content_fingerprint, _real_len + + +def test_non_string_inputs_do_not_crash(): + assert _norm_title(123) == "" + assert _content_fingerprint(123) == "" + assert _real_len(["x"]) == 0 + + +def test_valid_inputs_unchanged(): + assert _norm_title(" Hello World ") == "hello world" + assert _real_len("# Title") == len("Title")