fix: document_actions title/content helpers crash on non-string input (#1621)

This commit is contained in:
Afonso Coutinho
2026-06-03 00:59:55 +01:00
committed by GitHub
parent 03ddc5d2c4
commit 2d94e38d23
2 changed files with 23 additions and 3 deletions

View File

@@ -21,7 +21,8 @@ _JUNK_TITLES = {
def _norm_title(t: str) -> str: def _norm_title(t: str) -> str:
"""Normalize a title for grouping: trim, collapse whitespace, lowercase.""" """Normalize a title for grouping: trim, collapse whitespace, lowercase."""
return re.sub(r"\s+", " ", (t or "").strip()).lower() t = t if isinstance(t, str) else ""
return re.sub(r"\s+", " ", t.strip()).lower()
def _content_fingerprint(content: str) -> str: def _content_fingerprint(content: str) -> str:
@@ -32,7 +33,7 @@ def _content_fingerprint(content: str) -> str:
that N imports of the same file collapse to one fingerprint. Whitespace is that N imports of the same file collapse to one fingerprint. Whitespace is
collapsed and the result lowercased. collapsed and the result lowercased.
""" """
c = content or "" c = content if isinstance(content, str) else ""
c = re.sub(r'upload_id="[^"]*"', "upload_id", c) # pdf_source re-imports c = re.sub(r'upload_id="[^"]*"', "upload_id", c) # pdf_source re-imports
c = re.sub(r"\bid=ann-[A-Za-z0-9_-]+", "id=ann", c) # annotation ids c = re.sub(r"\bid=ann-[A-Za-z0-9_-]+", "id=ann", c) # annotation ids
c = re.sub(r"\s+", " ", c).strip().lower() c = re.sub(r"\s+", " ", c).strip().lower()
@@ -41,7 +42,8 @@ def _content_fingerprint(content: str) -> str:
def _real_len(content: str) -> int: def _real_len(content: str) -> int:
"""Length of content with markdown noise stripped — a 'completeness' proxy.""" """Length of content with markdown noise stripped — a 'completeness' proxy."""
stripped = re.sub(r"^#{1,6}\s+", "", content or "", flags=re.MULTILINE) content = content if isinstance(content, str) else ""
stripped = re.sub(r"^#{1,6}\s+", "", content, flags=re.MULTILINE)
stripped = re.sub(r"[*_`>\-=]+", "", stripped) stripped = re.sub(r"[*_`>\-=]+", "", stripped)
stripped = re.sub(r"\s+", " ", stripped).strip() stripped = re.sub(r"\s+", " ", stripped).strip()
return len(stripped) return len(stripped)

View File

@@ -0,0 +1,18 @@
"""Regression: document_actions title/content helpers must tolerate non-strings.
_norm_title/_content_fingerprint/_real_len used `(x or "")`, which only guards
against falsy values; a truthy non-string (e.g. an int or a list) slipped
through, so `.strip()`/`re.sub(..., x)` raised. They now coerce non-strings to "".
"""
from src.document_actions import _norm_title, _content_fingerprint, _real_len
def test_non_string_inputs_do_not_crash():
    """Truthy non-string arguments yield the empty result instead of raising."""
    assert _norm_title(123) == ""
    assert _content_fingerprint(123) == ""
    assert _real_len(["x"]) == 0
def test_valid_inputs_unchanged():
    """Ordinary string inputs still produce the normalized results."""
    assert _norm_title(" Hello World ") == "hello world"
    assert _real_len("# Title") == len("Title")