fix: document_actions title/content helpers crash on non-string input (#1621)
This commit is contained in:
@@ -21,7 +21,8 @@ _JUNK_TITLES = {
|
||||
|
||||
def _norm_title(t: str) -> str:
|
||||
"""Normalize a title for grouping: trim, collapse whitespace, lowercase."""
|
||||
return re.sub(r"\s+", " ", (t or "").strip()).lower()
|
||||
t = t if isinstance(t, str) else ""
|
||||
return re.sub(r"\s+", " ", t.strip()).lower()
|
||||
|
||||
|
||||
def _content_fingerprint(content: str) -> str:
|
||||
@@ -32,7 +33,7 @@ def _content_fingerprint(content: str) -> str:
|
||||
that N imports of the same file collapse to one fingerprint. Whitespace is
|
||||
collapsed and the result lowercased.
|
||||
"""
|
||||
c = content or ""
|
||||
c = content if isinstance(content, str) else ""
|
||||
c = re.sub(r'upload_id="[^"]*"', "upload_id", c) # pdf_source re-imports
|
||||
c = re.sub(r"\bid=ann-[A-Za-z0-9_-]+", "id=ann", c) # annotation ids
|
||||
c = re.sub(r"\s+", " ", c).strip().lower()
|
||||
@@ -41,7 +42,8 @@ def _content_fingerprint(content: str) -> str:
|
||||
|
||||
def _real_len(content: str) -> int:
|
||||
"""Length of content with markdown noise stripped — a 'completeness' proxy."""
|
||||
stripped = re.sub(r"^#{1,6}\s+", "", content or "", flags=re.MULTILINE)
|
||||
content = content if isinstance(content, str) else ""
|
||||
stripped = re.sub(r"^#{1,6}\s+", "", content, flags=re.MULTILINE)
|
||||
stripped = re.sub(r"[*_`>\-=]+", "", stripped)
|
||||
stripped = re.sub(r"\s+", " ", stripped).strip()
|
||||
return len(stripped)
|
||||
|
||||
18
tests/test_document_actions_nonstring.py
Normal file
18
tests/test_document_actions_nonstring.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Regression: document_actions title/content helpers must tolerate non-strings.
|
||||
|
||||
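_norm_title/_content_fingerprint/_real_len used `(x or "")`, which only guards
|
||||
against falsy values; a truthy non-string (e.g. a non-zero int or a non-empty list) passed through unchanged, so `.strip()`/`re.sub(..., x)`
|
||||
raised (AttributeError/TypeError). They now coerce any non-string to "" via an `isinstance(x, str)` check.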
|
||||
"""
|
||||
from src.document_actions import _norm_title, _content_fingerprint, _real_len
|
||||
|
||||
|
||||
def test_non_string_inputs_do_not_crash():
|
||||
assert _norm_title(123) == ""
|
||||
assert _content_fingerprint(123) == ""
|
||||
assert _real_len(["x"]) == 0
|
||||
|
||||
|
||||
def test_valid_inputs_unchanged():
|
||||
assert _norm_title(" Hello World ") == "hello world"
|
||||
assert _real_len("# Title") == len("Title")
|
||||
Reference in New Issue
Block a user