Treat non-string research summaries as low quality
Filter out malformed (non-string) research summaries instead of letting the broad exception path classify them as usable. Adds regression coverage.
This commit is contained in:
@@ -55,7 +55,7 @@ LOW_QUALITY_MARKERS = [
|
|||||||
def is_low_quality(summary: str) -> bool:
|
def is_low_quality(summary: str) -> bool:
|
||||||
"""Check if a finding summary indicates useless or irrelevant content."""
|
"""Check if a finding summary indicates useless or irrelevant content."""
|
||||||
try:
|
try:
|
||||||
if not summary:
|
if not isinstance(summary, str) or not summary:
|
||||||
return True
|
return True
|
||||||
low = summary.lower()
|
low = summary.lower()
|
||||||
return any(marker in low for marker in LOW_QUALITY_MARKERS)
|
return any(marker in low for marker in LOW_QUALITY_MARKERS)
|
||||||
|
|||||||
16
tests/test_research_utils_low_quality_nonstring.py
Normal file
16
tests/test_research_utils_low_quality_nonstring.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from src.research_utils import is_low_quality
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_low_quality_treats_non_string_as_low_quality():
    """Non-string summaries must be reported as low quality."""
    # Rationale carried over from the original test: the old code reached
    # summary.lower(), hit AttributeError, and the bare except returned
    # False (fail open), so a malformed source slipped through as "good".
    # A non-string summary has no usable content, so it should be filtered
    # like an empty one (which already returns True).
    malformed_summaries = (
        123,
        {"bad": True},
        ["does not contain"],
    )
    for candidate in malformed_summaries:
        assert is_low_quality(candidate) is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_low_quality_still_classifies_strings():
    """Ordinary string summaries keep their existing classification."""
    # One summary the function is expected to flag, one it is expected to keep.
    flagged_summary = "This page does not contain relevant information"
    useful_summary = "Detailed analysis of the 2026 EV market"

    assert is_low_quality(flagged_summary) is True
    assert is_low_quality(useful_summary) is False
|
||||||
Reference in New Issue
Block a user