Treat non-string research summaries as low quality

Classify malformed non-string research summaries as low quality instead of letting the broad exception path report them as usable. Adds regression coverage.
2026-06-03 05:42:24 +01:00
parent a880b17624
commit 28dbd5346c
2 changed files with 17 additions and 1 deletions
--- a/src/research_utils.py
+++ b/src/research_utils.py
@@ -55,7 +55,7 @@ LOW_QUALITY_MARKERS = [
 def is_low_quality(summary: str) -> bool:
    """Check if a finding summary indicates useless or irrelevant content."""
    try:
-        if not summary:
+        if not isinstance(summary, str) or not summary:
            return True
        low = summary.lower()
        return any(marker in low for marker in LOW_QUALITY_MARKERS)
--- a/tests/test_research_utils_low_quality_nonstring.py
+++ b/tests/test_research_utils_low_quality_nonstring.py
@@ -0,0 +1,16 @@
 from src.research_utils import is_low_quality
 def test_is_low_quality_treats_non_string_as_low_quality():
    """Non-string summaries must be reported as low quality.

    Regression guard: per the original note on this test, the old code
    reached ``summary.lower()``, hit ``AttributeError``, and the exception
    handler returned False (fail open), so a malformed source slipped
    through as "good". A non-string summary has no usable content, so it
    should be filtered like an empty one (which already returns True).
    """
    malformed_summaries = (123, {"bad": True}, ["does not contain"])
    for malformed in malformed_summaries:
        assert is_low_quality(malformed) is True
 def test_is_low_quality_still_classifies_strings():
    """String summaries keep their marker-based classification."""
    # Expected to match one of the low-quality marker phrases.
    flagged_summary = "This page does not contain relevant information"
    # Ordinary content that should not be flagged.
    useful_summary = "Detailed analysis of the 2026 EV market"

    assert is_low_quality(flagged_summary) is True
    assert is_low_quality(useful_summary) is False