From 28dbd5346c651ae0865767694861fb90ea7e71d4 Mon Sep 17 00:00:00 2001
From: Afonso Coutinho
Date: Wed, 3 Jun 2026 05:42:24 +0100
Subject: [PATCH] Treat non-string research summaries as low quality

Filter malformed non-string research summaries instead of letting the
broad exception path classify them as usable, with regression coverage.
---
 src/research_utils.py                            |  2 +-
 .../test_research_utils_low_quality_nonstring.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_research_utils_low_quality_nonstring.py

diff --git a/src/research_utils.py b/src/research_utils.py
index 9961848..9255adb 100644
--- a/src/research_utils.py
+++ b/src/research_utils.py
@@ -55,7 +55,7 @@ LOW_QUALITY_MARKERS = [
 def is_low_quality(summary: str) -> bool:
     """Check if a finding summary indicates useless or irrelevant content."""
     try:
-        if not summary:
+        if not isinstance(summary, str) or not summary:
             return True
         low = summary.lower()
         return any(marker in low for marker in LOW_QUALITY_MARKERS)
diff --git a/tests/test_research_utils_low_quality_nonstring.py b/tests/test_research_utils_low_quality_nonstring.py
new file mode 100644
index 0000000..2693b55
--- /dev/null
+++ b/tests/test_research_utils_low_quality_nonstring.py
@@ -0,0 +1,16 @@
+from src.research_utils import is_low_quality
+
+
+def test_is_low_quality_treats_non_string_as_low_quality():
+    # Old code reached summary.lower(), hit AttributeError, and the bare
+    # except returned False (fail open) so a malformed source slipped through
+    # as "good". A non-string summary has no usable content, so it should be
+    # filtered like an empty one (which already returns True).
+    assert is_low_quality(123) is True
+    assert is_low_quality({"bad": True}) is True
+    assert is_low_quality(["does not contain"]) is True
+
+
+def test_is_low_quality_still_classifies_strings():
+    assert is_low_quality("This page does not contain relevant information") is True
+    assert is_low_quality("Detailed analysis of the 2026 EV market") is False