# Patch summary (preamble text placed before the first "diff --git" header).
#
# Purpose: stop format_transcript_for_context from failing on transcript
# segments that are not dicts.
#
# Change 1 - src/youtube_handler.py, hunk at line 168:
#   Inside the `for seg in segments:` loop, two lines are added that skip any
#   entry for which isinstance(seg, dict) is false, before the existing line
#   that appends "[<timestamp>] <text>" to ctx.
#
# Change 2 - tests/test_youtube_transcript_seg_nondict.py (new file, 20 lines):
#   Adds test_format_transcript_skips_non_dict_segments. It passes a segments
#   list holding two valid dicts with a bare string ("junk-seg") and None
#   between them, then asserts that both valid "[timestamp] text" entries
#   appear in the output and that "junk-seg" does not.
#
# NOTE(review): the guard checks the type only. A dict segment that lacks the
#   'timestamp' or 'text' key still reaches seg['timestamp'] / seg['text'] and
#   would presumably raise KeyError - confirm whether that case needs handling.
# NOTE(review): as stored here the whole patch sits on one physical line. The
#   hunk headers (-168,6 +168,8 and -0,0 +1,20) describe multi-line hunks, so
#   the original line breaks and indentation presumably have to be restored
#   before a patch tool can apply it - TODO confirm against the original.
diff --git a/src/youtube_handler.py b/src/youtube_handler.py index 2a12d82..0018475 100644 --- a/src/youtube_handler.py +++ b/src/youtube_handler.py @@ -168,6 +168,8 @@ def format_transcript_for_context( if segments: ctx += "Timestamped Transcript:\n" for seg in segments: + if not isinstance(seg, dict): + continue ctx += f"[{seg['timestamp']}] {seg['text']}\n" # Check length — fall back to plain text if too long if len(ctx) > 12000: diff --git a/tests/test_youtube_transcript_seg_nondict.py b/tests/test_youtube_transcript_seg_nondict.py new file mode 100644 index 0000000..a347af4 --- /dev/null +++ b/tests/test_youtube_transcript_seg_nondict.py @@ -0,0 +1,20 @@ +from src.youtube_handler import format_transcript_for_context + + +def test_format_transcript_skips_non_dict_segments(): + # segments come from the parsed transcript JSON; a malformed entry (None or + # a bare string) made seg['timestamp'] raise TypeError and lose the whole + # timestamped transcript. + data = { + "success": True, "transcript": "full text", "video_id": "x", + "segments": [ + {"timestamp": "0:01", "text": "hello"}, + "junk-seg", + None, + {"timestamp": "0:05", "text": "world"}, + ], + } + out = format_transcript_for_context(data, "https://youtu.be/x") + assert "[0:01] hello" in out + assert "[0:05] world" in out + assert "junk-seg" not in out