fix: youtube transcript formatter crashes on a non-dict segment (#1745)

2026-06-03 05:29:08 +01:00
parent 6e38d3f2ef
commit 063e7114e3
2 changed files with 22 additions and 0 deletions
--- a/src/youtube_handler.py
+++ b/src/youtube_handler.py
@@ -168,6 +168,8 @@ def format_transcript_for_context(
    if segments:
        ctx += "Timestamped Transcript:\n"
        for seg in segments:
+            if not isinstance(seg, dict):
+                continue
            ctx += f"[{seg['timestamp']}] {seg['text']}\n"
        # Check length — fall back to plain text if too long
        if len(ctx) > 12000:
--- a/tests/test_youtube_transcript_seg_nondict.py
+++ b/tests/test_youtube_transcript_seg_nondict.py
@@ -0,0 +1,20 @@
+from src.youtube_handler import format_transcript_for_context
+
+
+def test_format_transcript_skips_non_dict_segments():
+    # Segments come from the parsed transcript JSON. A malformed entry (None or
+    # a bare string) used to make seg['timestamp'] raise TypeError, which lost
+    # the whole timestamped transcript; such entries must now be skipped.
+    data = {
+        "success": True, "transcript": "full text", "video_id": "x",
+        "segments": [
+            {"timestamp": "0:01", "text": "hello"},
+            "junk-seg",
+            None,
+            {"timestamp": "0:05", "text": "world"},
+        ],
+    }
+    out = format_transcript_for_context(data, "https://youtu.be/x")
+    assert "[0:01] hello" in out
+    assert "[0:05] world" in out
+    assert "junk-seg" not in out