fix: youtube transcript formatter crashes on a non-dict segment (#1745)

This commit is contained in:
Afonso Coutinho
2026-06-03 05:29:08 +01:00
committed by GitHub
parent 6e38d3f2ef
commit 063e7114e3
2 changed files with 22 additions and 0 deletions

View File

@@ -168,6 +168,8 @@ def format_transcript_for_context(
if segments: if segments:
ctx += "Timestamped Transcript:\n" ctx += "Timestamped Transcript:\n"
for seg in segments: for seg in segments:
if not isinstance(seg, dict):
continue
ctx += f"[{seg['timestamp']}] {seg['text']}\n" ctx += f"[{seg['timestamp']}] {seg['text']}\n"
# Check length — fall back to plain text if too long # Check length — fall back to plain text if too long
if len(ctx) > 12000: if len(ctx) > 12000:

View File

@@ -0,0 +1,20 @@
from src.youtube_handler import format_transcript_for_context
def test_format_transcript_skips_non_dict_segments():
    """Malformed segment entries are skipped and valid ones are still rendered."""
    # Segments originate from the parsed transcript JSON, so an individual
    # entry can be malformed (None or a bare string). Indexing such an entry
    # with seg['timestamp'] raised TypeError, which cost the whole
    # timestamped transcript.
    first_valid = {"timestamp": "0:01", "text": "hello"}
    last_valid = {"timestamp": "0:05", "text": "world"}
    payload = {
        "success": True,
        "transcript": "full text",
        "video_id": "x",
        "segments": [first_valid, "junk-seg", None, last_valid],
    }

    rendered = format_transcript_for_context(payload, "https://youtu.be/x")

    # Both well-formed segments must survive on either side of the junk.
    for expected_line in ("[0:01] hello", "[0:05] world"):
        assert expected_line in rendered
    # The bare-string entry must leave no trace in the output.
    assert "junk-seg" not in rendered