fix: youtube transcript formatter crashes on a non-dict segment (#1745)
This commit is contained in:
@@ -168,6 +168,8 @@ def format_transcript_for_context(
|
||||
if segments:
|
||||
ctx += "Timestamped Transcript:\n"
|
||||
for seg in segments:
|
||||
if not isinstance(seg, dict):
|
||||
continue
|
||||
ctx += f"[{seg['timestamp']}] {seg['text']}\n"
|
||||
# Check length — fall back to plain text if too long
|
||||
if len(ctx) > 12000:
|
||||
|
||||
20
tests/test_youtube_transcript_seg_nondict.py
Normal file
20
tests/test_youtube_transcript_seg_nondict.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from src.youtube_handler import format_transcript_for_context
|
||||
|
||||
|
||||
def test_format_transcript_skips_non_dict_segments():
    """Non-dict segments are skipped while well-formed ones are still rendered.

    Regression test: segments come from the parsed transcript JSON, and a
    malformed entry (None or a bare string) used to make seg['timestamp']
    raise TypeError, losing the whole timestamped transcript.
    """
    # Entries that are not dicts and therefore cannot be indexed by key.
    malformed_entries = ["junk-seg", None]
    segments = [
        {"timestamp": "0:01", "text": "hello"},
        *malformed_entries,
        {"timestamp": "0:05", "text": "world"},
    ]
    payload = {
        "success": True,
        "transcript": "full text",
        "video_id": "x",
        "segments": segments,
    }

    rendered = format_transcript_for_context(payload, "https://youtu.be/x")

    # Both valid segments on either side of the junk must survive.
    for expected_line in ("[0:01] hello", "[0:05] world"):
        assert expected_line in rendered
    # The bare-string segment must not leak into the output.
    assert "junk-seg" not in rendered
|
||||
Reference in New Issue
Block a user