STT: clean temp audio files on transcription failure
STTService._transcribe_local writes the audio to a NamedTemporaryFile (delete=False) and only unlinks it on the success path, before the except clause. If model.transcribe() raises (corrupt audio, model/runtime error, etc.), the function logs the error, returns None, and leaves the .webm temp file behind — so every failed local transcription leaks a file in the system temp dir. This change initializes tmp_path = None up front and moves the unlink into a finally block so the temp file is cleaned up whether transcription succeeds or raises. tests/test_stt_leak.py stubs the whisper model to raise during transcribe, runs _transcribe_local, and asserts that it returns None and leaves no new .webm file in the temp dir. The test fails before this change.
This commit is contained in:
@@ -91,6 +91,7 @@ class STTService:
|
|||||||
model = self._get_whisper()
|
model = self._get_whisper()
|
||||||
if not model:
|
if not model:
|
||||||
return None
|
return None
|
||||||
|
tmp_path = None
|
||||||
try:
|
try:
|
||||||
# Write to temp file (faster-whisper needs a file path or file-like)
|
# Write to temp file (faster-whisper needs a file path or file-like)
|
||||||
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp:
|
with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp:
|
||||||
@@ -104,14 +105,14 @@ class STTService:
|
|||||||
segments, info = model.transcribe(tmp_path, **kwargs)
|
segments, info = model.transcribe(tmp_path, **kwargs)
|
||||||
text = " ".join(seg.text.strip() for seg in segments)
|
text = " ".join(seg.text.strip() for seg in segments)
|
||||||
|
|
||||||
# Cleanup
|
|
||||||
Path(tmp_path).unlink(missing_ok=True)
|
|
||||||
|
|
||||||
logger.info(f"Local STT: {len(text)} chars, lang={info.language}, prob={info.language_probability:.2f}")
|
logger.info(f"Local STT: {len(text)} chars, lang={info.language}, prob={info.language_probability:.2f}")
|
||||||
return text
|
return text
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Local STT transcription failed: {e}", exc_info=True)
|
logger.error(f"Local STT transcription failed: {e}", exc_info=True)
|
||||||
return None
|
return None
|
||||||
|
finally:
|
||||||
|
if tmp_path:
|
||||||
|
Path(tmp_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
# ── API endpoint ──
|
# ── API endpoint ──
|
||||||
|
|
||||||
|
|||||||
30
tests/test_stt_leak.py
Normal file
30
tests/test_stt_leak.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from services.stt.stt_service import STTService
|
||||||
|
|
||||||
|
|
||||||
|
def test_stt_local_transcribe_leak_on_error():
    """Check that a failing local transcription leaves no temp audio file behind.

    The whisper model is replaced with a stub whose ``transcribe`` always
    raises, so ``_transcribe_local`` takes its error path. The test then
    asserts that the call returns ``None`` and that no ``.webm`` file remains
    in the temporary directory.
    """
    service = STTService()

    class MockWhisper:
        """Stand-in model whose transcribe() always fails."""

        def transcribe(self, *args, **kwargs):
            raise ValueError("Simulated transcribe error")

    service._get_whisper = lambda: MockWhisper()

    # Redirect tempfile to a private directory for the duration of the call.
    # Scanning the shared system temp dir would make the test flaky: any other
    # process (or parallel test worker) creating a .webm file there between
    # the two listings would be reported as a leak.
    # NOTE(review): this relies on _transcribe_local calling
    # tempfile.NamedTemporaryFile without an explicit dir= argument, as the
    # call visible in the diff does -- confirm if that ever changes.
    original_tempdir = tempfile.tempdir
    with tempfile.TemporaryDirectory() as private_tmp:
        tempfile.tempdir = private_tmp
        try:
            # Run transcription, which will raise ValueError internally
            result = service._transcribe_local(b"dummy_audio_data")
            # Collect leftovers before the private directory is removed
            leaked = {f for f in os.listdir(private_tmp) if f.endswith(".webm")}
        finally:
            # Always restore the global so later tests see the real temp dir
            tempfile.tempdir = original_tempdir

    # Assert that it returned None (failure)
    assert result is None

    # Assert that no temp files were leaked
    assert len(leaked) == 0, f"Leaked files: {leaked}"
|
||||||
Reference in New Issue
Block a user