From 3885f9fa90a3a666554dca65990c9bc289ad9d47 Mon Sep 17 00:00:00 2001 From: Tatlatat Date: Tue, 2 Jun 2026 18:43:24 +0700 Subject: [PATCH] STT: clean temp audio files on transcription failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit STTService._transcribe_local writes the audio to a NamedTemporaryFile (delete=False) and only unlinks it on the success path, before the except. If model.transcribe() raises (corrupt audio, model/runtime error, etc.) the function logs, returns None, and leaves the .webm temp file behind — so every failed local transcription leaks a file in the system temp dir. Initialize tmp_path = None up front and move the unlink into a finally block so the temp file is cleaned up whether transcription succeeds or raises. tests/test_stt_leak.py stubs the whisper model to raise during transcribe, runs _transcribe_local, and asserts it returns None and leaves no new .webm file in the temp dir. Fails before this change. --- services/stt/stt_service.py | 7 ++++--- tests/test_stt_leak.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) create mode 100644 tests/test_stt_leak.py diff --git a/services/stt/stt_service.py b/services/stt/stt_service.py index 0587128..25faf5e 100644 --- a/services/stt/stt_service.py +++ b/services/stt/stt_service.py @@ -91,6 +91,7 @@ class STTService: model = self._get_whisper() if not model: return None + tmp_path = None try: # Write to temp file (faster-whisper needs a file path or file-like) with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp: @@ -104,14 +105,14 @@ class STTService: segments, info = model.transcribe(tmp_path, **kwargs) text = " ".join(seg.text.strip() for seg in segments) - # Cleanup - Path(tmp_path).unlink(missing_ok=True) - logger.info(f"Local STT: {len(text)} chars, lang={info.language}, prob={info.language_probability:.2f}") return text except Exception as e: logger.error(f"Local STT transcription failed: {e}", exc_info=True) return None + finally: + if tmp_path: + Path(tmp_path).unlink(missing_ok=True) # ── API endpoint ── diff --git a/tests/test_stt_leak.py b/tests/test_stt_leak.py new file mode 100644 index 0000000..ff752ba --- /dev/null +++ b/tests/test_stt_leak.py @@ -0,0 +1,30 @@ +import os +import tempfile +from services.stt.stt_service import STTService + + +def test_stt_local_transcribe_leak_on_error(): + service = STTService() + + class MockWhisper: + def transcribe(self, *args, **kwargs): + raise ValueError("Simulated transcribe error") + + service._get_whisper = lambda: MockWhisper() + + # Track WebM files in the temp directory before running transcription + temp_dir = tempfile.gettempdir() + webm_before = {f for f in os.listdir(temp_dir) if f.endswith(".webm")} + + # Run transcription, which will raise ValueError internally + result = service._transcribe_local(b"dummy_audio_data") + + # Track WebM files in the temp directory after running transcription + webm_after = {f for f in os.listdir(temp_dir) if f.endswith(".webm")} + + # Assert that it returned None (failure) + assert result is None + + # Assert that no new temp files were leaked + leaked = webm_after - webm_before + assert len(leaked) == 0, f"Leaked files: {leaked}"