Fix chat stream recovery and PDF library indexing (#468)

This commit is contained in:
red person
2026-06-01 16:33:35 +03:00
committed by GitHub
parent 92a81480f7
commit e1102585bf
4 changed files with 55 additions and 8 deletions

View File

@@ -29,7 +29,7 @@ class PersonalDocsConfig:
"""Configuration for personal documents management."""
CHUNK_SIZE: int = 1000
CHUNK_OVERLAP: int = 200
-DEFAULT_EXTENSIONS: Tuple[str, ...] = (".txt", ".md", ".json")
+DEFAULT_EXTENSIONS: Tuple[str, ...] = (".txt", ".md", ".json", ".pdf")
DEFAULT_K: int = 5
STOP_WORDS: Set[str] = None
@@ -85,7 +85,8 @@ def load_personal_index(
if not any(name.lower().endswith(ext) for ext in extensions):
continue
size = os.path.getsize(p)
-text = read_text_file(p)
+ext = os.path.splitext(name)[1].lower()
+text = extract_pdf_text(p) if ext == ".pdf" else read_text_file(p)
chunks = split_chunks(text)
display = os.path.relpath(p, personal_dir)
files.append({"name": display, "path": p, "size": size, "chunks": chunks})