Resolve bug in process_local_files and clean up variable names

This commit is contained in:
Nielson Janné 2025-03-12 22:04:40 +01:00
parent e259808322
commit 860cfc3438

View File

@@ -63,25 +63,22 @@ def process_web_sites(websites: list[str], chunk_size: int) -> list[Document]:
def process_local_files( def process_local_files(
local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
) -> list[Document]: ) -> list[Document]:
# get all files process_files = []
file_paths = []
for path in local_paths: for path in local_paths:
if path.is_dir(): if path.is_dir():
file_paths.extend(list(path.glob("*.pdf"))) process_files.extend(list(path.glob("*.pdf")))
if path.suffix == ".pdf": elif path.suffix == ".pdf":
file_paths.append(path) process_files.append(path)
else: else:
logging.warning(f"Ignoring path {path} as it is not a pdf file.") logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
# parse pdf's loaded_document = []
documents = [] for file in process_files:
for file_path in file_paths: loader = UnstructuredLoader(file_path=file, strategy="hi_res")
loader = UnstructuredLoader(file_path=file_path, strategy="hi_res") for document in loader.lazy_load():
for doc in loader.lazy_load(): loaded_document.append(document)
documents.append(doc)
# split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
chunk_overlap=chunk_overlap, chunk_overlap=chunk_overlap,
add_start_index=add_start_index) add_start_index=add_start_index)
return text_splitter.split_documents(documents) return text_splitter.split_documents(loaded_document)