From 860cfc3438e42d639caae2ff74d874778cff3e57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?=
Date: Wed, 12 Mar 2025 22:04:40 +0100
Subject: [PATCH] Resolve bug in process_local_files and clean up variable
 names

---
 generic_rag/parsers/parser.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index b591ce6..92f2fa1 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -63,25 +63,22 @@ def process_web_sites(websites: list[str], chunk_size: int) -> list[Document]:
 
 
 def process_local_files(
     local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
 ) -> list[Document]:
-    # get all files
-    file_paths = []
+    process_files = []
     for path in local_paths:
         if path.is_dir():
-            file_paths.extend(list(path.glob("*.pdf")))
-        if path.suffix == ".pdf":
-            file_paths.append(path)
+            process_files.extend(list(path.glob("*.pdf")))
+        elif path.suffix == ".pdf":
+            process_files.append(path)
         else:
-            logging.warning(f"Ignoring path {path} as it is not a pdf file.")
+            logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
 
-    # parse pdf's
-    documents = []
-    for file_path in file_paths:
-        loader = UnstructuredLoader(file_path=file_path, strategy="hi_res")
-        for doc in loader.lazy_load():
-            documents.append(doc)
+    loaded_document = []
+    for file in process_files:
+        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
+        for document in loader.lazy_load():
+            loaded_document.append(document)
 
-    # split documents
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index)
-    return text_splitter.split_documents(documents)
+    return text_splitter.split_documents(loaded_document)