diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index b591ce6..92f2fa1 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -63,25 +63,22 @@ def process_web_sites(websites: list[str], chunk_size: int) -> list[Document]:
 def process_local_files(
     local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
 ) -> list[Document]:
-    # get all files
-    file_paths = []
+    process_files = []
     for path in local_paths:
         if path.is_dir():
-            file_paths.extend(list(path.glob("*.pdf")))
-        if path.suffix == ".pdf":
-            file_paths.append(path)
+            process_files.extend(list(path.glob("*.pdf")))
+        elif path.suffix == ".pdf":
+            process_files.append(path)
         else:
-            logging.warning(f"Ignoring path {path} as it is not a pdf file.")
+            logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
 
-    # parse pdf's
-    documents = []
-    for file_path in file_paths:
-        loader = UnstructuredLoader(file_path=file_path, strategy="hi_res")
-        for doc in loader.lazy_load():
-            documents.append(doc)
+    loaded_document = []
+    for file in process_files:
+        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
+        for document in loader.lazy_load():
+            loaded_document.append(document)
 
-    # split documents
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index)
-    return text_splitter.split_documents(documents)
+    return text_splitter.split_documents(loaded_document)
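
Note for reviewers: below is a minimal, standalone sketch of the corrected path-filtering step, runnable without the Unstructured or LangChain dependencies. The function name collect_pdf_paths and the sample paths are hypothetical, introduced only for illustration; the body mirrors the new branch logic in the diff, where the switch from a second if to elif keeps directories from falling through to the warning branch.

import logging
from pathlib import Path

def collect_pdf_paths(local_paths: list[Path]) -> list[Path]:
    # Hypothetical extraction of the first loop in process_local_files:
    # directories are globbed for PDFs, plain .pdf files are kept as-is,
    # and anything else is logged and skipped.
    process_files: list[Path] = []
    for path in local_paths:
        if path.is_dir():
            process_files.extend(path.glob("*.pdf"))
        # elif (rather than a second if): a directory no longer also hits
        # the suffix check and triggers a spurious warning.
        elif path.suffix == ".pdf":
            process_files.append(path)
        else:
            logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
    return process_files

# Hypothetical usage: report.pdf and any *.pdf inside docs/ are kept;
# notes.txt is logged and dropped.
pdfs = collect_pdf_paths([Path("docs"), Path("report.pdf"), Path("notes.txt")])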