From 860cfc3438e42d639caae2ff74d874778cff3e57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?=
Date: Wed, 12 Mar 2025 22:04:40 +0100
Subject: [PATCH] Resolve bug in process_local_files and clean up variable
 names

---
 generic_rag/parsers/parser.py | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index b591ce6..92f2fa1 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -63,25 +63,22 @@ def process_web_sites(websites: list[str], chunk_size: int) -> list[Document]:
 
 
 def process_local_files(
     local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
 ) -> list[Document]:
-    # get all files
-    file_paths = []
+    process_files = []
     for path in local_paths:
         if path.is_dir():
-            file_paths.extend(list(path.glob("*.pdf")))
-        if path.suffix == ".pdf":
-            file_paths.append(path)
+            process_files.extend(list(path.glob("*.pdf")))
+        elif path.suffix == ".pdf":
+            process_files.append(path)
         else:
-            logging.warning(f"Ignoring path {path} as it is not a pdf file.")
+            logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
 
-    # parse pdf's
-    documents = []
-    for file_path in file_paths:
-        loader = UnstructuredLoader(file_path=file_path, strategy="hi_res")
-        for doc in loader.lazy_load():
-            documents.append(doc)
+    loaded_document = []
+    for file in process_files:
+        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
+        for document in loader.lazy_load():
+            loaded_document.append(document)
 
-    # split documents
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index)
-    return text_splitter.split_documents(documents)
+    return text_splitter.split_documents(loaded_document)