Resolve bug in process_local_files and clean up variable names

2025-03-12 22:04:40 +01:00 · 2025-03-12 22:04:40 +01:00 · 860cfc3438
commit 860cfc3438
parent e259808322
1 changed file with 11 additions and 14 deletions
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -63,25 +63,22 @@ def process_web_sites(websites: list[str], chunk_size: int) -> list[Document]:
 def process_local_files(
        local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
 ) -> list[Document]:
-    # get all files
-    file_paths = []
+    process_files = []
    for path in local_paths:
        if path.is_dir():
-            file_paths.extend(list(path.glob("*.pdf")))
-        if path.suffix == ".pdf":
-            file_paths.append(path)
+            process_files.extend(list(path.glob("*.pdf")))
+        elif path.suffix == ".pdf":
+            process_files.append(path)
        else:
-            logging.warning(f"Ignoring path {path} as it is not a pdf file.")
+            logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")

-    # parse pdf's
-    documents = []
-    for file_path in file_paths:
-        loader = UnstructuredLoader(file_path=file_path, strategy="hi_res")
-        for doc in loader.lazy_load():
-            documents.append(doc)
+    loaded_document = []
+    for file in process_files:
+        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
+        for document in loader.lazy_load():
+            loaded_document.append(document)

-    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                   chunk_overlap=chunk_overlap,
                                                   add_start_index=add_start_index)
-    return text_splitter.split_documents(documents)
+    return text_splitter.split_documents(loaded_document)