forked from AI_team/Philosophy-RAG-demo
Resolve bug in process_local_files and clean up variable names
This commit is contained in:
parent
e259808322
commit
860cfc3438
@ -63,25 +63,22 @@ def process_web_sites(websites: list[str], chunk_size: int) -> list[Document]:
|
|||||||
def process_local_files(
|
def process_local_files(
|
||||||
local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
|
local_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
|
||||||
) -> list[Document]:
|
) -> list[Document]:
|
||||||
# get all files
|
process_files = []
|
||||||
file_paths = []
|
|
||||||
for path in local_paths:
|
for path in local_paths:
|
||||||
if path.is_dir():
|
if path.is_dir():
|
||||||
file_paths.extend(list(path.glob("*.pdf")))
|
process_files.extend(list(path.glob("*.pdf")))
|
||||||
if path.suffix == ".pdf":
|
elif path.suffix == ".pdf":
|
||||||
file_paths.append(path)
|
process_files.append(path)
|
||||||
else:
|
else:
|
||||||
logging.warning(f"Ignoring path {path} as it is not a pdf file.")
|
logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
|
||||||
|
|
||||||
# parse pdf's
|
loaded_document = []
|
||||||
documents = []
|
for file in process_files:
|
||||||
for file_path in file_paths:
|
loader = UnstructuredLoader(file_path=file, strategy="hi_res")
|
||||||
loader = UnstructuredLoader(file_path=file_path, strategy="hi_res")
|
for document in loader.lazy_load():
|
||||||
for doc in loader.lazy_load():
|
loaded_document.append(document)
|
||||||
documents.append(doc)
|
|
||||||
|
|
||||||
# split documents
|
|
||||||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
|
||||||
chunk_overlap=chunk_overlap,
|
chunk_overlap=chunk_overlap,
|
||||||
add_start_index=add_start_index)
|
add_start_index=add_start_index)
|
||||||
return text_splitter.split_documents(documents)
|
return text_splitter.split_documents(loaded_document)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user