Add extra logging regarding pdf/web source parsing

This commit is contained in:
Nielson Janné 2025-03-28 15:07:46 +01:00
parent 0fe4a628d7
commit 47c1c1cd6e

View File

@ -32,6 +32,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
The URL's will be fetched and split into chunks of text with the provided chunk size. The URL's will be fetched and split into chunks of text with the provided chunk size.
""" """
logging.info("Web sources to the vector store.")
all_splits = [] all_splits = []
for url in urls: for url in urls:
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0: if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
@ -65,6 +67,9 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
if len(all_splits) == 0: if len(all_splits) == 0:
return return
logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
filtered_splits = filter_complex_metadata(all_splits) filtered_splits = filter_complex_metadata(all_splits)
vector_store.add_documents(documents=filtered_splits) vector_store.add_documents(documents=filtered_splits)
@ -82,7 +87,10 @@ def add_pdf_files(
The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap. The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
""" """
logging.info("Adding PDF files to the vector store.")
pdf_files = get_all_local_pdf_files(file_paths) pdf_files = get_all_local_pdf_files(file_paths)
logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
new_pdfs = [] new_pdfs = []
for pdf_file in pdf_files: for pdf_file in pdf_files:
@ -94,6 +102,8 @@ def add_pdf_files(
if len(new_pdfs) == 0: if len(new_pdfs) == 0:
return return
logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
loaded_document = [] loaded_document = []
for file in new_pdfs: for file in new_pdfs:
loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file) loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
@ -106,6 +116,9 @@ def add_pdf_files(
pdf_splits = text_splitter.split_documents(loaded_document) pdf_splits = text_splitter.split_documents(loaded_document)
logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits)) vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))