From 47c1c1cd6e0541ebb96252e273ed883d73884f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nielson=20Jann=C3=A9?= Date: Fri, 28 Mar 2025 15:07:46 +0100 Subject: [PATCH] Add extra logging regarding pdf/web source parsing --- generic_rag/parsers/parser.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py index 36f0866..c562601 100644 --- a/generic_rag/parsers/parser.py +++ b/generic_rag/parsers/parser.py @@ -32,6 +32,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None: The URL's will be fetched and split into chunks of text with the provided chunk size. """ + logging.info("Web sources to the vector store.") + all_splits = [] for url in urls: if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0: @@ -65,6 +67,9 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None: if len(all_splits) == 0: return + logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents") + logging.info(f"Adding {len(all_splits)} vector store documents to vector store.") + filtered_splits = filter_complex_metadata(all_splits) vector_store.add_documents(documents=filtered_splits) @@ -82,7 +87,10 @@ def add_pdf_files( The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap. 
""" + logging.info("Adding PDF files to the vector store.") + pdf_files = get_all_local_pdf_files(file_paths) + logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") new_pdfs = [] for pdf_file in pdf_files: @@ -93,6 +101,8 @@ def add_pdf_files( if len(new_pdfs) == 0: return + + logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.") loaded_document = [] for file in new_pdfs: @@ -106,6 +116,9 @@ def add_pdf_files( pdf_splits = text_splitter.split_documents(loaded_document) + logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents") + logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.") + vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))