From 47c1c1cd6e0541ebb96252e273ed883d73884f9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?= <nielsonj@gmail.com>
Date: Fri, 28 Mar 2025 15:07:46 +0100
Subject: [PATCH] Add extra logging regarding pdf/web source parsing

---
 generic_rag/parsers/parser.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index 36f0866..c562601 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -32,6 +32,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
     The URL's will be fetched and split into chunks of text with the provided chunk size.
     """
+    logging.info("Web sources to the vector store.")
+    
     all_splits = []
     for url in urls:
         if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
@@ -65,6 +67,9 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
     if len(all_splits) == 0:
         return
 
+    logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
+    logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
+
     filtered_splits = filter_complex_metadata(all_splits)
     vector_store.add_documents(documents=filtered_splits)
 
@@ -82,7 +87,10 @@ def add_pdf_files(
 
     The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
     """
+    logging.info("Adding PDF files to the vector store.")
+    
     pdf_files = get_all_local_pdf_files(file_paths)
+    logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
 
     new_pdfs = []
     for pdf_file in pdf_files:
@@ -93,6 +101,8 @@ def add_pdf_files(
 
     if len(new_pdfs) == 0:
         return
+    
+    logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
 
     loaded_document = []
     for file in new_pdfs:
@@ -106,6 +116,9 @@ def add_pdf_files(
 
     pdf_splits = text_splitter.split_documents(loaded_document)
 
+    logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
+    logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
+
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))