From b1e8f19f00eac32dd8374a0e16eae4571c4be562 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?=
Date: Fri, 11 Apr 2025 19:58:11 +0200
Subject: [PATCH] Fixes some logging API mistakes

---
 generic_rag/app.py                 |  4 ++--
 generic_rag/graphs/cond_ret_gen.py |  5 +++--
 generic_rag/graphs/ret_gen.py      |  4 ++--
 generic_rag/parsers/parser.py      | 23 +++++++++++------------
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/generic_rag/app.py b/generic_rag/app.py
index c3a3fb1..73bb3f5 100644
--- a/generic_rag/app.py
+++ b/generic_rag/app.py
@@ -12,7 +12,7 @@ from graphs.ret_gen import RetGenLangGraph
 from langchain_chroma import Chroma
 from parsers.parser import add_pdf_files, add_urls
 
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 parser = argparse.ArgumentParser(description="A Sogeti Nederland Generic RAG demo.")
@@ -162,7 +162,7 @@ async def set_starters():
         try:
             starters.append(cl.Starter(label=starter["label"], message=starter["message"]))
         except KeyError:
-            logging.warning(
+            logger.warning(
                 "CHAINLIT_STARTERS environment is not a list with dictionaries containing 'label' and 'message' keys."
             )
 
diff --git a/generic_rag/graphs/cond_ret_gen.py b/generic_rag/graphs/cond_ret_gen.py
index 8b6d788..8fb554c 100644
--- a/generic_rag/graphs/cond_ret_gen.py
+++ b/generic_rag/graphs/cond_ret_gen.py
@@ -16,6 +16,7 @@ from langgraph.graph import END, MessagesState, StateGraph
 from langgraph.prebuilt import InjectedStore, ToolNode, tools_condition
 from typing_extensions import Annotated
 
+logger = logging.getLogger(__name__)
 
 class CondRetGenLangGraph:
     def __init__(self, vector_store: Chroma, chat_model: BaseChatModel, embedding_model: Embeddings):
@@ -83,8 +84,8 @@ class CondRetGenLangGraph:
         # Furthermore, it can not and should not have the `self` parameter.
         # If you want to pass on state, please refer to:
         # https://python.langchain.com/docs/concepts/tools/#special-type-annotations
-        logging.info(f"Query: {query}")
-        logging.info(f"user content: {full_user_content}")
+        logger.debug(f"query: {query}")
+        logger.debug(f"user content: {full_user_content}")
 
         retrieved_docs = []
         retrieved_docs = vector_store.similarity_search(query, k=4)
diff --git a/generic_rag/graphs/ret_gen.py b/generic_rag/graphs/ret_gen.py
index ba19f0f..c4262c3 100644
--- a/generic_rag/graphs/ret_gen.py
+++ b/generic_rag/graphs/ret_gen.py
@@ -11,7 +11,6 @@ from langgraph.checkpoint.memory import MemorySaver
 from langgraph.graph import END, START, StateGraph
 from typing_extensions import List, TypedDict
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -42,6 +41,7 @@
             yield response.content
 
     def _retrieve(self, state: State) -> dict:
+        logger.debug(f"Querying vector store for: {state['question']}")
         self.last_retrieved_docs = self.vector_store.similarity_search(state["question"])
         return {"context": self.last_retrieved_docs}
 
@@ -77,7 +77,7 @@
                 pdf_sources[source].add(doc.metadata["page"])
 
         if len(pdf_sources[source]) == 0:
-            logging.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
+            logger.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
 
         return pdf_sources
 
diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index c562601..ef3b374 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -9,7 +9,6 @@ from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -32,12 +31,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
     The URL's will be fetched and split into chunks of text with the provided chunk size.
     """
-    logging.info("Web sources to the vector store.")
+    logger.info("Adding web sources to the vector store.")
 
     all_splits = []
     for url in urls:
         if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
-            logging.info(f"Skipping URL {url}, as it is already in the database.")
+            logger.info(f"Skipping URL {url}, as it is already in the database.")
             continue
 
         response = requests.get(url)
@@ -67,8 +66,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
     if len(all_splits) == 0:
         return
 
-    logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
-    logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
+    logger.info(f"{len(urls)} web sources split into {len(all_splits)} vector store documents.")
+    logger.info(f"Adding {len(all_splits)} vector store documents to the vector store.")
 
     filtered_splits = filter_complex_metadata(all_splits)
     vector_store.add_documents(documents=filtered_splits)
@@ -87,22 +86,22 @@ def add_pdf_files(
 
     The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
""" - logging.info("Adding PDF files to the vector store.") + logger.info("Adding PDF files to the vector store.") pdf_files = get_all_local_pdf_files(file_paths) - logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") + logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") new_pdfs = [] for pdf_file in pdf_files: if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0: new_pdfs.append(pdf_file) else: - logging.info(f"Skipping PDF {pdf_file}, as it is already in the database.") + logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.") if len(new_pdfs) == 0: return - logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.") + logger.info(f"{len(new_pdfs)} PDF's to add to the vector store.") loaded_document = [] for file in new_pdfs: @@ -116,8 +115,8 @@ def add_pdf_files( pdf_splits = text_splitter.split_documents(loaded_document) - logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents") - logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.") + logger.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents") + logger.info(f"Adding {len(pdf_splits)} vector store documents to vector store.") vector_store.add_documents(documents=filter_complex_metadata(pdf_splits)) @@ -136,6 +135,6 @@ def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]: elif path.suffix == ".pdf": all_pdf_files.append(path) else: - logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.") + logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.") return all_pdf_files