Fixes some logging API mistakes

Nielson Janné 2025-04-11 19:58:11 +02:00
parent fc59aa0d2f
commit b1e8f19f00
4 changed files with 18 additions and 18 deletions
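
Most of the changes below replace calls on the root logging module (logging.info, logging.warning) with calls on a per-module logger obtained from logging.getLogger(__name__). A minimal sketch of that pattern, with hypothetical module and function names not taken from this repository:

import logging

# One logger per module, named after the module. Records emitted through it
# carry the module name and propagate to whatever handlers the application
# configured on the root logger.
logger = logging.getLogger(__name__)


def load_source(url: str) -> None:
    # Preferred: the module logger. The log record shows where it came from,
    # and its level can be tuned per module.
    logger.info(f"Loading source {url}")

    # Discouraged: logging.info() logs straight on the root logger and, if no
    # handlers are configured yet, silently calls basicConfig() itself.
    # logging.info(f"Loading source {url}")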

View File

@@ -12,7 +12,7 @@ from graphs.ret_gen import RetGenLangGraph
from langchain_chroma import Chroma
from parsers.parser import add_pdf_files, add_urls
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
parser = argparse.ArgumentParser(description="A Sogeti Nederland Generic RAG demo.")
@@ -162,7 +162,7 @@ async def set_starters():
try:
starters.append(cl.Starter(label=starter["label"], message=starter["message"]))
except KeyError:
-logging.warning(
+logger.warning(
"CHAINLIT_STARTERS environment is not a list with dictionaries containing 'label' and 'message' keys."
)
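
The level change at the top of this file pairs with the new logger.debug(...) calls elsewhere in the commit: library modules drop their own logging.basicConfig(level=logging.INFO) lines and leave configuration to this entry point, and the configured level moves from INFO to DEBUG so debug records are not filtered out. A small illustration of that standard-library behaviour (the logger name here is illustrative):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("graphs.cond_ret_gen")

logger.debug("query: ...")            # filtered out: DEBUG is below the configured INFO level
logger.info("retrieved 4 documents")  # emitted

# Raising the root level, as this commit does via basicConfig(level=logging.DEBUG)
# in the entry point, is what lets the debug records through. A second
# basicConfig() call is a no-op once handlers exist, which is why library
# modules should not call it at all.
logging.getLogger().setLevel(logging.DEBUG)
logger.debug("query: ...")            # now emitted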

View File

@@ -16,6 +16,7 @@ from langgraph.graph import END, MessagesState, StateGraph
from langgraph.prebuilt import InjectedStore, ToolNode, tools_condition
from typing_extensions import Annotated
+logger = logging.getLogger(__name__)
class CondRetGenLangGraph:
def __init__(self, vector_store: Chroma, chat_model: BaseChatModel, embedding_model: Embeddings):
@@ -83,8 +84,8 @@ class CondRetGenLangGraph:
# Furthermore, it can not and should not have the `self` parameter.
# If you want to pass on state, please refer to:
# https://python.langchain.com/docs/concepts/tools/#special-type-annotations
-logging.info(f"Query: {query}")
-logging.info(f"user content: {full_user_content}")
+logger.debug(f"query: {query}")
+logger.debug(f"user content: {full_user_content}")
retrieved_docs = []
retrieved_docs = vector_store.similarity_search(query, k=4)
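
The comment block in the hunk above describes the shape required of the retrieval tool: a plain module-level function without self, receiving shared state through LangChain's special type annotations. A heavily simplified sketch of such a tool using the InjectedStore annotation imported above; the function body and store layout are assumptions, not the repository's actual implementation:

from typing_extensions import Annotated

from langchain_core.tools import tool
from langgraph.prebuilt import InjectedStore
from langgraph.store.base import BaseStore


@tool
def retrieve(query: str, store: Annotated[BaseStore, InjectedStore()]) -> str:
    """Look up documents related to the query."""
    # Arguments annotated with InjectedStore() are hidden from the model and
    # filled in by the runtime (e.g. ToolNode), so the model only supplies `query`.
    results = store.search(("docs",), query=query, limit=4)
    return "\n\n".join(str(item.value) for item in results)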

View File

@@ -11,7 +11,6 @@ from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, START, StateGraph
from typing_extensions import List, TypedDict
-logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -42,6 +41,7 @@ class RetGenLangGraph:
yield response.content
def _retrieve(self, state: State) -> dict:
+logger.debug(f"querying VS for: {state["question"]}")
self.last_retrieved_docs = self.vector_store.similarity_search(state["question"])
return {"context": self.last_retrieved_docs}
@@ -77,7 +77,7 @@ class RetGenLangGraph:
pdf_sources[source].add(doc.metadata["page"])
if len(pdf_sources[source]) == 0:
-logging.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
+logger.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
return pdf_sources

View File

@@ -9,7 +9,6 @@ from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader
-logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -32,12 +31,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
The URL's will be fetched and split into chunks of text with the provided chunk size.
"""
-logging.info("Web sources to the vector store.")
+logger.info("Web sources to the vector store.")
all_splits = []
for url in urls:
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
-logging.info(f"Skipping URL {url}, as it is already in the database.")
+logger.info(f"Skipping URL {url}, as it is already in the database.")
continue
response = requests.get(url)
@@ -67,8 +66,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
if len(all_splits) == 0:
return
-logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
-logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
+logger.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
+logger.info(f"Adding {len(all_splits)} vector store documents to vector store.")
filtered_splits = filter_complex_metadata(all_splits)
vector_store.add_documents(documents=filtered_splits)
@@ -87,22 +86,22 @@ def add_pdf_files(
The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
"""
-logging.info("Adding PDF files to the vector store.")
+logger.info("Adding PDF files to the vector store.")
pdf_files = get_all_local_pdf_files(file_paths)
-logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
+logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
new_pdfs = []
for pdf_file in pdf_files:
if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
new_pdfs.append(pdf_file)
else:
-logging.info(f"Skipping PDF {pdf_file}, as it is already in the database.")
+logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")
if len(new_pdfs) == 0:
return
-logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
+logger.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
loaded_document = []
for file in new_pdfs:
@@ -116,8 +115,8 @@ def add_pdf_files(
pdf_splits = text_splitter.split_documents(loaded_document)
-logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
-logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
+logger.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
+logger.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
@@ -136,6 +135,6 @@ def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
elif path.suffix == ".pdf":
all_pdf_files.append(path)
else:
-logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
return all_pdf_files