From b1e8f19f00eac32dd8374a0e16eae4571c4be562 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?=
Date: Fri, 11 Apr 2025 19:58:11 +0200
Subject: [PATCH] Fixes some logging API mistakes

---
 generic_rag/app.py                 |  4 ++--
 generic_rag/graphs/cond_ret_gen.py |  5 +++--
 generic_rag/graphs/ret_gen.py      |  4 ++--
 generic_rag/parsers/parser.py      | 23 +++++++++++------------
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/generic_rag/app.py b/generic_rag/app.py
index c3a3fb1..73bb3f5 100644
--- a/generic_rag/app.py
+++ b/generic_rag/app.py
@@ -12,7 +12,7 @@ from graphs.ret_gen import RetGenLangGraph
 from langchain_chroma import Chroma
 from parsers.parser import add_pdf_files, add_urls
 
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 parser = argparse.ArgumentParser(description="A Sogeti Nederland Generic RAG demo.")
@@ -162,7 +162,7 @@ async def set_starters():
         try:
             starters.append(cl.Starter(label=starter["label"], message=starter["message"]))
         except KeyError:
-            logging.warning(
+            logger.warning(
                 "CHAINLIT_STARTERS environment is not a list with dictionaries containing 'label' and 'message' keys."
             )
 
diff --git a/generic_rag/graphs/cond_ret_gen.py b/generic_rag/graphs/cond_ret_gen.py
index 8b6d788..8fb554c 100644
--- a/generic_rag/graphs/cond_ret_gen.py
+++ b/generic_rag/graphs/cond_ret_gen.py
@@ -16,6 +16,7 @@ from langgraph.graph import END, MessagesState, StateGraph
 from langgraph.prebuilt import InjectedStore, ToolNode, tools_condition
 from typing_extensions import Annotated
 
+logger = logging.getLogger(__name__)
 
 class CondRetGenLangGraph:
     def __init__(self, vector_store: Chroma, chat_model: BaseChatModel, embedding_model: Embeddings):
@@ -83,8 +84,8 @@ class CondRetGenLangGraph:
         # Furthermore, it can not and should not have the `self` parameter.
         # If you want to pass on state, please refer to:
         # https://python.langchain.com/docs/concepts/tools/#special-type-annotations
-        logging.info(f"Query: {query}")
-        logging.info(f"user content: {full_user_content}")
+        logger.debug(f"query: {query}")
+        logger.debug(f"user content: {full_user_content}")
 
         retrieved_docs = []
         retrieved_docs = vector_store.similarity_search(query, k=4)
diff --git a/generic_rag/graphs/ret_gen.py b/generic_rag/graphs/ret_gen.py
index ba19f0f..c4262c3 100644
--- a/generic_rag/graphs/ret_gen.py
+++ b/generic_rag/graphs/ret_gen.py
@@ -11,7 +11,6 @@ from langgraph.checkpoint.memory import MemorySaver
 from langgraph.graph import END, START, StateGraph
 from typing_extensions import List, TypedDict
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -42,6 +41,7 @@
             yield response.content
 
     def _retrieve(self, state: State) -> dict:
+        logger.debug(f"Querying vector store for: {state['question']}")
         self.last_retrieved_docs = self.vector_store.similarity_search(state["question"])
         return {"context": self.last_retrieved_docs}
 
@@ -77,7 +77,7 @@
                 pdf_sources[source].add(doc.metadata["page"])
 
         if len(pdf_sources[source]) == 0:
-            logging.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
+            logger.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
 
         return pdf_sources
 
diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index c562601..ef3b374 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -9,7 +9,6 @@ from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -32,12 +31,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
     The URL's will be fetched and split into chunks of text with the provided chunk size.
     """
-    logging.info("Web sources to the vector store.")
+    logger.info("Adding web sources to the vector store.")
 
     all_splits = []
     for url in urls:
         if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
-            logging.info(f"Skipping URL {url}, as it is already in the database.")
+            logger.info(f"Skipping URL {url}, as it is already in the database.")
             continue
 
         response = requests.get(url)
@@ -67,8 +66,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
     if len(all_splits) == 0:
         return
 
-    logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
-    logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
+    logger.info(f"{len(urls)} web sources split into {len(all_splits)} vector store documents.")
+    logger.info(f"Adding {len(all_splits)} vector store documents to the vector store.")
 
     filtered_splits = filter_complex_metadata(all_splits)
     vector_store.add_documents(documents=filtered_splits)
@@ -87,22 +86,22 @@ def add_pdf_files(
 
     The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
""" - logging.info("Adding PDF files to the vector store.") + logger.info("Adding PDF files to the vector store.") pdf_files = get_all_local_pdf_files(file_paths) - logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") + logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") new_pdfs = [] for pdf_file in pdf_files: if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0: new_pdfs.append(pdf_file) else: - logging.info(f"Skipping PDF {pdf_file}, as it is already in the database.") + logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.") if len(new_pdfs) == 0: return - logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.") + logger.info(f"{len(new_pdfs)} PDF's to add to the vector store.") loaded_document = [] for file in new_pdfs: @@ -116,8 +115,8 @@ def add_pdf_files( pdf_splits = text_splitter.split_documents(loaded_document) - logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents") - logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.") + logger.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents") + logger.info(f"Adding {len(pdf_splits)} vector store documents to vector store.") vector_store.add_documents(documents=filter_complex_metadata(pdf_splits)) @@ -136,6 +135,6 @@ def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]: elif path.suffix == ".pdf": all_pdf_files.append(path) else: - logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.") + logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.") return all_pdf_files