forked from AI_team/Philosophy-RAG-demo

Fixes some logging API mistakes

This commit is contained in:
parent fc59aa0d2f
commit b1e8f19f00
@@ -12,7 +12,7 @@ from graphs.ret_gen import RetGenLangGraph
 from langchain_chroma import Chroma
 from parsers.parser import add_pdf_files, add_urls
 
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 parser = argparse.ArgumentParser(description="A Sogeti Nederland Generic RAG demo.")
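For readers skimming the diff: `logging.basicConfig` configures the root logger, and by default only the first call has any effect, so the demo entry point above is the one place it belongs. Raising the level to DEBUG here is what lets the `logger.debug(...)` calls introduced further down actually appear. A minimal, self-contained sketch of that relationship (the dotted module name is borrowed from the import above, purely for illustration):

import logging

logging.basicConfig(level=logging.DEBUG)             # entry point: configure the root logger once
logger = logging.getLogger(__name__)

module_logger = logging.getLogger("graphs.ret_gen")  # what getLogger(__name__) yields inside that module
module_logger.debug("visible because the root level is DEBUG")

logging.basicConfig(level=logging.INFO)              # ignored: the root logger already has a handler
module_logger.debug("still visible; the first configuration wins")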
@@ -162,7 +162,7 @@ async def set_starters():
         try:
             starters.append(cl.Starter(label=starter["label"], message=starter["message"]))
         except KeyError:
-            logging.warning(
+            logger.warning(
                 "CHAINLIT_STARTERS environment is not a list with dictionaries containing 'label' and 'message' keys."
             )
 
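This hunk shows the pattern the whole commit corrects: code logging through the module-level functions such as `logging.warning(...)`, which always go to the root logger, even though a named `logger = logging.getLogger(__name__)` already exists. Both reach the output, but the named logger carries the originating module in `%(name)s` and can be tuned per module. A short sketch of the difference, assuming a format string that prints the logger name:

import logging

logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(name)s: %(message)s")
logger = logging.getLogger(__name__)

logging.warning("via the root logger")    # -> WARNING root: via the root logger
logger.warning("via the module logger")   # -> WARNING __main__: via the module logger (when run as a script)

# Named loggers can be adjusted individually without touching the root configuration:
logging.getLogger("parsers.parser").setLevel(logging.WARNING)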
@@ -16,6 +16,7 @@ from langgraph.graph import END, MessagesState, StateGraph
 from langgraph.prebuilt import InjectedStore, ToolNode, tools_condition
 from typing_extensions import Annotated
 
+logger = logging.getLogger(__name__)
 
 class CondRetGenLangGraph:
     def __init__(self, vector_store: Chroma, chat_model: BaseChatModel, embedding_model: Embeddings):
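This module previously created no logger of its own, which is presumably why the code below reached for the root-level `logging.info` calls fixed in the next hunk. `logging.getLogger(__name__)` is cheap and idempotent: the same name always returns the same logger object, and the dotted module name places it in the logging hierarchy, so configuration applied to a parent (or the root) flows down. A small sketch, using a hypothetical module path:

import logging

a = logging.getLogger("graphs.cond_ret_gen")  # hypothetical dotted path, for illustration only
b = logging.getLogger("graphs.cond_ret_gen")
assert a is b                                 # repeated calls return the same logger

# With no level of its own, the child inherits its effective level from its parent:
logging.getLogger("graphs").setLevel(logging.INFO)
assert a.getEffectiveLevel() == logging.INFO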
@@ -83,8 +84,8 @@ class CondRetGenLangGraph:
             # Furthermore, it can not and should not have the `self` parameter.
             # If you want to pass on state, please refer to:
             # https://python.langchain.com/docs/concepts/tools/#special-type-annotations
-            logging.info(f"Query: {query}")
-            logging.info(f"user content: {full_user_content}")
+            logger.debug(f"query: {query}")
+            logger.debug(f"user content: {full_user_content}")
 
             retrieved_docs = []
             retrieved_docs = vector_store.similarity_search(query, k=4)
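Demoting these query dumps from INFO to DEBUG is sensible; one caveat worth knowing is that f-strings are evaluated eagerly, before the level check, so the message is still built even when DEBUG records are discarded. The logging API's %-style placeholders defer that formatting until a handler accepts the record. A sketch of the deferred form, offered as an alternative rather than what this commit does:

import logging

logger = logging.getLogger(__name__)
query = "what is the categorical imperative?"  # illustrative stand-in for the tool's argument

logger.debug(f"query: {query}")     # eager: the f-string is formatted before debug() is even called
logger.debug("query: %s", query)    # deferred: formatting happens only if the record is emitted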
@@ -11,7 +11,6 @@ from langgraph.checkpoint.memory import MemorySaver
 from langgraph.graph import END, START, StateGraph
 from typing_extensions import List, TypedDict
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
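Dropping `logging.basicConfig` from this module (and from the parsers module below) follows the usual guidance: only the application entry point should configure handlers. `basicConfig` mutates the shared root logger and, by default, is a no-op once any handler is attached, so with several modules calling it at import time the effective level simply depended on import order. If a library module should stay silent when the embedding application configures nothing, the documented idiom is a `NullHandler`, shown here as an optional extra rather than something this commit adds:

import logging

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())  # optional: keeps the module silent (no fallback stderr
                                          # output) when the application has not configured logging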
@@ -42,6 +41,7 @@ class RetGenLangGraph:
         yield response.content
 
     def _retrieve(self, state: State) -> dict:
+        logger.debug(f"querying VS for: {state["question"]}")
         self.last_retrieved_docs = self.vector_store.similarity_search(state["question"])
         return {"context": self.last_retrieved_docs}
 
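One caution on the added line exactly as written: reusing double quotes inside a double-quoted f-string (`{state["question"]}`) is only valid syntax on Python 3.12 and newer; older interpreters fail with a SyntaxError at import time. If the demo needs to run on earlier versions, either spelling below is safe. This is a suggestion, not part of the commit:

import logging

logger = logging.getLogger(__name__)
state = {"question": "what is virtue ethics?"}  # stand-in for the graph's State mapping

logger.debug("querying VS for: %s", state["question"])   # works on any supported version, defers formatting
logger.debug(f"querying VS for: {state['question']}")     # keeps the f-string, swaps the inner quotes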
@@ -77,7 +77,7 @@ class RetGenLangGraph:
                 pdf_sources[source].add(doc.metadata["page"])
 
             if len(pdf_sources[source]) == 0:
-                logging.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
+                logger.warning(f"PDF source {source} has no page number. Please check the metadata of the document.")
 
         return pdf_sources
 
@@ -9,7 +9,6 @@ from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
@@ -32,12 +31,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
     The URL's will be fetched and split into chunks of text with the provided chunk size.
     """
-    logging.info("Web sources to the vector store.")
+    logger.info("Web sources to the vector store.")
 
     all_splits = []
     for url in urls:
         if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
-            logging.info(f"Skipping URL {url}, as it is already in the database.")
+            logger.info(f"Skipping URL {url}, as it is already in the database.")
             continue
 
         response = requests.get(url)
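Unrelated to the logging change, but the surrounding context shows the dedup check both ingest paths rely on: ask the Chroma store for at most one document whose `source` metadata matches before fetching or parsing anything. A tiny helper expressing that check, assuming the same `vector_store.get(where=..., limit=...)` call used above:

from langchain_chroma import Chroma


def already_ingested(vector_store: Chroma, source: str) -> bool:
    """Return True if at least one stored chunk carries this source in its metadata."""
    hits = vector_store.get(where={"source": source}, limit=1)
    return len(hits["ids"]) > 0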
@@ -67,8 +66,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
     if len(all_splits) == 0:
         return
 
-    logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
-    logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
+    logger.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
+    logger.info(f"Adding {len(all_splits)} vector store documents to vector store.")
 
     filtered_splits = filter_complex_metadata(all_splits)
     vector_store.add_documents(documents=filtered_splits)
@@ -87,22 +86,22 @@ def add_pdf_files(
 
     The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
     """
-    logging.info("Adding PDF files to the vector store.")
+    logger.info("Adding PDF files to the vector store.")
 
     pdf_files = get_all_local_pdf_files(file_paths)
-    logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
+    logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
 
     new_pdfs = []
     for pdf_file in pdf_files:
         if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
             new_pdfs.append(pdf_file)
         else:
-            logging.info(f"Skipping PDF {pdf_file}, as it is already in the database.")
+            logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")
 
     if len(new_pdfs) == 0:
         return
 
-    logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
+    logger.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
 
     loaded_document = []
     for file in new_pdfs:
@@ -116,8 +115,8 @@ def add_pdf_files(
 
     pdf_splits = text_splitter.split_documents(loaded_document)
 
-    logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
-    logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
+    logger.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
+    logger.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
 
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
 
@@ -136,6 +135,6 @@ def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
         elif path.suffix == ".pdf":
             all_pdf_files.append(path)
         else:
-            logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+            logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
 
     return all_pdf_files