forked from AI_team/Philosophy-RAG-demo
Add extra logging regarding PDF/web source parsing
This commit is contained in:
parent
0fe4a628d7
commit
47c1c1cd6e
@ -32,6 +32,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
|
||||
|
||||
The URLs will be fetched and split into chunks of text with the provided chunk size.
|
||||
"""
|
||||
logging.info("Web sources to the vector store.")
|
||||
|
||||
all_splits = []
|
||||
for url in urls:
|
||||
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
|
||||
@ -65,6 +67,9 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
|
||||
if len(all_splits) == 0:
|
||||
return
|
||||
|
||||
logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
|
||||
logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
|
||||
|
||||
filtered_splits = filter_complex_metadata(all_splits)
|
||||
vector_store.add_documents(documents=filtered_splits)
|
||||
|
||||
@ -82,7 +87,10 @@ def add_pdf_files(
|
||||
|
||||
The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
|
||||
"""
|
||||
logging.info("Adding PDF files to the vector store.")
|
||||
|
||||
pdf_files = get_all_local_pdf_files(file_paths)
|
||||
logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
|
||||
|
||||
new_pdfs = []
|
||||
for pdf_file in pdf_files:
|
||||
@ -93,6 +101,8 @@ def add_pdf_files(
|
||||
|
||||
if len(new_pdfs) == 0:
|
||||
return
|
||||
|
||||
logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
|
||||
|
||||
loaded_document = []
|
||||
for file in new_pdfs:
|
||||
@ -106,6 +116,9 @@ def add_pdf_files(
|
||||
|
||||
pdf_splits = text_splitter.split_documents(loaded_document)
|
||||
|
||||
logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
|
||||
logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
|
||||
|
||||
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user