forked from AI_team/Philosophy-RAG-demo
Add extra logging regarding pdf/web source parsing
This commit is contained in:
parent
0fe4a628d7
commit
47c1c1cd6e
@ -32,6 +32,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
|
|||||||
|
|
||||||
The URL's will be fetched and split into chunks of text with the provided chunk size.
|
The URL's will be fetched and split into chunks of text with the provided chunk size.
|
||||||
"""
|
"""
|
||||||
|
logging.info("Web sources to the vector store.")
|
||||||
|
|
||||||
all_splits = []
|
all_splits = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
|
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
|
||||||
@ -65,6 +67,9 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
|
|||||||
if len(all_splits) == 0:
|
if len(all_splits) == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
|
||||||
|
logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
|
||||||
|
|
||||||
filtered_splits = filter_complex_metadata(all_splits)
|
filtered_splits = filter_complex_metadata(all_splits)
|
||||||
vector_store.add_documents(documents=filtered_splits)
|
vector_store.add_documents(documents=filtered_splits)
|
||||||
|
|
||||||
@ -82,7 +87,10 @@ def add_pdf_files(
|
|||||||
|
|
||||||
The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
|
The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
|
||||||
"""
|
"""
|
||||||
|
logging.info("Adding PDF files to the vector store.")
|
||||||
|
|
||||||
pdf_files = get_all_local_pdf_files(file_paths)
|
pdf_files = get_all_local_pdf_files(file_paths)
|
||||||
|
logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
|
||||||
|
|
||||||
new_pdfs = []
|
new_pdfs = []
|
||||||
for pdf_file in pdf_files:
|
for pdf_file in pdf_files:
|
||||||
@ -93,6 +101,8 @@ def add_pdf_files(
|
|||||||
|
|
||||||
if len(new_pdfs) == 0:
|
if len(new_pdfs) == 0:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
|
||||||
|
|
||||||
loaded_document = []
|
loaded_document = []
|
||||||
for file in new_pdfs:
|
for file in new_pdfs:
|
||||||
@ -106,6 +116,9 @@ def add_pdf_files(
|
|||||||
|
|
||||||
pdf_splits = text_splitter.split_documents(loaded_document)
|
pdf_splits = text_splitter.split_documents(loaded_document)
|
||||||
|
|
||||||
|
logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
|
||||||
|
logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
|
||||||
|
|
||||||
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
|
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user