Add extra logging regarding pdf/web source parsing

2025-03-28 15:07:46 +01:00 · 2025-03-28 15:07:46 +01:00 · 47c1c1cd6e
commit 47c1c1cd6e
parent 0fe4a628d7
1 changed files with 13 additions and 0 deletions
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@ -32,6 +32,8 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    The URL's will be fetched and split into chunks of text with the provided chunk size.
    """
    logging.info("Web sources to the vector store.")
    all_splits = []
    for url in urls:
        if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
@ -65,6 +67,9 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    if len(all_splits) == 0:
        return
    logging.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
    logging.info(f"Adding {len(all_splits)} vector store documents to vector store.")
    filtered_splits = filter_complex_metadata(all_splits)
    vector_store.add_documents(documents=filtered_splits)
@ -82,7 +87,10 @@ def add_pdf_files(
    The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
    """
    logging.info("Adding PDF files to the vector store.")
    pdf_files = get_all_local_pdf_files(file_paths)
    logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
    new_pdfs = []
    for pdf_file in pdf_files:
@ -94,6 +102,8 @@ def add_pdf_files(
    if len(new_pdfs) == 0:
        return
    logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.")
    loaded_document = []
    for file in new_pdfs:
        loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
@ -106,6 +116,9 @@ def add_pdf_files(
    pdf_splits = text_splitter.split_documents(loaded_document)
    logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
    logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
    vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))