import logging
from pathlib import Path

import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]


def code_handler(element: Tag) -> str:
    """
    Custom handler for code elements.
    """
    data_lang = element.get("data-lang")
    code_format = f"<code:{data_lang}>{element.get_text()}</code>"

    return code_format

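# A minimal sketch (not part of the original module) of what code_handler produces, assuming
# the source page's highlighter emits <code> tags carrying a data-lang attribute:
#
#   >>> snippet = BeautifulSoup('<code data-lang="python">print(1)</code>', "html.parser").code
#   >>> code_handler(snippet)
#   '<code:python>print(1)</code>'
#
# Elements without a data-lang attribute come out as "<code:None>...</code>".
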
def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    """
    Adds a list of URLs as vector documents to the provided vector store.

    The URLs will be fetched and split into chunks of text with the provided chunk size.
    """
    logger.info("Adding web sources to the vector store.")

    all_splits = []
    for url in urls:
        if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
            logger.info(f"Skipping URL {url}, as it is already in the database.")
            continue

        response = requests.get(url)
        html_content = response.text

        soup = BeautifulSoup(html_content, "html.parser")

        web_splitter = HTMLSemanticPreservingSplitter(
            headers_to_split_on=headers_to_split_on,
            separators=["\n\n", "\n", ". ", "! ", "? "],
            max_chunk_size=chunk_size,
            preserve_images=True,
            preserve_videos=True,
            elements_to_preserve=["table", "ul", "ol", "code"],
            denylist_tags=["script", "style", "head"],
            custom_handlers={"code": code_handler},
        )

        splits = web_splitter.split_text(str(soup))

        for split in splits:
            split.metadata["source"] = url
            split.metadata["filetype"] = "web"

        all_splits.extend(splits)

    if len(all_splits) == 0:
        return

    logger.info(f"{len(urls)} web sources split into {len(all_splits)} vector store documents.")
    logger.info(f"Adding {len(all_splits)} vector store documents to the vector store.")

    filtered_splits = filter_complex_metadata(all_splits)
    vector_store.add_documents(documents=filtered_splits)

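# A minimal re-ingestion sketch (not part of the original module): because add_urls first checks
# vector_store.get(where={"source": url}, limit=1), calling it twice with the same URL list is
# effectively idempotent. The store and URL below are assumptions for illustration only.
#
#   urls = ["https://plato.stanford.edu/entries/epistemology/"]
#   add_urls(store, urls, chunk_size=1000)  # fetches, splits, and stores the page
#   add_urls(store, urls, chunk_size=1000)  # logs "Skipping URL ..." and stores nothing new
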
def add_pdf_files(
    vector_store: Chroma,
    file_paths: list[Path],
    chunk_size: int,
    chunk_overlap: int,
    add_start_index: bool,
    unstructured: bool,
) -> None:
    """
    Adds a list of PDF files as vector documents to the provided vector store.

    Each PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
    """
    logger.info("Adding PDF files to the vector store.")

    pdf_files = get_all_local_pdf_files(file_paths)
    logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")

    new_pdfs = []
    for pdf_file in pdf_files:
        if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
            new_pdfs.append(pdf_file)
        else:
            logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")

    if len(new_pdfs) == 0:
        return

    logger.info(f"{len(new_pdfs)} PDFs to add to the vector store.")

    loaded_documents = []
    for file in new_pdfs:
        loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructured else PyMuPDFLoader(file_path=file)
        for document in loader.lazy_load():
            loaded_documents.append(document)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
    )

    pdf_splits = text_splitter.split_documents(loaded_documents)

    logger.info(f"{len(new_pdfs)} PDFs split into {len(pdf_splits)} vector store documents.")
    logger.info(f"Adding {len(pdf_splits)} vector store documents to the vector store.")

    vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))

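# A hedged note on the loader toggle above (my reading, not stated by the original author):
# PyMuPDFLoader does fast native text extraction and is usually enough for digitally-born PDFs,
# while UnstructuredLoader with strategy="hi_res" runs layout detection (and OCR where needed),
# which is slower but tends to handle scanned or layout-heavy documents better.
# A sketch with assumed paths:
#
#   add_pdf_files(store, [Path("./scanned_papers")], chunk_size=1000, chunk_overlap=200,
#                 add_start_index=True, unstructured=True)  # use the hi_res Unstructured pipeline
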
def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
    """
    Takes a list of local paths, which may contain directory paths and/or direct file paths,
    and returns all paths that point to a PDF file, plus any PDF files found in the given directories.

    This function does not scan directories recursively.
    """
    all_pdf_files = []
    for path in local_paths:
        if path.is_dir():
            all_pdf_files.extend(list(path.glob("*.pdf")))
        elif path.suffix == ".pdf":
            all_pdf_files.append(path)
        else:
            logger.warning(f"Ignoring path {path} as it is not a folder or a PDF file.")

    return all_pdf_files
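

if __name__ == "__main__":
    # A minimal end-to-end sketch, not part of the original module. The embedding backend,
    # collection name, URL, and paths below are assumptions for illustration only.
    from langchain_huggingface import HuggingFaceEmbeddings

    store = Chroma(
        collection_name="philosophy",
        embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
        persist_directory="./chroma_db",
    )

    add_urls(store, ["https://plato.stanford.edu/entries/logic-classical/"], chunk_size=1000)
    add_pdf_files(
        store,
        [Path("./papers")],
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
        unstructured=False,
    )

    # Retrieve a few chunks to confirm the ingestion worked.
    for doc in store.similarity_search("What is a valid argument?", k=3):
        print(doc.metadata.get("source"), "->", doc.page_content[:80])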