Philosophy-RAG-demo/generic_rag/parsers/parser.py

import logging
from pathlib import Path
import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader
logger = logging.getLogger(__name__)
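# Header tags used by HTMLSemanticPreservingSplitter as split boundaries; the second element
# of each tuple is the metadata key attached to the resulting document chunks.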
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
def code_handler(element: Tag) -> str:
"""
Custom handler for code elements.
"""
data_lang = element.get("data-lang")
code_format = f"<code:{data_lang}>{element.get_text()}</code>"
return code_format
def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
"""
Adds a list of URLs as vector documents to the provided vector store.
    The URLs will be fetched and split into text chunks of the provided chunk size.
    """
    logger.info("Adding web sources to the vector store.")
all_splits = []
for url in urls:
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
logger.info(f"Skipping URL {url}, as it is already in the database.")
continue
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail fast instead of indexing an HTTP error page
        html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")
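        # The splitter keeps tables, lists and code blocks intact, renders <code> elements
        # through code_handler, and drops script/style/head content entirely.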
web_splitter = HTMLSemanticPreservingSplitter(
headers_to_split_on=headers_to_split_on,
separators=["\n\n", "\n", ". ", "! ", "? "],
max_chunk_size=chunk_size,
preserve_images=True,
preserve_videos=True,
elements_to_preserve=["table", "ul", "ol", "code"],
denylist_tags=["script", "style", "head"],
custom_handlers={"code": code_handler},
)
splits = web_splitter.split_text(str(soup))
for split in splits:
split.metadata["source"] = url
split.metadata["filetype"] = "web"
all_splits.extend(splits)
if len(all_splits) == 0:
return
logger.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
logger.info(f"Adding {len(all_splits)} vector store documents to vector store.")
filtered_splits = filter_complex_metadata(all_splits)
vector_store.add_documents(documents=filtered_splits)
def add_pdf_files(
vector_store: Chroma,
file_paths: list[Path],
chunk_size: int,
chunk_overlap: int,
add_start_index: bool,
unstructerd: bool,
) -> None:
"""
Adds a list of PDF files as vector documents to the provided vector store.
    Each PDF file is parsed per page and split into text chunks of the provided chunk size and overlap.
"""
logger.info("Adding PDF files to the vector store.")
pdf_files = get_all_local_pdf_files(file_paths)
logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
new_pdfs = []
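    # A PDF counts as already indexed when at least one stored document has a
    # "source" metadata entry equal to the file path.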
for pdf_file in pdf_files:
if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
new_pdfs.append(pdf_file)
else:
logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")
if len(new_pdfs) == 0:
return
logger.info(f"{len(new_pdfs)} PDF(s) to add to the vector store.")
loaded_document = []
for file in new_pdfs:
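        # "hi_res" runs unstructured's layout-detection pipeline (slower, extra model
        # dependencies); PyMuPDFLoader does plain per-page text extraction.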
loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
for document in loader.lazy_load():
loaded_document.append(document)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
)
pdf_splits = text_splitter.split_documents(loaded_document)
logger.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
logger.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
def add_text_files(
vector_store: Chroma,
file_paths: list[Path],
chunk_size: int,
chunk_overlap: int,
add_start_index: bool,
unstructerd: bool,
) -> None:
"""
    Adds a list of text files as vector documents to the provided vector store.
    Each text file is parsed and split into text chunks of the provided chunk size and overlap.
"""
logger.info("Adding Text files to the vector store.")
text_files = get_all_local_text_files(file_paths)
new_txts = []
for txt_file in text_files:
if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
new_txts.append(txt_file)
else:
logger.info(f"Skipping PDF {txt_file}, as it is already in the database.")
if len(new_txts) == 0:
return
logger.info(f"{len(new_txts)} Txt(s) to add to the vector store.")
loaded_documents = []
for file in new_txts:
loader = UnstructuredLoader(file_path=file, strategy="basic")
for document in loader.lazy_load():
loaded_documents.append(document)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
)
txt_splits = text_splitter.split_documents(loaded_documents)
logger.info(f"{len(txt_splits)} PDF's split in {len(txt_splits)} vector store documents")
logger.info(f"Adding {len(txt_splits)} vector store documents to vector store.")
vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
"""
    Takes a list of local paths, which may contain directory paths and/or direct file paths,
    and returns a list of all file paths that match the given extension.
    This function does not scan directories recursively.
"""
    all_files = []
    for path in local_paths:
        if path.is_dir():
            all_files.extend(list(path.glob(f"*.{extension}")))
        elif path.suffix == f".{extension}":
            all_files.append(path)
        else:
            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")
    return all_files
def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
return get_all_local_files(local_paths, "pdf")
def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
return get_all_local_files(local_paths, "txt")