Philosophy-RAG-demo/generic_rag/parsers/parser.py

import logging
from pathlib import Path
import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]


def code_handler(element: Tag) -> str:
    """
    Custom handler for <code> elements: keep the code text and tag it with the
    language taken from the element's data-lang attribute.
    """
    data_lang = element.get("data-lang")
    return f"<code:{data_lang}>{element.get_text()}</code>"
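
# For example, a hypothetical element <code data-lang="python">x = 1</code>
# would be rendered as "<code:python>x = 1</code>".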


def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    """Fetch each URL, split its HTML into chunks, and add new chunks to the vector store."""
    # The splitter configuration does not depend on the URL, so build it once.
    web_splitter = HTMLSemanticPreservingSplitter(
        headers_to_split_on=headers_to_split_on,
        separators=["\n\n", "\n", ". ", "! ", "? "],
        max_chunk_size=chunk_size,
        preserve_images=True,
        preserve_videos=True,
        elements_to_preserve=["table", "ul", "ol", "code"],
        denylist_tags=["script", "style", "head"],
        custom_handlers={"code": code_handler},
    )
    all_splits = []
    for url in urls:
        # Skip URLs that are already present in the vector store.
        if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
            continue
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        splits = web_splitter.split_text(str(soup))
        # Tag every chunk with its source URL so it can be found on the next run.
        for split in splits:
            split.metadata["source"] = url
        all_splits.extend(splits)
    if len(all_splits) == 0:
        return
    # Chroma only accepts simple scalar metadata values; drop anything complex.
    filtered_splits = filter_complex_metadata(all_splits)
    vector_store.add_documents(documents=filtered_splits)


def add_pdf_files(
    vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
) -> None:
    """Load new local PDF files, split them into chunks, and add them to the vector store."""
    pdf_files = get_all_local_pdf_files(file_paths)
    # Only process PDFs that are not already present in the vector store.
    new_pdfs = []
    for pdf_file in pdf_files:
        if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
            new_pdfs.append(pdf_file)
    if len(new_pdfs) == 0:
        return
    loaded_documents = []
    for file in new_pdfs:
        # The "hi_res" strategy uses a layout-detection model for more accurate parsing.
        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
        for document in loader.lazy_load():
            loaded_documents.append(document)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
    )
    pdf_splits = text_splitter.split_documents(loaded_documents)
    vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))


def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
    """
    Take a list of local paths, which may mix directory paths and direct file paths,
    and return every given path that is a PDF file plus any PDF files found directly
    inside the given directories. Directories are not scanned recursively.
    """
    all_pdf_files = []
    for path in local_paths:
        if path.is_dir():
            all_pdf_files.extend(path.glob("*.pdf"))
        elif path.suffix.lower() == ".pdf":
            all_pdf_files.append(path)
        else:
            logger.warning(f"Ignoring path {path} as it is not a directory or a PDF file.")
    return all_pdf_files
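

# A minimal usage sketch, not part of the original module: it assumes a local
# Chroma collection backed by a HuggingFace embedding model. The collection
# name, persist directory, URL, and file path below are placeholders; the
# surrounding project may wire the store up differently.
if __name__ == "__main__":
    from langchain_huggingface import HuggingFaceEmbeddings

    store = Chroma(
        collection_name="philosophy",  # hypothetical collection name
        embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
        persist_directory="./chroma_db",  # hypothetical local path
    )
    add_urls(store, ["https://plato.stanford.edu/entries/logic-classical/"], chunk_size=1000)
    add_pdf_files(store, [Path("./papers")], chunk_size=1000, chunk_overlap=100, add_start_index=True)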