Philosophy-RAG-demo/generic_rag/parsers/parser.py

import logging
from pathlib import Path
import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader
logger = logging.getLogger(__name__)
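# Header tags used by HTMLSemanticPreservingSplitter as split boundaries; the second element
# of each tuple is the metadata key attached to the resulting document chunks.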
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
def code_handler(element: Tag) -> str:
"""
Custom handler for code elements.
"""
data_lang = element.get("data-lang")
code_format = f"<code:{data_lang}>{element.get_text()}</code>"
return code_format
def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
"""
Adds a list of URLs as vector documents to the provided vector store.
    The URLs will be fetched and split into text chunks of the provided chunk size.
    """
    logger.info("Adding web sources to the vector store.")
all_splits = []
for url in urls:
if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
logger.info(f"Skipping URL {url}, as it is already in the database.")
continue
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # fail fast instead of indexing an HTTP error page
        html_content = response.text
soup = BeautifulSoup(html_content, "html.parser")
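        # The splitter keeps tables, lists and code blocks intact, renders <code> elements
        # through code_handler, and drops script/style/head content entirely.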
web_splitter = HTMLSemanticPreservingSplitter(
headers_to_split_on=headers_to_split_on,
separators=["\n\n", "\n", ". ", "! ", "? "],
max_chunk_size=chunk_size,
preserve_images=True,
preserve_videos=True,
elements_to_preserve=["table", "ul", "ol", "code"],
denylist_tags=["script", "style", "head"],
custom_handlers={"code": code_handler},
)
splits = web_splitter.split_text(str(soup))
for split in splits:
split.metadata["source"] = url
split.metadata["filetype"] = "web"
all_splits.extend(splits)
if len(all_splits) == 0:
return
logger.info(f"{len(urls)} web sources split in {len(all_splits)} vector store documents")
logger.info(f"Adding {len(all_splits)} vector store documents to vector store.")
filtered_splits = filter_complex_metadata(all_splits)
vector_store.add_documents(documents=filtered_splits)
def add_pdf_files(
vector_store: Chroma,
file_paths: list[Path],
chunk_size: int,
chunk_overlap: int,
add_start_index: bool,
unstructerd: bool,
) -> None:
"""
Adds a list of PDF files as vector documents to the provided vector store.
    Each PDF file is parsed per page and split into text chunks of the provided chunk size and overlap.
"""
logger.info("Adding PDF files to the vector store.")
pdf_files = get_all_local_pdf_files(file_paths)
logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")
new_pdfs = []
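    # A PDF counts as already indexed when at least one stored document has a
    # "source" metadata entry equal to the file path.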
for pdf_file in pdf_files:
if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
new_pdfs.append(pdf_file)
else:
logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")
if len(new_pdfs) == 0:
return
logger.info(f"{len(new_pdfs)} PDF(s) to add to the vector store.")
loaded_document = []
for file in new_pdfs:
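        # "hi_res" runs unstructured's layout-detection pipeline (slower, extra model
        # dependencies); PyMuPDFLoader does plain per-page text extraction.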
loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
for document in loader.lazy_load():
loaded_document.append(document)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
)
pdf_splits = text_splitter.split_documents(loaded_document)
logger.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents")
logger.info(f"Adding {len(pdf_splits)} vector store documents to vector store.")
vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
def add_text_files(
vector_store: Chroma,
file_paths: list[Path],
chunk_size: int,
chunk_overlap: int,
add_start_index: bool,
unstructerd: bool,
) -> None:
"""
    Adds a list of text files as vector documents to the provided vector store.
    Each text file is parsed and split into text chunks of the provided chunk size and overlap.
"""
logger.info("Adding Text files to the vector store.")
text_files = get_all_local_text_files(file_paths)
new_txts = []
for txt_file in text_files:
if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
new_txts.append(txt_file)
else:
logger.info(f"Skipping PDF {txt_file}, as it is already in the database.")
if len(new_txts) == 0:
return
logger.info(f"{len(new_txts)} Txt(s) to add to the vector store.")
loaded_documents = []
for file in new_txts:
loader = UnstructuredLoader(file_path=file, strategy="basic")
for document in loader.lazy_load():
loaded_documents.append(document)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
)
txt_splits = text_splitter.split_documents(loaded_documents)
logger.info(f"{len(txt_splits)} PDF's split in {len(txt_splits)} vector store documents")
logger.info(f"Adding {len(txt_splits)} vector store documents to vector store.")
vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
"""
    Takes a list of local paths, which may contain directory paths and/or direct file paths,
    and returns a list of all file paths that match the given extension.
    This function does not scan directories recursively.
"""
    all_files = []
    for path in local_paths:
        if path.is_dir():
            all_files.extend(list(path.glob(f"*.{extension}")))
        elif path.suffix == f".{extension}":
            all_files.append(path)
        else:
            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")
    return all_files
def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
return get_all_local_files(local_paths, "pdf")
def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
return get_all_local_files(local_paths, "txt")