import logging
from pathlib import Path

import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]


def code_handler(element: Tag) -> str:
    """
    Custom handler for <code> elements: returns the code text, fenced and tagged
    with the language from the element's data-lang attribute when one is present.
    """
    data_lang = element.get("data-lang")
    code_text = element.get_text()
    if data_lang:
        return f"```{data_lang}\n{code_text}\n```"
    return code_text


def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    """
    Adds a list of URLs as vector documents to the provided vector store.
    Each URL is fetched and split into text chunks of at most the provided chunk size.
    """
    logger.info("Adding web sources to the vector store.")
    all_splits = []
    for url in urls:
        # Skip URLs that are already present in the vector store.
        if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
            logger.info(f"Skipping URL {url}, as it is already in the database.")
            continue
        response = requests.get(url, timeout=30)  # avoid hanging on unresponsive hosts
        response.raise_for_status()  # fail fast on HTTP errors
        soup = BeautifulSoup(response.text, "html.parser")
        web_splitter = HTMLSemanticPreservingSplitter(
            headers_to_split_on=headers_to_split_on,
            separators=["\n\n", "\n", ". ", "! ", "? "],
            max_chunk_size=chunk_size,
            preserve_images=True,
            preserve_videos=True,
            elements_to_preserve=["table", "ul", "ol", "code"],
            denylist_tags=["script", "style", "head"],
            custom_handlers={"code": code_handler},
        )
        splits = web_splitter.split_text(str(soup))
        for split in splits:
            split.metadata["source"] = url
            split.metadata["filetype"] = "web"
        all_splits.extend(splits)
    if len(all_splits) == 0:
        return
    logger.info(f"{len(urls)} web sources split into {len(all_splits)} vector store documents.")
    logger.info(f"Adding {len(all_splits)} vector store documents to the vector store.")
    # Chroma only accepts scalar metadata values, so drop anything complex.
    filtered_splits = filter_complex_metadata(all_splits)
    vector_store.add_documents(documents=filtered_splits)


def add_pdf_files(
    vector_store: Chroma,
    file_paths: list[Path],
    chunk_size: int,
    chunk_overlap: int,
    add_start_index: bool,
    unstructured: bool,
) -> None:
    """
    Adds a list of PDF files as vector documents to the provided vector store.
    Each PDF file is parsed per page and split into text chunks with the provided
    chunk size and overlap.
""" logging.info("Adding PDF files to the vector store.") pdf_files = get_all_local_pdf_files(file_paths) logging.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") new_pdfs = [] for pdf_file in pdf_files: if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0: new_pdfs.append(pdf_file) else: logging.info(f"Skipping PDF {pdf_file}, as it is already in the database.") if len(new_pdfs) == 0: return logging.info(f"{len(new_pdfs)} PDF's to add to the vector store.") loaded_document = [] for file in new_pdfs: loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file) for document in loader.lazy_load(): loaded_document.append(document) text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index ) pdf_splits = text_splitter.split_documents(loaded_document) logging.info(f"{len(new_pdfs)} PDF's split in {len(pdf_splits)} vector store documents") logging.info(f"Adding {len(pdf_splits)} vector store documents to vector store.") vector_store.add_documents(documents=filter_complex_metadata(pdf_splits)) def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]: """ Function that takes a list of local paths, that might contain directories paths and/or direct file paths, and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths. This fucntion does not scan directories recursively. """ all_pdf_files = [] for path in local_paths: if path.is_dir(): all_pdf_files.extend(list(path.glob("*.pdf"))) elif path.suffix == ".pdf": all_pdf_files.append(path) else: logging.warning(f"Ignoring path {path} as it is not a folder or pdf file.") return all_pdf_files