import logging
from pathlib import Path

import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]


def code_handler(element: Tag) -> str:
    """
    Custom handler for code elements: keep the code text as a fenced block,
    tagged with the language from the element's data-lang attribute if present.
    """
    data_lang = element.get("data-lang", "")
    return f"```{data_lang}\n{element.get_text()}\n```"


def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    """
    Fetch each URL, split its HTML into header-aware chunks, and add the chunks
    to the vector store. URLs already present in the store are skipped.
    """
    all_splits = []
    for url in urls:
        if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
            logger.info(f"Skipping URL {url}, as it is already in the database.")
            continue

        response = requests.get(url, timeout=30)
        # Fail fast on HTTP errors instead of indexing an error page.
        response.raise_for_status()
        html_content = response.text
        soup = BeautifulSoup(html_content, "html.parser")

        web_splitter = HTMLSemanticPreservingSplitter(
            headers_to_split_on=headers_to_split_on,
            separators=["\n\n", "\n", ". ", "! ", "? "],
            max_chunk_size=chunk_size,
            preserve_images=True,
            preserve_videos=True,
            elements_to_preserve=["table", "ul", "ol", "code"],
            denylist_tags=["script", "style", "head"],
            custom_handlers={"code": code_handler},
        )
        splits = web_splitter.split_text(str(soup))

        for split in splits:
            split.metadata["source"] = url
            split.metadata["filetype"] = "web"
        all_splits.extend(splits)

    if len(all_splits) == 0:
        return

    filtered_splits = filter_complex_metadata(all_splits)
    vector_store.add_documents(documents=filtered_splits)


def add_pdf_files(
    vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
) -> None:
    """
    Load any new PDF files found at the given paths, split them into chunks,
    and add the chunks to the vector store. PDFs already present are skipped.
    """
    pdf_files = get_all_local_pdf_files(file_paths)

    new_pdfs = []
    for pdf_file in pdf_files:
        if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
            new_pdfs.append(pdf_file)
        else:
            logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")

    if len(new_pdfs) == 0:
        return

    loaded_documents = []
    for file in new_pdfs:
        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
        for document in loader.lazy_load():
            loaded_documents.append(document)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
    )
    pdf_splits = text_splitter.split_documents(loaded_documents)
    vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))


def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
    """
    Take a list of local paths, which may contain directory paths and/or direct
    file paths, and return all paths that are PDF files, plus any PDF files found
    directly inside the given directories. This function does not scan
    directories recursively.
    """
    all_pdf_files = []
    for path in local_paths:
        if path.is_dir():
            all_pdf_files.extend(list(path.glob("*.pdf")))
        elif path.suffix == ".pdf":
            all_pdf_files.append(path)
        else:
            logger.warning(f"Ignoring path {path} as it is not a folder or PDF file.")
    return all_pdf_files
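

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's public API: build a Chroma
    # store and index one web page and one local PDF folder. The embedding model,
    # collection name, persist directory, URL, and paths below are illustrative
    # assumptions; substitute whatever the surrounding project actually uses.
    from langchain_huggingface import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = Chroma(
        collection_name="docs",
        embedding_function=embeddings,
        persist_directory="./chroma_db",
    )

    # Web pages are split with the header-aware HTML splitter defined above.
    add_urls(store, urls=["https://example.com/article"], chunk_size=1000)

    # PDFs are loaded with Unstructured and split recursively by character count.
    add_pdf_files(
        store,
        file_paths=[Path("./pdfs")],
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
    )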