import logging
from pathlib import Path

import requests
from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]


def code_handler(element: Tag) -> str:
    """
    Custom handler for code elements.
    """
    data_lang = element.get("data-lang")
    code_format = f"<code:{data_lang}>{element.get_text()}</code>"

    return code_format

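# A minimal sketch (not part of the original module) of what code_handler produces, assuming
# the source page's highlighter emits <code> tags carrying a data-lang attribute:
#
#   >>> snippet = BeautifulSoup('<code data-lang="python">print(1)</code>', "html.parser").code
#   >>> code_handler(snippet)
#   '<code:python>print(1)</code>'
#
# Elements without a data-lang attribute come out as "<code:None>...</code>".
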
def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
    """
    Adds a list of URLs as vector documents to the provided vector store.

    The URLs will be fetched and split into chunks of text with the provided chunk size.
    """
    logger.info("Adding web sources to the vector store.")

    all_splits = []
    for url in urls:
        if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0:
            logger.info(f"Skipping URL {url}, as it is already in the database.")
            continue

        response = requests.get(url)
        html_content = response.text

        soup = BeautifulSoup(html_content, "html.parser")

        web_splitter = HTMLSemanticPreservingSplitter(
            headers_to_split_on=headers_to_split_on,
            separators=["\n\n", "\n", ". ", "! ", "? "],
            max_chunk_size=chunk_size,
            preserve_images=True,
            preserve_videos=True,
            elements_to_preserve=["table", "ul", "ol", "code"],
            denylist_tags=["script", "style", "head"],
            custom_handlers={"code": code_handler},
        )

        splits = web_splitter.split_text(str(soup))

        for split in splits:
            split.metadata["source"] = url
            split.metadata["filetype"] = "web"

        all_splits.extend(splits)

    if len(all_splits) == 0:
        return

    logger.info(f"{len(urls)} web sources split into {len(all_splits)} vector store documents.")
    logger.info(f"Adding {len(all_splits)} vector store documents to the vector store.")

    filtered_splits = filter_complex_metadata(all_splits)
    vector_store.add_documents(documents=filtered_splits)

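# A minimal re-ingestion sketch (not part of the original module): because add_urls first checks
# vector_store.get(where={"source": url}, limit=1), calling it twice with the same URL list is
# effectively idempotent. The store and URL below are assumptions for illustration only.
#
#   urls = ["https://plato.stanford.edu/entries/epistemology/"]
#   add_urls(store, urls, chunk_size=1000)  # fetches, splits, and stores the page
#   add_urls(store, urls, chunk_size=1000)  # logs "Skipping URL ..." and stores nothing new
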
def add_pdf_files(
    vector_store: Chroma,
    file_paths: list[Path],
    chunk_size: int,
    chunk_overlap: int,
    add_start_index: bool,
    unstructured: bool,
) -> None:
    """
    Adds a list of PDF files as vector documents to the provided vector store.

    Each PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap.
    """
    logger.info("Adding PDF files to the vector store.")

    pdf_files = get_all_local_pdf_files(file_paths)
    logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.")

    new_pdfs = []
    for pdf_file in pdf_files:
        if len(vector_store.get(where={"source": str(pdf_file)}, limit=1)["ids"]) == 0:
            new_pdfs.append(pdf_file)
        else:
            logger.info(f"Skipping PDF {pdf_file}, as it is already in the database.")

    if len(new_pdfs) == 0:
        return

    logger.info(f"{len(new_pdfs)} PDFs to add to the vector store.")

    loaded_documents = []
    for file in new_pdfs:
        loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructured else PyMuPDFLoader(file_path=file)
        for document in loader.lazy_load():
            loaded_documents.append(document)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
    )

    pdf_splits = text_splitter.split_documents(loaded_documents)

    logger.info(f"{len(new_pdfs)} PDFs split into {len(pdf_splits)} vector store documents.")
    logger.info(f"Adding {len(pdf_splits)} vector store documents to the vector store.")

    vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))

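# A hedged note on the loader toggle above (my reading, not stated by the original author):
# PyMuPDFLoader does fast native text extraction and is usually enough for digitally-born PDFs,
# while UnstructuredLoader with strategy="hi_res" runs layout detection (and OCR where needed),
# which is slower but tends to handle scanned or layout-heavy documents better.
# A sketch with assumed paths:
#
#   add_pdf_files(store, [Path("./scanned_papers")], chunk_size=1000, chunk_overlap=200,
#                 add_start_index=True, unstructured=True)  # use the hi_res Unstructured pipeline
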
def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
    """
    Takes a list of local paths, which may contain directory paths and/or direct file paths,
    and returns all paths that point to a PDF file, plus any PDF files found in the given directories.

    This function does not scan directories recursively.
    """
    all_pdf_files = []
    for path in local_paths:
        if path.is_dir():
            all_pdf_files.extend(list(path.glob("*.pdf")))
        elif path.suffix == ".pdf":
            all_pdf_files.append(path)
        else:
            logger.warning(f"Ignoring path {path} as it is not a folder or a PDF file.")

    return all_pdf_files
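

if __name__ == "__main__":
    # A minimal end-to-end sketch, not part of the original module. The embedding backend,
    # collection name, URL, and paths below are assumptions for illustration only.
    from langchain_huggingface import HuggingFaceEmbeddings

    store = Chroma(
        collection_name="philosophy",
        embedding_function=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
        persist_directory="./chroma_db",
    )

    add_urls(store, ["https://plato.stanford.edu/entries/logic-classical/"], chunk_size=1000)
    add_pdf_files(
        store,
        [Path("./papers")],
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True,
        unstructured=False,
    )

    # Retrieve a few chunks to confirm the ingestion worked.
    for doc in store.similarity_search("What is a valid argument?", k=3):
        print(doc.metadata.get("source"), "->", doc.page_content[:80])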