diff --git a/config.yaml b/config.yaml
index 672f4c7..989988e 100644
--- a/config.yaml
+++ b/config.yaml
@@ -41,21 +41,30 @@ huggingface:
 
 # --- Data Processing Settings ---
 
-pdf:
-  # List of paths to PDF files or folders containing PDFs.
-  # Pydantic converts these strings to pathlib.Path objects.
-  data:
-    - "C:/path/folder"
-  unstructured: false # Use the unstructured PDF loader?
-  chunk_size: 1000
-  chunk_overlap: 200
-  add_start_index: false
+#pdf:
+#  # List of paths to PDF files or folders containing PDFs.
+#  # Pydantic converts these strings to pathlib.Path objects.
+#  data:
+#    - "C:/path/folder"
+#  unstructured: false # Use the unstructured PDF loader?
+#  chunk_size: 1000
+#  chunk_overlap: 200
+#  add_start_index: false
+#
+#web:
+#  # List of URLs to scrape for data.
+#  data:
+#    - "https://www.example.nl/subdomain"
+#  chunk_size: 200
 
-web:
-  # List of URLs to scrape for data.
+text:
+  # List of paths to .txt files or folders containing .txt files.
   data:
-    - "https://www.example.nl/subdomain"
-  chunk_size: 200
+    - "../transcriptions"
+  unstructured: true
+  chunk_size: 500
+  chunk_overlap: 100
+  add_start_index: false
 
 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')
diff --git a/generic_rag/app.py b/generic_rag/app.py
index fbde75a..d360827 100644
--- a/generic_rag/app.py
+++ b/generic_rag/app.py
@@ -13,7 +13,7 @@ from generic_rag.parsers.config import AppSettings, load_settings
 from generic_rag.backend.models import get_chat_model, get_embedding_model, get_compression_model
 from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph
 from generic_rag.graphs.ret_gen import RetGenLangGraph
-from generic_rag.parsers.parser import add_pdf_files, add_urls
+from generic_rag.parsers.parser import add_pdf_files, add_urls, add_text_files
 
 logger = logging.getLogger("sogeti-rag")
 logger.setLevel(logging.DEBUG)
@@ -139,6 +139,16 @@ if __name__ == "__main__":
         settings.pdf.add_start_index,
         settings.pdf.unstructured,
     )
+
+    add_text_files(
+        vector_store,
+        settings.text.data,
+        settings.text.chunk_size,
+        settings.text.chunk_overlap,
+        settings.text.add_start_index,
+        settings.text.unstructured,
+    )
+
     add_urls(
         vector_store,
         settings.web.data,
diff --git a/generic_rag/parsers/config.py b/generic_rag/parsers/config.py
index d7f14f5..7856e2c 100644
--- a/generic_rag/parsers/config.py
+++ b/generic_rag/parsers/config.py
@@ -92,6 +92,14 @@ class PdfSettings(BaseModel):
     chunk_overlap: int = Field(default=200)
     add_start_index: bool = Field(default=False)
 
+class TextSettings(BaseModel):
+    """Text file processing settings."""
+
+    data: List[Path] = Field(default_factory=list)
+    unstructured: bool = Field(default=False)
+    chunk_size: int = Field(default=1000)
+    chunk_overlap: int = Field(default=200)
+    add_start_index: bool = Field(default=False)
 
 class WebSettings(BaseModel):
     """Web data processing settings."""
@@ -131,6 +139,7 @@ class AppSettings(BaseModel):
 
     # --- Data processing settings ---
     pdf: PdfSettings = Field(default_factory=PdfSettings)
+    text: TextSettings = Field(default_factory=TextSettings)
     web: WebSettings = Field(default_factory=WebSettings)
     chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings)
 
diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index be5bd28..ea67b00 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -4,14 +4,13 @@ from pathlib import Path
 
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
 
 logger = logging.getLogger(__name__)
-
 headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
 
@@ -74,12 +73,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
 
 def add_pdf_files(
-        vector_store: Chroma,
-        file_paths: list[Path],
-        chunk_size: int,
-        chunk_overlap: int,
-        add_start_index: bool,
-        unstructerd: bool,
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructerd: bool,
 ) -> None:
     """
     Adds a list of PDF files as vector documents to the provided vector store.
@@ -120,20 +119,76 @@
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
 
 
-def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+def add_text_files(
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructured: bool,
+) -> None:
+    """
+    Adds a list of text files as vector documents to the provided vector store.
+    Each text file is parsed and split into chunks with the provided chunk size and overlap.
+    """
+    logger.info("Adding text files to the vector store.")
+    text_files = get_all_local_text_files(file_paths)
+    new_txts = []
+    for txt_file in text_files:
+        if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
+            new_txts.append(txt_file)
+        else:
+            logger.info(f"Skipping text file {txt_file}, as it is already in the database.")
+
+    if len(new_txts) == 0:
+        return
+
+    logger.info(f"{len(new_txts)} text file(s) to add to the vector store.")
+
+    loaded_documents = []
+    for file in new_txts:
+        # Use the unstructured loader when requested; otherwise fall back to a plain TextLoader.
+        if unstructured:
+            loader = UnstructuredLoader(file_path=file, strategy="basic")
+        else:
+            loader = TextLoader(file_path=file)
+        for document in loader.lazy_load():
+            loaded_documents.append(document)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
+    )
+
+    txt_splits = text_splitter.split_documents(loaded_documents)
+
+    logger.info(f"{len(new_txts)} text file(s) split into {len(txt_splits)} vector store documents.")
+    logger.info(f"Adding {len(txt_splits)} vector store documents to the vector store.")
+
+    vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
+
+
+def get_all_local_files(local_paths: list[Path], extension: str = "pdf") -> list[Path]:
     """
     Function that takes a list of local paths, that might contain directories paths and/or direct file paths,
-    and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths.
-    This fucntion does not scan directories recursively.
+    and returns a list of all file paths that match the given extension.
+    This function does not scan directories recursively.
""" all_pdf_files = [] for path in local_paths: if path.is_dir(): - all_pdf_files.extend(list(path.glob("*.pdf"))) - elif path.suffix == ".pdf": + all_pdf_files.extend(list(path.glob(f"*.{extension}"))) + elif path.suffix == f".{extension}": all_pdf_files.append(path) else: - logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.") + logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.") return all_pdf_files + + +def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]: + return get_all_local_files(local_paths, "pdf") + + +def get_all_local_text_files(local_paths: list[Path]) -> list[Path]: + return get_all_local_files(local_paths, "txt")