forked from AI_team/Philosophy-RAG-demo
Text reader added
This commit is contained in:
parent da62a16fb9
commit 94a42c3892
config.yaml
@@ -41,21 +41,29 @@ huggingface:

 # --- Data Processing Settings ---

 pdf:
   # List of paths to PDF files or folders containing PDFs.
   # Pydantic converts these strings to pathlib.Path objects.
   data:
     - "C:/path/folder"
   unstructured: false # Use the unstructured PDF loader?
   chunk_size: 1000
   chunk_overlap: 200
   add_start_index: false

 #pdf:
 #  # List of paths to PDF files or folders containing PDFs.
 #  # Pydantic converts these strings to pathlib.Path objects.
 #  data:
 #    - "C:/path/folder"
 #  unstructured: false # Use the unstructured PDF loader?
 #  chunk_size: 1000
 #  chunk_overlap: 200
 #  add_start_index: false
 #
 #web:
 #  # List of URLs to scrape for data.
 #  data:
 #    - "https://www.example.nl/subdomain"
 #  chunk_size: 200

 web:
   # List of URLs to scrape for data.
   data:
     - "https://www.example.nl/subdomain"
   chunk_size: 200

+text:
+  data:
+    - "../transcriptions"
+  unstructured: true
+  chunk_size: 500
+  chunk_overlap: 100
+  add_start_index: false

 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')
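The new text: block mirrors the pdf: block: a list of file or folder paths (here a relative folder of transcriptions), a flag for the unstructured loader, and the chunking parameters. A minimal sketch of how such a block becomes typed settings, assuming the YAML is read with pyyaml (load_settings in this repo may wire it differently):

from pathlib import Path
from typing import List

import yaml
from pydantic import BaseModel, Field


class TextSettings(BaseModel):
    """Text processing settings (same fields as the config.py hunk further down)."""

    data: List[Path] = Field(default_factory=list)
    unstructured: bool = Field(default=False)
    chunk_size: int = Field(default=1000)
    chunk_overlap: int = Field(default=200)
    add_start_index: bool = Field(default=False)


with open("config.yaml") as f:
    raw = yaml.safe_load(f)

# Pydantic coerces the YAML strings in `data` to pathlib.Path objects.
text_settings = TextSettings(**raw["text"])
print(text_settings.data)        # [PosixPath('../transcriptions')] (WindowsPath on Windows)
print(text_settings.chunk_size)  # 500 from the YAML, not the model default of 1000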
@@ -13,7 +13,7 @@ from generic_rag.parsers.config import AppSettings, load_settings
 from generic_rag.backend.models import get_chat_model, get_embedding_model, get_compression_model
 from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph
 from generic_rag.graphs.ret_gen import RetGenLangGraph
-from generic_rag.parsers.parser import add_pdf_files, add_urls
+from generic_rag.parsers.parser import add_pdf_files, add_urls, add_text_files

 logger = logging.getLogger("sogeti-rag")
 logger.setLevel(logging.DEBUG)
@@ -139,6 +139,16 @@ if __name__ == "__main__":
         settings.pdf.add_start_index,
         settings.pdf.unstructured,
     )

+    add_text_files(
+        vector_store,
+        settings.text.data,
+        settings.text.chunk_size,
+        settings.text.chunk_overlap,
+        settings.text.add_start_index,
+        settings.text.unstructured,
+    )
+
     add_urls(
         vector_store,
         settings.web.data,
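add_text_files takes five positional parameters after the store, so the call is easy to misorder; the same call with keyword arguments, purely as a readability sketch (the script itself passes them positionally, matching the add_pdf_files pattern; the keyword names come from the add_text_files definition in parser.py further down):

add_text_files(
    vector_store,
    file_paths=settings.text.data,
    chunk_size=settings.text.chunk_size,
    chunk_overlap=settings.text.chunk_overlap,
    add_start_index=settings.text.add_start_index,
    unstructured=settings.text.unstructured,
)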
@@ -92,6 +92,14 @@ class PdfSettings(BaseModel):
     chunk_overlap: int = Field(default=200)
     add_start_index: bool = Field(default=False)


+class TextSettings(BaseModel):
+    """Text processing settings."""
+
+    data: List[Path] = Field(default_factory=list)
+    unstructured: bool = Field(default=False)
+    chunk_size: int = Field(default=1000)
+    chunk_overlap: int = Field(default=200)
+    add_start_index: bool = Field(default=False)
+
+
 class WebSettings(BaseModel):
     """Web data processing settings."""
@@ -131,6 +139,7 @@ class AppSettings(BaseModel):

     # --- Data processing settings ---
     pdf: PdfSettings = Field(default_factory=PdfSettings)
+    text: TextSettings = Field(default_factory=TextSettings)
     web: WebSettings = Field(default_factory=WebSettings)
     chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings)
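TextSettings duplicates PdfSettings field for field. A shared base model would remove the duplication; this is a hypothetical refactor shown for illustration, not part of the commit:

from pathlib import Path
from typing import List

from pydantic import BaseModel, Field


class LoaderSettings(BaseModel):
    """Shared loader/chunking settings (hypothetical base class)."""

    data: List[Path] = Field(default_factory=list)
    unstructured: bool = Field(default=False)
    chunk_size: int = Field(default=1000)
    chunk_overlap: int = Field(default=200)
    add_start_index: bool = Field(default=False)


class PdfSettings(LoaderSettings):
    """PDF processing settings."""


class TextSettings(LoaderSettings):
    """Text processing settings."""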
@@ -4,14 +4,13 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader

 logger = logging.getLogger(__name__)


 headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
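DirectoryLoader joins the imports here but is never used in the hunks below; the text files are loaded one at a time with UnstructuredLoader instead. For reference, a sketch of the DirectoryLoader route (an alternative, not what this commit does; the folder path is the example from config.yaml):

from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every top-level .txt file in the folder with the plain-text loader.
loader = DirectoryLoader("../transcriptions", glob="*.txt", loader_cls=TextLoader)
documents = loader.load()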
@@ -120,20 +119,72 @@ def add_pdf_files(
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))


-def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+def add_text_files(
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructured: bool,
+) -> None:
+    """
+    Adds a list of text files as vector documents to the provided vector store.
+    Each text file is parsed and split into chunks of text with the provided chunk size and overlap.
+    """
+    logger.info("Adding text files to the vector store.")
+    text_files = get_all_local_text_files(file_paths)
+    new_txts = []
+    for txt_file in text_files:
+        if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
+            new_txts.append(txt_file)
+        else:
+            logger.info(f"Skipping text file {txt_file}, as it is already in the database.")
+
+    if len(new_txts) == 0:
+        return
+
+    logger.info(f"{len(new_txts)} text file(s) to add to the vector store.")
+
+    loaded_documents = []
+    for file in new_txts:
+        loader = UnstructuredLoader(file_path=file, strategy="basic")
+        for document in loader.lazy_load():
+            loaded_documents.append(document)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
+    )
+
+    txt_splits = text_splitter.split_documents(loaded_documents)
+
+    logger.info(f"{len(new_txts)} text file(s) split into {len(txt_splits)} vector store documents.")
+    logger.info(f"Adding {len(txt_splits)} vector store documents to the vector store.")
+
+    vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
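The skip logic leans on Chroma metadata: LangChain loaders record the originating file in each document's source metadata field, so one filtered lookup per candidate file is enough to detect earlier ingestion. The idea in isolation (sketch):

# A non-empty "ids" list means at least one chunk from this
# source file is already stored, so the file can be skipped.
existing = vector_store.get(where={"source": str(txt_file)}, limit=1)
already_ingested = len(existing["ids"]) > 0

One loose end: the unstructured parameter is accepted but never read; the loading loop always uses UnstructuredLoader with strategy="basic", unlike add_pdf_files, which presumably switches loaders on its flag.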


+def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
     """
     Function that takes a list of local paths,
     that might contain directory paths and/or direct file paths,
-    and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths.
-    This fucntion does not scan directories recursively.
+    and returns a list with all file paths that are a file with the matching extension.
+    This function does not scan directories recursively.
     """
     all_pdf_files = []
     for path in local_paths:
         if path.is_dir():
-            all_pdf_files.extend(list(path.glob("*.pdf")))
-        elif path.suffix == ".pdf":
+            all_pdf_files.extend(list(path.glob(f"*.{extension}")))
+        elif path.suffix == f".{extension}":
             all_pdf_files.append(path)
         else:
-            logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")

     return all_pdf_files


+def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "pdf")
+
+
+def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "txt")
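The refactor folds the PDF-specific helper into a generic one keyed on extension, with thin wrappers keeping both call sites working. Expected behaviour of the text variant (sketch; the example paths are made up for illustration):

from pathlib import Path

paths = [Path("../transcriptions"), Path("notes.txt"), Path("image.png")]
txt_files = get_all_local_text_files(paths)
# -> every top-level *.txt inside ../transcriptions, plus notes.txt;
#    image.png only triggers the warning, since the helper keeps
#    exact-extension matches and does not recurse into subfolders.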