Text reader added

joep 2025-04-18 15:33:35 +02:00
parent da62a16fb9
commit 94a42c3892
4 changed files with 106 additions and 28 deletions


@@ -41,21 +41,29 @@ huggingface:
 # --- Data Processing Settings ---
-pdf:
-  # List of paths to PDF files or folders containing PDFs.
-  # Pydantic converts these strings to pathlib.Path objects.
-  data:
-    - "C:/path/folder"
-  unstructured: false # Use the unstructured PDF loader?
-  chunk_size: 1000
-  chunk_overlap: 200
-  add_start_index: false
+#pdf:
+#  # List of paths to PDF files or folders containing PDFs.
+#  # Pydantic converts these strings to pathlib.Path objects.
+#  data:
+#    - "C:/path/folder"
+#  unstructured: false # Use the unstructured PDF loader?
+#  chunk_size: 1000
+#  chunk_overlap: 200
+#  add_start_index: false
+#
+#web:
+#  # List of URLs to scrape for data.
+#  data:
+#    - "https://www.example.nl/subdomain"
+#  chunk_size: 200
-web:
-  # List of URLs to scrape for data.
+text:
   data:
-    - "https://www.example.nl/subdomain"
-  chunk_size: 200
+    - "../transcriptions"
+  unstructured: true
+  chunk_size: 500
+  chunk_overlap: 100
+  add_start_index: false
 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')


@@ -13,7 +13,7 @@ from generic_rag.parsers.config import AppSettings, load_settings
 from generic_rag.backend.models import get_chat_model, get_embedding_model, get_compression_model
 from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph
 from generic_rag.graphs.ret_gen import RetGenLangGraph
-from generic_rag.parsers.parser import add_pdf_files, add_urls
+from generic_rag.parsers.parser import add_pdf_files, add_urls, add_text_files

 logger = logging.getLogger("sogeti-rag")
 logger.setLevel(logging.DEBUG)
@@ -139,6 +139,16 @@ if __name__ == "__main__":
         settings.pdf.add_start_index,
         settings.pdf.unstructured,
     )
+    add_text_files(
+        vector_store,
+        settings.text.data,
+        settings.text.chunk_size,
+        settings.text.chunk_overlap,
+        settings.text.add_start_index,
+        settings.text.unstructured,
+    )
     add_urls(
         vector_store,
         settings.web.data,


@@ -92,6 +92,14 @@ class PdfSettings(BaseModel):
     chunk_overlap: int = Field(default=200)
     add_start_index: bool = Field(default=False)

+class TextSettings(BaseModel):
+    """Text processing settings."""
+
+    data: List[Path] = Field(default_factory=list)
+    unstructured: bool = Field(default=False)
+    chunk_size: int = Field(default=1000)
+    chunk_overlap: int = Field(default=200)
+    add_start_index: bool = Field(default=False)
+
 class WebSettings(BaseModel):
     """Web data processing settings."""
@@ -131,6 +139,7 @@ class AppSettings(BaseModel):
     # --- Data processing settings ---
     pdf: PdfSettings = Field(default_factory=PdfSettings)
+    text: TextSettings = Field(default_factory=TextSettings)
     web: WebSettings = Field(default_factory=WebSettings)
     chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings)
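A small, self-contained sketch of how the added TextSettings model behaves; the key point is that plain strings from the YAML are coerced to pathlib.Path objects, which holds for both Pydantic v1 and v2:

from pathlib import Path
from typing import List

from pydantic import BaseModel, Field


class TextSettings(BaseModel):
    """Text processing settings (mirrors the class added in this commit)."""

    data: List[Path] = Field(default_factory=list)
    unstructured: bool = Field(default=False)
    chunk_size: int = Field(default=1000)
    chunk_overlap: int = Field(default=200)
    add_start_index: bool = Field(default=False)


settings = TextSettings(data=["../transcriptions"], chunk_size=500, chunk_overlap=100)
assert isinstance(settings.data[0], Path)  # the YAML string became a Path object
print(settings.unstructured)               # False (default)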


@@ -4,14 +4,13 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader

 logger = logging.getLogger(__name__)

 headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
@@ -120,20 +119,72 @@
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))

-def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+def add_text_files(
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructured: bool,  # currently unused; the unstructured loader is always applied
+) -> None:
+    """
+    Adds a list of text files as vector documents to the provided vector store.
+
+    Each text file is loaded and split into chunks of text with the provided chunk size and overlap.
+    """
+    logger.info("Adding text files to the vector store.")
+    text_files = get_all_local_text_files(file_paths)
+
+    new_txts = []
+    for txt_file in text_files:
+        if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
+            new_txts.append(txt_file)
+        else:
+            logger.info(f"Skipping text file {txt_file}, as it is already in the database.")
+
+    if len(new_txts) == 0:
+        return
+    logger.info(f"{len(new_txts)} text file(s) to add to the vector store.")
+
+    loaded_documents = []
+    for file in new_txts:
+        loader = UnstructuredLoader(file_path=file, strategy="basic")
+        for document in loader.lazy_load():
+            loaded_documents.append(document)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
+    )
+    txt_splits = text_splitter.split_documents(loaded_documents)
+    logger.info(f"{len(new_txts)} text file(s) split into {len(txt_splits)} vector store documents.")
+
+    logger.info(f"Adding {len(txt_splits)} vector store documents to the vector store.")
+    vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
+
+
+def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
     """
     Function that takes a list of local paths,
     that might contain directory paths and/or direct file paths,
-    and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths.
-    This fucntion does not scan directories recursively.
+    and returns a list with all file paths that have the matching extension.
+    This function does not scan directories recursively.
     """
     all_pdf_files = []
     for path in local_paths:
         if path.is_dir():
-            all_pdf_files.extend(list(path.glob("*.pdf")))
-        elif path.suffix == ".pdf":
+            all_pdf_files.extend(list(path.glob(f"*.{extension}")))
+        elif path.suffix == f".{extension}":
             all_pdf_files.append(path)
         else:
-            logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")
     return all_pdf_files
+
+
+def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "pdf")
+
+
+def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "txt")