Text reader added

joep 2025-04-18 15:33:35 +02:00
parent da62a16fb9
commit 94a42c3892
4 changed files with 106 additions and 28 deletions


@@ -41,21 +41,29 @@ huggingface:
 # --- Data Processing Settings ---
-pdf:
-  # List of paths to PDF files or folders containing PDFs.
-  # Pydantic converts these strings to pathlib.Path objects.
-  data:
-    - "C:/path/folder"
-  unstructured: false # Use the unstructured PDF loader?
-  chunk_size: 1000
-  chunk_overlap: 200
-  add_start_index: false
+#pdf:
+#  # List of paths to PDF files or folders containing PDFs.
+#  # Pydantic converts these strings to pathlib.Path objects.
+#  data:
+#    - "C:/path/folder"
+#  unstructured: false # Use the unstructured PDF loader?
+#  chunk_size: 1000
+#  chunk_overlap: 200
+#  add_start_index: false
+#
+#web:
+#  # List of URLs to scrape for data.
+#  data:
+#    - "https://www.example.nl/subdomain"
+#  chunk_size: 200
-web:
-  # List of URLs to scrape for data.
+text:
   data:
-    - "https://www.example.nl/subdomain"
-  chunk_size: 200
+    - "../transcriptions"
+  unstructured: true
+  chunk_size: 500
+  chunk_overlap: 100
+  add_start_index: false
 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')


@@ -13,7 +13,7 @@ from generic_rag.parsers.config import AppSettings, load_settings
 from generic_rag.backend.models import get_chat_model, get_embedding_model, get_compression_model
 from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph
 from generic_rag.graphs.ret_gen import RetGenLangGraph
-from generic_rag.parsers.parser import add_pdf_files, add_urls
+from generic_rag.parsers.parser import add_pdf_files, add_urls, add_text_files

 logger = logging.getLogger("sogeti-rag")
 logger.setLevel(logging.DEBUG)
@@ -139,6 +139,16 @@ if __name__ == "__main__":
         settings.pdf.add_start_index,
         settings.pdf.unstructured,
     )
+    add_text_files(
+        vector_store,
+        settings.text.data,
+        settings.text.chunk_size,
+        settings.text.chunk_overlap,
+        settings.text.add_start_index,
+        settings.text.unstructured,
+    )
     add_urls(
         vector_store,
         settings.web.data,


@@ -92,6 +92,14 @@ class PdfSettings(BaseModel):
     chunk_overlap: int = Field(default=200)
     add_start_index: bool = Field(default=False)

+class TextSettings(BaseModel):
+    """Text processing settings."""
+
+    data: List[Path] = Field(default_factory=list)
+    unstructured: bool = Field(default=False)
+    chunk_size: int = Field(default=1000)
+    chunk_overlap: int = Field(default=200)
+    add_start_index: bool = Field(default=False)
+
 class WebSettings(BaseModel):
     """Web data processing settings."""
@@ -131,6 +139,7 @@ class AppSettings(BaseModel):
     # --- Data processing settings ---
     pdf: PdfSettings = Field(default_factory=PdfSettings)
+    text: TextSettings = Field(default_factory=TextSettings)
     web: WebSettings = Field(default_factory=WebSettings)
     chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings)
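A small, self-contained sketch of how the added TextSettings model behaves; the key point is that plain strings from the YAML are coerced to pathlib.Path objects, which holds for both Pydantic v1 and v2:

from pathlib import Path
from typing import List

from pydantic import BaseModel, Field


class TextSettings(BaseModel):
    """Text processing settings (mirrors the class added in this commit)."""

    data: List[Path] = Field(default_factory=list)
    unstructured: bool = Field(default=False)
    chunk_size: int = Field(default=1000)
    chunk_overlap: int = Field(default=200)
    add_start_index: bool = Field(default=False)


settings = TextSettings(data=["../transcriptions"], chunk_size=500, chunk_overlap=100)
assert isinstance(settings.data[0], Path)  # the YAML string became a Path object
print(settings.unstructured)               # False (default)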


@@ -4,14 +4,13 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader

 logger = logging.getLogger(__name__)

 headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
@@ -120,20 +119,72 @@
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))

-def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+def add_text_files(
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructured: bool,  # currently unused; the unstructured loader is always applied
+) -> None:
+    """
+    Adds a list of text files as vector documents to the provided vector store.
+
+    Each text file is loaded and split into chunks of text with the provided chunk size and overlap.
+    """
+    logger.info("Adding text files to the vector store.")
+    text_files = get_all_local_text_files(file_paths)
+
+    new_txts = []
+    for txt_file in text_files:
+        if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
+            new_txts.append(txt_file)
+        else:
+            logger.info(f"Skipping text file {txt_file}, as it is already in the database.")
+
+    if len(new_txts) == 0:
+        return
+    logger.info(f"{len(new_txts)} text file(s) to add to the vector store.")
+
+    loaded_documents = []
+    for file in new_txts:
+        loader = UnstructuredLoader(file_path=file, strategy="basic")
+        for document in loader.lazy_load():
+            loaded_documents.append(document)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
+    )
+    txt_splits = text_splitter.split_documents(loaded_documents)
+    logger.info(f"{len(new_txts)} text file(s) split into {len(txt_splits)} vector store documents.")
+
+    logger.info(f"Adding {len(txt_splits)} vector store documents to the vector store.")
+    vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
+
+
+def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
     """
     Function that takes a list of local paths,
     that might contain directory paths and/or direct file paths,
-    and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths.
-    This fucntion does not scan directories recursively.
+    and returns a list with all file paths that have the matching extension.
+    This function does not scan directories recursively.
     """
     all_pdf_files = []
     for path in local_paths:
         if path.is_dir():
-            all_pdf_files.extend(list(path.glob("*.pdf")))
-        elif path.suffix == ".pdf":
+            all_pdf_files.extend(list(path.glob(f"*.{extension}")))
+        elif path.suffix == f".{extension}":
             all_pdf_files.append(path)
         else:
-            logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")
     return all_pdf_files
+
+
+def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "pdf")
+
+
+def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "txt")