forked from AI_team/Philosophy-RAG-demo
Text reader added
This commit is contained in:
parent da62a16fb9
commit 94a42c3892
config.yaml
@@ -41,21 +41,29 @@ huggingface:

 # --- Data Processing Settings ---

 pdf:
   # List of paths to PDF files or folders containing PDFs.
   # Pydantic converts these strings to pathlib.Path objects.
   data:
     - "C:/path/folder"
   unstructured: false # Use the unstructured PDF loader?
   chunk_size: 1000
   chunk_overlap: 200
   add_start_index: false

 #pdf:
 #  # List of paths to PDF files or folders containing PDFs.
 #  # Pydantic converts these strings to pathlib.Path objects.
 #  data:
 #    - "C:/path/folder"
 #  unstructured: false # Use the unstructured PDF loader?
 #  chunk_size: 1000
 #  chunk_overlap: 200
 #  add_start_index: false
 #
 #web:
 #  # List of URLs to scrape for data.
 #  data:
 #    - "https://www.example.nl/subdomain"
 #  chunk_size: 200

 web:
   # List of URLs to scrape for data.
   data:
     - "https://www.example.nl/subdomain"
   chunk_size: 200

+text:
+  data:
+    - "../transcriptions"
+  unstructured: true
+  chunk_size: 500
+  chunk_overlap: 100
+  add_start_index: false

 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')
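The new text: block mirrors the pdf: block: a list of file or folder paths (here a relative folder of transcriptions), a flag for the unstructured loader, and the chunking parameters. A minimal sketch of how such a block becomes typed settings, assuming the YAML is read with pyyaml (load_settings in this repo may wire it differently):

from pathlib import Path
from typing import List

import yaml
from pydantic import BaseModel, Field


class TextSettings(BaseModel):
    """Text processing settings (same fields as the config.py hunk further down)."""

    data: List[Path] = Field(default_factory=list)
    unstructured: bool = Field(default=False)
    chunk_size: int = Field(default=1000)
    chunk_overlap: int = Field(default=200)
    add_start_index: bool = Field(default=False)


with open("config.yaml") as f:
    raw = yaml.safe_load(f)

# Pydantic coerces the YAML strings in `data` to pathlib.Path objects.
text_settings = TextSettings(**raw["text"])
print(text_settings.data)        # [PosixPath('../transcriptions')] (WindowsPath on Windows)
print(text_settings.chunk_size)  # 500 from the YAML, not the model default of 1000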
@@ -13,7 +13,7 @@ from generic_rag.parsers.config import AppSettings, load_settings
 from generic_rag.backend.models import get_chat_model, get_embedding_model, get_compression_model
 from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph
 from generic_rag.graphs.ret_gen import RetGenLangGraph
-from generic_rag.parsers.parser import add_pdf_files, add_urls
+from generic_rag.parsers.parser import add_pdf_files, add_urls, add_text_files

 logger = logging.getLogger("sogeti-rag")
 logger.setLevel(logging.DEBUG)
@@ -139,6 +139,16 @@ if __name__ == "__main__":
         settings.pdf.add_start_index,
         settings.pdf.unstructured,
     )

+    add_text_files(
+        vector_store,
+        settings.text.data,
+        settings.text.chunk_size,
+        settings.text.chunk_overlap,
+        settings.text.add_start_index,
+        settings.text.unstructured,
+    )
+
     add_urls(
         vector_store,
         settings.web.data,
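add_text_files takes five positional parameters after the store, so the call is easy to misorder; the same call with keyword arguments, purely as a readability sketch (the script itself passes them positionally, matching the add_pdf_files pattern; the keyword names come from the add_text_files definition in parser.py further down):

add_text_files(
    vector_store,
    file_paths=settings.text.data,
    chunk_size=settings.text.chunk_size,
    chunk_overlap=settings.text.chunk_overlap,
    add_start_index=settings.text.add_start_index,
    unstructured=settings.text.unstructured,
)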
@@ -92,6 +92,14 @@ class PdfSettings(BaseModel):
     chunk_overlap: int = Field(default=200)
     add_start_index: bool = Field(default=False)


+class TextSettings(BaseModel):
+    """Text processing settings."""
+
+    data: List[Path] = Field(default_factory=list)
+    unstructured: bool = Field(default=False)
+    chunk_size: int = Field(default=1000)
+    chunk_overlap: int = Field(default=200)
+    add_start_index: bool = Field(default=False)
+
+
 class WebSettings(BaseModel):
     """Web data processing settings."""
@@ -131,6 +139,7 @@ class AppSettings(BaseModel):

     # --- Data processing settings ---
     pdf: PdfSettings = Field(default_factory=PdfSettings)
+    text: TextSettings = Field(default_factory=TextSettings)
     web: WebSettings = Field(default_factory=WebSettings)
     chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings)
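TextSettings duplicates PdfSettings field for field. A shared base model would remove the duplication; this is a hypothetical refactor shown for illustration, not part of the commit:

from pathlib import Path
from typing import List

from pydantic import BaseModel, Field


class LoaderSettings(BaseModel):
    """Shared loader/chunking settings (hypothetical base class)."""

    data: List[Path] = Field(default_factory=list)
    unstructured: bool = Field(default=False)
    chunk_size: int = Field(default=1000)
    chunk_overlap: int = Field(default=200)
    add_start_index: bool = Field(default=False)


class PdfSettings(LoaderSettings):
    """PDF processing settings."""


class TextSettings(LoaderSettings):
    """Text processing settings."""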
@@ -4,14 +4,13 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader

 logger = logging.getLogger(__name__)


 headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
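DirectoryLoader joins the imports here but is never used in the hunks below; the text files are loaded one at a time with UnstructuredLoader instead. For reference, a sketch of the DirectoryLoader route (an alternative, not what this commit does; the folder path is the example from config.yaml):

from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every top-level .txt file in the folder with the plain-text loader.
loader = DirectoryLoader("../transcriptions", glob="*.txt", loader_cls=TextLoader)
documents = loader.load()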
@@ -120,20 +119,72 @@ def add_pdf_files(
     vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))


-def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+def add_text_files(
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructured: bool,
+) -> None:
+    """
+    Adds a list of text files as vector documents to the provided vector store.
+    Each text file is parsed and split into chunks of text with the provided chunk size and overlap.
+    """
+    logger.info("Adding text files to the vector store.")
+    text_files = get_all_local_text_files(file_paths)
+    new_txts = []
+    for txt_file in text_files:
+        if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
+            new_txts.append(txt_file)
+        else:
+            logger.info(f"Skipping text file {txt_file}, as it is already in the database.")
+
+    if len(new_txts) == 0:
+        return
+
+    logger.info(f"{len(new_txts)} text file(s) to add to the vector store.")
+
+    loaded_documents = []
+    for file in new_txts:
+        loader = UnstructuredLoader(file_path=file, strategy="basic")
+        for document in loader.lazy_load():
+            loaded_documents.append(document)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
+    )
+
+    txt_splits = text_splitter.split_documents(loaded_documents)
+
+    logger.info(f"{len(new_txts)} text file(s) split into {len(txt_splits)} vector store documents.")
+    logger.info(f"Adding {len(txt_splits)} vector store documents to the vector store.")
+
+    vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
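The skip logic leans on Chroma metadata: LangChain loaders record the originating file in each document's source metadata field, so one filtered lookup per candidate file is enough to detect earlier ingestion. The idea in isolation (sketch):

# A non-empty "ids" list means at least one chunk from this
# source file is already stored, so the file can be skipped.
existing = vector_store.get(where={"source": str(txt_file)}, limit=1)
already_ingested = len(existing["ids"]) > 0

One loose end: the unstructured parameter is accepted but never read; the loading loop always uses UnstructuredLoader with strategy="basic", unlike add_pdf_files, which presumably switches loaders on its flag.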


+def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
     """
     Function that takes a list of local paths,
     that might contain directory paths and/or direct file paths,
-    and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths.
-    This fucntion does not scan directories recursively.
+    and returns a list with all file paths that are a file with the matching extension.
+    This function does not scan directories recursively.
     """
     all_pdf_files = []
     for path in local_paths:
         if path.is_dir():
-            all_pdf_files.extend(list(path.glob("*.pdf")))
-        elif path.suffix == ".pdf":
+            all_pdf_files.extend(list(path.glob(f"*.{extension}")))
+        elif path.suffix == f".{extension}":
             all_pdf_files.append(path)
         else:
-            logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")

     return all_pdf_files


+def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "pdf")
+
+
+def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "txt")
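The refactor folds the PDF-specific helper into a generic one keyed on extension, with thin wrappers keeping both call sites working. Expected behaviour of the text variant (sketch; the example paths are made up for illustration):

from pathlib import Path

paths = [Path("../transcriptions"), Path("notes.txt"), Path("image.png")]
txt_files = get_all_local_text_files(paths)
# -> every top-level *.txt inside ../transcriptions, plus notes.txt;
#    image.png only triggers the warning, since the helper keeps
#    exact-extension matches and does not recurse into subfolders.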