forked from AI_team/Philosophy-RAG-demo
Text reader added
parent da62a16fb9
commit 94a42c3892
config.yaml (34 changes)
@@ -41,21 +41,29 @@ huggingface:
 
 # --- Data Processing Settings ---
 
-pdf:
-  # List of paths to PDF files or folders containing PDFs.
-  # Pydantic converts these strings to pathlib.Path objects.
-  data:
-    - "C:/path/folder"
-  unstructured: false # Use the unstructured PDF loader?
-  chunk_size: 1000
-  chunk_overlap: 200
-  add_start_index: false
+#pdf:
+#  # List of paths to PDF files or folders containing PDFs.
+#  # Pydantic converts these strings to pathlib.Path objects.
+#  data:
+#    - "C:/path/folder"
+#  unstructured: false # Use the unstructured PDF loader?
+#  chunk_size: 1000
+#  chunk_overlap: 200
+#  add_start_index: false
+#
+#web:
+#  # List of URLs to scrape for data.
+#  data:
+#    - "https://www.example.nl/subdomain"
+#  chunk_size: 200
 
-web:
-  # List of URLs to scrape for data.
-  data:
-    - "https://www.example.nl/subdomain"
-  chunk_size: 200
+text:
+  data:
+    - "../transcriptions"
+  unstructured: true
+  chunk_size: 500
+  chunk_overlap: 100
+  add_start_index: false
 
 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')
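The new text: block feeds the TextSettings model added to generic_rag.parsers.config further down in this commit. A minimal sketch of that mapping (assumes PyYAML; the repo's real entry point is load_settings, and the file path here is illustrative):

import yaml

from generic_rag.parsers.config import AppSettings

with open("config.yaml") as f:
    settings = AppSettings(**yaml.safe_load(f))

print(settings.text.data)        # e.g. [PosixPath('../transcriptions')]
print(settings.text.chunk_size)  # 500
print(settings.pdf.chunk_size)   # 1000 (the pdf block is commented out, so defaults apply)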
@@ -13,7 +13,7 @@ from generic_rag.parsers.config import AppSettings, load_settings
 from generic_rag.backend.models import get_chat_model, get_embedding_model, get_compression_model
 from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph
 from generic_rag.graphs.ret_gen import RetGenLangGraph
-from generic_rag.parsers.parser import add_pdf_files, add_urls
+from generic_rag.parsers.parser import add_pdf_files, add_urls, add_text_files
 
 logger = logging.getLogger("sogeti-rag")
 logger.setLevel(logging.DEBUG)

@@ -139,6 +139,16 @@ if __name__ == "__main__":
         settings.pdf.add_start_index,
         settings.pdf.unstructured,
     )
+
+    add_text_files(
+        vector_store,
+        settings.text.data,
+        settings.text.chunk_size,
+        settings.text.chunk_overlap,
+        settings.text.add_start_index,
+        settings.text.unstructured,
+    )
+
     add_urls(
         vector_store,
         settings.web.data,
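The new ingestion step slots in between the PDF and URL steps and passes the settings.text fields positionally. Written with keywords (a sketch with identical behaviour, using the parameter names of the add_text_files definition later in this commit):

add_text_files(
    vector_store,
    file_paths=settings.text.data,
    chunk_size=settings.text.chunk_size,
    chunk_overlap=settings.text.chunk_overlap,
    add_start_index=settings.text.add_start_index,
    unstructured=settings.text.unstructured,
)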
@@ -92,6 +92,14 @@ class PdfSettings(BaseModel):
     chunk_overlap: int = Field(default=200)
     add_start_index: bool = Field(default=False)
 
 
+class TextSettings(BaseModel):
+    """Text processing settings."""
+
+    data: List[Path] = Field(default_factory=list)
+    unstructured: bool = Field(default=False)
+    chunk_size: int = Field(default=1000)
+    chunk_overlap: int = Field(default=200)
+    add_start_index: bool = Field(default=False)
+
+
 class WebSettings(BaseModel):
     """Web data processing settings."""

@@ -131,6 +139,7 @@ class AppSettings(BaseModel):
 
     # --- Data processing settings ---
     pdf: PdfSettings = Field(default_factory=PdfSettings)
+    text: TextSettings = Field(default_factory=TextSettings)
     web: WebSettings = Field(default_factory=WebSettings)
     chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings)
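TextSettings mirrors PdfSettings field for field, so the commented-out pdf block in config.yaml documents the same knobs. A small sketch of standard Pydantic behaviour for the new model (nothing here is specific to this repo beyond the import):

from pathlib import Path

from generic_rag.parsers.config import TextSettings

assert TextSettings().data == []  # every field has a default, so an empty block validates

ts = TextSettings(data=["../transcriptions"], unstructured=True, chunk_size=500)
assert ts.data == [Path("../transcriptions")]  # YAML strings are coerced to Path
assert ts.chunk_overlap == 200                 # untouched fields keep their defaults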
@@ -4,14 +4,13 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
-from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_community.document_loaders import PyMuPDFLoader, DirectoryLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
-
 
 logger = logging.getLogger(__name__)
 
 
 headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
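Note that DirectoryLoader is added to the imports, although the new text reader below walks files itself and loads each one with UnstructuredLoader. A minimal sketch of that loader on a plain-text file (requires the langchain-unstructured package; notes.txt is illustrative):

from langchain_unstructured import UnstructuredLoader

# strategy="basic" mirrors the call made in add_text_files below
loader = UnstructuredLoader(file_path="notes.txt", strategy="basic")
docs = list(loader.lazy_load())

# Each document carries a "source" metadata key; add_text_files queries
# this key against the Chroma store to skip files already ingested.
print(docs[0].metadata["source"])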
@@ -74,12 +73,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
 
 def add_pdf_files(
     vector_store: Chroma,
     file_paths: list[Path],
     chunk_size: int,
     chunk_overlap: int,
     add_start_index: bool,
     unstructerd: bool,
 ) -> None:
     """
     Adds a list of PDF files as vector documents to the provided vector store.

@@ -120,20 +119,72 @@ def add_pdf_files(
         vector_store.add_documents(documents=filter_complex_metadata(pdf_splits))
 
 
-def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+def add_text_files(
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructured: bool,
+) -> None:
+    """
+    Adds a list of text files as vector documents to the provided vector store.
+    Each text file is parsed and split into chunks of text with the provided chunk size and overlap.
+    """
+    logger.info("Adding text files to the vector store.")
+    text_files = get_all_local_text_files(file_paths)
+    new_txts = []
+    for txt_file in text_files:
+        if len(vector_store.get(where={"source": str(txt_file)}, limit=1)["ids"]) == 0:
+            new_txts.append(txt_file)
+        else:
+            logger.info(f"Skipping text file {txt_file}, as it is already in the database.")
+
+    if len(new_txts) == 0:
+        return
+
+    logger.info(f"{len(new_txts)} text file(s) to add to the vector store.")
+
+    loaded_documents = []
+    for file in new_txts:
+        loader = UnstructuredLoader(file_path=file, strategy="basic")
+        for document in loader.lazy_load():
+            loaded_documents.append(document)
+
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=add_start_index
+    )
+
+    txt_splits = text_splitter.split_documents(loaded_documents)
+
+    logger.info(f"{len(new_txts)} text file(s) split into {len(txt_splits)} vector store documents")
+    logger.info(f"Adding {len(txt_splits)} vector store documents to vector store.")
+
+    vector_store.add_documents(documents=filter_complex_metadata(txt_splits))
+
+
+def get_all_local_files(local_paths: list[Path], extension="pdf") -> list[Path]:
     """
     Function that takes a list of local paths,
     that might contain directories paths and/or direct file paths,
-    and returns a list with all file paths that are a PDF file or any PDF files found in the directory file paths.
-    This fucntion does not scan directories recursively.
+    and returns a list of all file paths with the matching extension.
+    This function does not scan directories recursively.
     """
     all_pdf_files = []
     for path in local_paths:
         if path.is_dir():
-            all_pdf_files.extend(list(path.glob("*.pdf")))
-        elif path.suffix == ".pdf":
+            all_pdf_files.extend(list(path.glob(f"*.{extension}")))
+        elif path.suffix == f".{extension}":
             all_pdf_files.append(path)
         else:
-            logger.warning(f"Ignoring path {path} as it is not a folder or pdf file.")
+            logger.warning(f"Ignoring path {path} as it is not a folder or {extension} file.")
 
     return all_pdf_files
+
+
+def get_all_local_pdf_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "pdf")
+
+
+def get_all_local_text_files(local_paths: list[Path]) -> list[Path]:
+    return get_all_local_files(local_paths, "txt")
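get_all_local_pdf_files is now a thin wrapper over the extension-parameterised get_all_local_files, with get_all_local_text_files as its .txt counterpart. A usage sketch (paths are illustrative):

from pathlib import Path

from generic_rag.parsers.parser import get_all_local_files

paths = [Path("../transcriptions"), Path("notes.txt"), Path("report.pdf")]

txt_files = get_all_local_files(paths, "txt")
# Collects every *.txt directly inside ../transcriptions (no recursion),
# plus notes.txt; report.pdf only triggers a warning log and is skipped.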