From 996f3bf7a2d656cc1146a32fedd06daf906bf2b7 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Tue, 15 Apr 2025 16:22:55 +0200 Subject: [PATCH 01/12] =?UTF-8?q?=E2=9C=A8=20Yaml=20parser=20using=20Pydan?= =?UTF-8?q?tic=20classes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 + generic_rag/app.py | 112 ++++++------------ generic_rag/backend/models.py | 209 ++++++++++++++++++++++++---------- generic_rag/parsers/config.py | 177 ++++++++++++++++++++++++++++ generic_rag/parsers/parser.py | 7 +- 5 files changed, 370 insertions(+), 138 deletions(-) create mode 100644 generic_rag/parsers/config.py diff --git a/.gitignore b/.gitignore index 74598a5..6578b8f 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,6 @@ chainlit.md # Chroma DB .chroma_db/ + +# Settings +config.yaml \ No newline at end of file diff --git a/generic_rag/app.py b/generic_rag/app.py index 50ee910..d17b211 100644 --- a/generic_rag/app.py +++ b/generic_rag/app.py @@ -1,14 +1,15 @@ -import argparse import json import logging import os from pathlib import Path +import sys import chainlit as cl from chainlit.cli import run_chainlit from langchain_chroma import Chroma -from generic_rag.backend.models import ChatBackend, EmbeddingBackend, get_chat_model, get_embedding_model +from generic_rag.parsers.config import AppSettings, load_settings +from generic_rag.backend.models import get_chat_model, get_embedding_model from generic_rag.graphs.cond_ret_gen import CondRetGenLangGraph from generic_rag.graphs.ret_gen import RetGenLangGraph from generic_rag.parsers.parser import add_pdf_files, add_urls @@ -23,85 +24,36 @@ system_prompt = ( "If you don't know the answer, say that you don't know." ) -parser = argparse.ArgumentParser(description="A Sogeti Nederland Generic RAG demo.") -parser.add_argument( - "-c", - "--chat-backend", - type=ChatBackend, - choices=list(ChatBackend), - default=ChatBackend.local, - help="Cloud provider or local LLM to use as backend. In the case of 'local', Ollama needs to be installed.", -) -parser.add_argument( - "-e", - "--emb-backend", - type=EmbeddingBackend, - choices=list(EmbeddingBackend), - default=EmbeddingBackend.huggingface, - help="Cloud provider or local embedding to use as backend. In the case of 'local', Ollama needs to be installed. ", -) -parser.add_argument( - "-p", - "--pdf-data", - type=Path, - nargs="+", - default=[], - help="One or multiple paths to folders or files to use for retrieval. " - "If a path is a folder, all files in the folder will be used. " - "If a path is a file, only that file will be used. " - "If the path is relative it will be relative to the current working directory.", -) -parser.add_argument( - "-u", - "--unstructured-pdf", - action="store_true", - help="Use an unstructered PDF loader. " - "An unstructured PDF loader might be usefull for PDF files " - "that contain a lot of images with text, tables or (scanned) text as images. " - "Please use '-r' when switching parsers on already indexed data.", -) -parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.") -parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.") -parser.add_argument( - "--pdf-add-start-index", action="store_true", help="Add the start index to the metadata of the chunks." -) -parser.add_argument( - "-w", "--web-data", type=str, nargs="*", default=[], help="One or multiple URLs to use for retrieval." 
-) -parser.add_argument("--web-chunk-size", type=int, default=200, help="The size of the chunks to split the text into.") -parser.add_argument( - "-d", - "--chroma-db-location", - type=Path, - default=Path(".chroma_db"), - help="File path to store or load a Chroma DB from/to.", -) -parser.add_argument("-r", "--reset-chrome-db", action="store_true", help="Reset the Chroma DB.") -parser.add_argument( - "--use-conditional-graph", - action="store_true", - help="Use the conditial retrieve generate graph over the regular retrieve generate graph.", -) -args = parser.parse_args() +CONFIG_FILE_PATH = Path("config.yaml") + +try: + settings: AppSettings = load_settings(CONFIG_FILE_PATH) +except (FileNotFoundError, Exception) as e: + logger.error(f"Failed to load configuration from {CONFIG_FILE_PATH}. Exiting.") + sys.exit(1) + +embedding_function = get_embedding_model(settings) + +chat_function = get_chat_model(settings) vector_store = Chroma( collection_name="generic_rag", - embedding_function=get_embedding_model(args.emb_backend), - persist_directory=str(args.chroma_db_location), + embedding_function=embedding_function, + persist_directory=str(settings.chroma_db.location), ) -if args.use_conditional_graph: +if settings.use_conditional_graph: graph = CondRetGenLangGraph( vector_store=vector_store, - chat_model=get_chat_model(args.chat_backend), - embedding_model=get_embedding_model(args.emb_backend), + chat_model=chat_function, + embedding_model=embedding_function, system_prompt=system_prompt, ) else: graph = RetGenLangGraph( vector_store=vector_store, - chat_model=get_chat_model(args.chat_backend), - embedding_model=get_embedding_model(args.emb_backend), + chat_model=chat_function, + embedding_model=embedding_function, system_prompt=system_prompt, ) @@ -129,7 +81,9 @@ async def add_sources(chainlit_response: cl.Message, pdf_sources: dict, web_sour for source, page_numbers in pdf_sources.items(): filename = Path(source).name await chainlit_response.stream_token(f"- {filename} on page(s): {sorted(page_numbers)}\n") - chainlit_response.elements.append(cl.Pdf(name=filename, display="side", path=source, page=sorted(page_numbers)[0])) + chainlit_response.elements.append( + cl.Pdf(name=filename, display="side", path=source, page=sorted(page_numbers)[0]) + ) if len(web_sources) > 0: await chainlit_response.stream_token("\n\nThe following web sources were consulted:\n") @@ -159,17 +113,21 @@ async def set_starters(): if __name__ == "__main__": - if args.reset_chrome_db: + if settings.chroma_db.reset: vector_store.reset_collection() add_pdf_files( vector_store, - args.pdf_data, - args.pdf_chunk_size, - args.pdf_chunk_overlap, - args.pdf_add_start_index, - args.unstructured_pdf, + settings.pdf.data, + settings.pdf.chunk_size, + settings.pdf.chunk_overlap, + settings.pdf.add_start_index, + settings.pdf.unstructured, + ) + add_urls( + vector_store, + settings.web.data, + settings.web.chunk_size, ) - add_urls(vector_store, args.web_data, args.web_chunk_size) run_chainlit(__file__) diff --git a/generic_rag/backend/models.py b/generic_rag/backend/models.py index 1e100ef..a5a261f 100644 --- a/generic_rag/backend/models.py +++ b/generic_rag/backend/models.py @@ -1,85 +1,180 @@ -import os -from enum import Enum +import logging -from langchain.chat_models import init_chat_model -from langchain_aws import BedrockEmbeddings +from generic_rag.parsers.config import AppSettings, ChatBackend, EmbeddingBackend + +# Langchain imports from langchain_core.embeddings import Embeddings from 
langchain_core.language_models.chat_models import BaseChatModel -from langchain_google_vertexai import VertexAIEmbeddings +from langchain_aws import BedrockEmbeddings, ChatBedrock # Import ChatBedrock +from langchain_google_vertexai import VertexAIEmbeddings, ChatVertexAI # Import ChatVertexAI from langchain_huggingface import HuggingFaceEmbeddings from langchain_ollama import ChatOllama, OllamaEmbeddings -from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, OpenAIEmbeddings +from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, ChatOpenAI, OpenAIEmbeddings # Import ChatOpenAI + +logger = logging.getLogger(__name__) -class ChatBackend(Enum): - azure = "azure" - openai = "openai" - google_vertex = "google_vertex" - aws = "aws" - local = "local" +def get_chat_model(settings: AppSettings) -> BaseChatModel: + """ + Initializes and returns a chat model based on the backend type and configuration. - # make the enum pretty printable for argparse - def __str__(self): - return self.value + Args: + settings: The loaded AppSettings object containing configurations. + Returns: + An instance of BaseChatModel. -class EmbeddingBackend(Enum): - azure = "azure" - openai = "openai" - google_vertex = "google_vertex" - aws = "aws" - local = "local" - huggingface = "huggingface" + Raises: + ValueError: If the backend type is unknown or required configuration is missing. + """ + logger.info(f"Initializing chat model for backend: {settings.chat_backend.value}") - # make the enum pretty printable for argparse - def __str__(self): - return self.value - - -def get_chat_model(backend_type: ChatBackend) -> BaseChatModel: - if backend_type == ChatBackend.azure: + if settings.chat_backend == ChatBackend.azure: + if not settings.azure: + raise ValueError("Azure chat backend selected, but 'azure' configuration section is missing in config.") + if ( + not settings.azure.llm_endpoint + or not settings.azure.llm_deployment_name + or not settings.azure.llm_api_version + ): + raise ValueError( + "Azure configuration requires 'llm_endpoint', 'llm_deployment_name', and 'llm_api_version'." 
+ ) return AzureChatOpenAI( - azure_endpoint=os.environ["AZURE_LLM_ENDPOINT"], - azure_deployment=os.environ["AZURE_LLM_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_LLM_API_VERSION"], + azure_endpoint=settings.azure.llm_endpoint, + azure_deployment=settings.azure.llm_deployment_name, + openai_api_version=settings.azure.llm_api_version, + openai_api_key=settings.azure.openai_api_key.get_secret_value() if settings.azure.openai_api_key else None, ) - if backend_type == ChatBackend.openai: - return init_chat_model(os.environ["OPENAI_CHAT_MODEL"], model_provider="openai") + if settings.chat_backend == ChatBackend.openai: + if not settings.openai: + raise ValueError("OpenAI chat backend selected, but 'openai' configuration section is missing.") + if not settings.openai.api_key or not settings.openai.chat_model: + raise ValueError("OpenAI configuration requires 'api_key' and 'chat_model'.") + logger.info(f"Using OpenAI model: {model_name}") + return ChatOpenAI(model=settings.openai.chat_model, openai_api_key=settings.openai.api_key.get_secret_value()) - if backend_type == ChatBackend.google_vertex: - return init_chat_model(os.environ["GOOGLE_CHAT_MODEL"], model_provider="google_vertexai") + if settings.chat_backend == ChatBackend.google_vertex: + if not settings.google: + raise ValueError("Google Vertex chat backend selected, but 'google' configuration section is missing.") + if settings.google.chat_model: + model_name = settings.google.chat_model + logger.info(f"Using Google Vertex model: {model_name}") + return ChatVertexAI( + model_name=settings.google.chat_model, + project=settings.google.project_id, + location=settings.google.location, + ) - if backend_type == ChatBackend.aws: - return init_chat_model(model=os.environ["AWS_CHAT_MODEL"], model_provider="bedrock_converse") + if settings.chat_backend == ChatBackend.aws: + if not settings.aws: + raise ValueError("AWS Bedrock chat backend selected, but 'aws' configuration section is missing.") + model_name = "anthropic.claude-v2" # Example default + if hasattr(settings.aws, "chat_model") and settings.aws.chat_model: + model_name = settings.aws.chat_model + logger.info(f"Using AWS Bedrock model: {model_name}") + return ChatBedrock( + model_id=model_name, + region_name=settings.aws.region_name, + ) - if backend_type == ChatBackend.local: - return ChatOllama(model=os.environ["LOCAL_CHAT_MODEL"]) + if settings.chat_backend == ChatBackend.local: + if not settings.local or not settings.local.chat_model: + raise ValueError("Local chat backend selected, but 'local.chat_model' is missing in config.") + logger.info(f"Using Local Ollama model: {settings.local.chat_model}") + # Base URL can also be configured, e.g., base_url=config.local.ollama_base_url + return ChatOllama(model=settings.local.chat_model) - raise ValueError(f"Unknown backend type: {backend_type}") + # This should not be reached if all Enum members are handled + raise ValueError(f"Unknown or unhandled chat backend type: {settings.chat_backend}") -def get_embedding_model(backend_type: EmbeddingBackend) -> Embeddings: - if backend_type == EmbeddingBackend.azure: +def get_embedding_model(settings: AppSettings) -> Embeddings: + """ + Initializes and returns an embedding model based on the backend type and configuration. + + Args: + settings: The loaded AppSettings object containing configurations. + + Returns: + An instance of Embeddings. + + Raises: + ValueError: If the backend type is unknown or required configuration is missing. 
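+
+    Example (illustrative sketch, assuming a valid ``config.yaml`` in the
+    current working directory):
+        >>> from pathlib import Path
+        >>> from generic_rag.parsers.config import load_settings
+        >>> settings = load_settings(Path("config.yaml"))
+        >>> embeddings = get_embedding_model(settings)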
+ """ + logger.info(f"Initializing embedding model for backend: {settings.emb_backend.value}") + + if settings.emb_backend == EmbeddingBackend.azure: + if not settings.azure: + raise ValueError("Azure embedding backend selected, but 'azure' configuration section is missing.") + if ( + not settings.azure.emb_endpoint + or not settings.azure.emb_deployment_name + or not settings.azure.emb_api_version + ): + raise ValueError( + "Azure configuration requires 'emb_endpoint', 'emb_deployment_name', and 'emb_api_version'." + ) return AzureOpenAIEmbeddings( - azure_endpoint=os.environ["AZURE_EMB_ENDPOINT"], - azure_deployment=os.environ["AZURE_EMB_DEPLOYMENT_NAME"], - openai_api_version=os.environ["AZURE_EMB_API_VERSION"], + azure_endpoint=settings.azure.emb_endpoint, + azure_deployment=settings.azure.emb_deployment_name, + openai_api_version=settings.azure.emb_api_version, + openai_api_key=settings.azure.openai_api_key.get_secret_value() if settings.azure.openai_api_key else None, ) - if backend_type == EmbeddingBackend.openai: - return OpenAIEmbeddings(model=os.environ["OPENAI_EMB_MODEL"]) + if settings.emb_backend == EmbeddingBackend.openai: + if not settings.openai: + raise ValueError("OpenAI embedding backend selected, but 'openai' configuration section is missing.") + if not settings.openai.api_key: + raise ValueError("OpenAI configuration requires 'api_key'.") + model_name = "text-embedding-ada-002" # Example default + if hasattr(settings.openai, "emb_model") and settings.openai.emb_model: + model_name = settings.openai.emb_model + logger.info(f"Using OpenAI embedding model: {model_name}") + return OpenAIEmbeddings(model=model_name, openai_api_key=settings.openai.api_key.get_secret_value()) - if backend_type == EmbeddingBackend.google_vertex: - return VertexAIEmbeddings(model=os.environ["GOOGLE_EMB_MODEL"]) + if settings.emb_backend == EmbeddingBackend.google_vertex: + if not settings.google: + raise ValueError("Google Vertex embedding backend selected, but 'google' configuration section is missing.") + model_name = "textembedding-gecko@001" # Example default + if settings.google.emb_model: + model_name = settings.google.emb_model + logger.info(f"Using Google Vertex embedding model: {model_name}") + return VertexAIEmbeddings( + model_name=model_name, project=settings.google.project_id, location=settings.google.location + ) - if backend_type == EmbeddingBackend.aws: - return BedrockEmbeddings(model_id=os.environ["AWS_EMB_MODEL"]) + if settings.emb_backend == EmbeddingBackend.aws: + if not settings.aws: + raise ValueError("AWS Bedrock embedding backend selected, but 'aws' configuration section is missing.") + model_name = "amazon.titan-embed-text-v1" # Example default + if hasattr(settings.aws, "emb_model") and settings.aws.emb_model: + model_name = settings.aws.emb_model + logger.info(f"Using AWS Bedrock embedding model: {model_name}") + return BedrockEmbeddings(model_id=model_name, region_name=settings.aws.region_name) - if backend_type == EmbeddingBackend.local: - return OllamaEmbeddings(model=os.environ["LOCAL_EMB_MODEL"]) + if settings.emb_backend == EmbeddingBackend.local: + if not settings.local or not settings.local.emb_model: + raise ValueError("Local embedding backend selected, but 'local.emb_model' is missing in config.") + logger.info(f"Using Local Ollama embedding model: {settings.local.emb_model}") + return OllamaEmbeddings(model=settings.local.emb_model) - if backend_type == EmbeddingBackend.huggingface: - return 
HuggingFaceEmbeddings(model_name=os.environ["HUGGINGFACE_EMB_MODEL"]) + if settings.emb_backend == EmbeddingBackend.huggingface: + if not settings.huggingface or not settings.huggingface.emb_model: + if settings.local and settings.local.emb_model: + logger.warning( + "HuggingFace backend selected, but 'huggingface.emb_model' missing. Using 'local.emb_model'." + ) + model_name = settings.local.emb_model + else: + raise ValueError( + "HuggingFace embedding backend selected, but 'huggingface.emb_model' (or 'local.emb_model') is missing in config." + ) + else: + model_name = settings.huggingface.emb_model - raise ValueError(f"Unknown backend type: {backend_type}") + logger.info(f"Using HuggingFace embedding model: {model_name}") + return HuggingFaceEmbeddings(model_name=model_name) + + raise ValueError(f"Unknown or unhandled embedding backend type: {settings.emb_backend}") diff --git a/generic_rag/parsers/config.py b/generic_rag/parsers/config.py new file mode 100644 index 0000000..af5c9b0 --- /dev/null +++ b/generic_rag/parsers/config.py @@ -0,0 +1,177 @@ +import yaml +from pathlib import Path +from typing import List, Optional +from enum import Enum +from pydantic import ( + BaseModel, + Field, + ValidationError, + SecretStr, +) +import sys + + +class ChatBackend(str, Enum): + azure = "azure" + openai = "openai" + google_vertex = "google_vertex" + aws = "aws" + local = "local" + + def __str__(self): + return self.value + + +class EmbeddingBackend(str, Enum): + azure = "azure" + openai = "openai" + google_vertex = "google_vertex" + aws = "aws" + local = "local" + huggingface = "huggingface" + + def __str__(self): + return self.value + + +class AzureSettings(BaseModel): + """Azure specific settings.""" + + openai_api_key: Optional[SecretStr] = None + llm_endpoint: Optional[str] = None + llm_deployment_name: Optional[str] = None + llm_api_version: Optional[str] = None + emb_endpoint: Optional[str] = None + emb_deployment_name: Optional[str] = None + emb_api_version: Optional[str] = None + + +class OpenAISettings(BaseModel): + """OpenAI specific settings.""" + + api_key: Optional[SecretStr] = None + + +class GoogleSettings(BaseModel): + """Google specific settings (Vertex AI or GenAI).""" + + api_key: Optional[SecretStr] = None + project_id: Optional[str] = None + location: Optional[str] = None + chat_model: Optional[str] = None + emb_model: Optional[str] = None + + +class AwsSettings(BaseModel): + """AWS specific settings (e.g., for Bedrock).""" + + access_key_id: Optional[SecretStr] = None + secret_access_key: Optional[SecretStr] = None + region_name: Optional[str] = None + + +class LocalSettings(BaseModel): + """Local backend specific settings (e.g., Ollama models).""" + + chat_model: Optional[str] = None + emb_model: Optional[str] = None + + +class HuggingFaceSettings(BaseModel): + """HuggingFace specific settings (if different from local embeddings).""" + + emb_model: Optional[str] = None + api_token: Optional[SecretStr] = None + + +class PdfSettings(BaseModel): + """PDF processing settings.""" + + data: List[Path] = Field(default_factory=list) + unstructured: bool = Field(default=False) + chunk_size: int = Field(default=1000) + chunk_overlap: int = Field(default=200) + add_start_index: bool = Field(default=False) + + +class WebSettings(BaseModel): + """Web data processing settings.""" + + data: List[str] = Field(default_factory=list) + chunk_size: int = Field(default=200) + + +class ChromaDbSettings(BaseModel): + """Chroma DB settings.""" + + location: Path = 
Field(default=Path(".chroma_db")) + reset: bool = Field(default=False) + + +class AppSettings(BaseModel): + """ + Main application settings model. + + Loads configuration from a YAML file using the structure defined + by the nested models. + """ + + # --- Top-level settings --- + chat_backend: ChatBackend = Field(default=ChatBackend.local) + emb_backend: EmbeddingBackend = Field(default=EmbeddingBackend.huggingface) + use_conditional_graph: bool = Field(default=False) + + # --- Provider-specific settings --- + azure: Optional[AzureSettings] = None + openai: Optional[OpenAISettings] = None + google: Optional[GoogleSettings] = None + aws: Optional[AwsSettings] = None + local: Optional[LocalSettings] = None + huggingface: Optional[HuggingFaceSettings] = None # Separate HF config if needed + + # --- Data processing settings --- + pdf: PdfSettings = Field(default_factory=PdfSettings) + web: WebSettings = Field(default_factory=WebSettings) + chroma_db: ChromaDbSettings = Field(default_factory=ChromaDbSettings) + + +# --- Configuration Loading Function --- +def load_settings(config_path: Path = Path("config.yaml")) -> AppSettings: + """ + Loads settings from a YAML file and validates them using Pydantic models. + + Args: + config_path: The path to the configuration YAML file. + + Returns: + An instance of AppSettings containing the loaded configuration. + + Raises: + FileNotFoundError: If the config file does not exist. + yaml.YAMLError: If the file is not valid YAML. + ValidationError: If the data in the file doesn't match the AppSettings model. + """ + if not config_path.is_file(): + print(f"Error: Configuration file not found at '{config_path}'", file=sys.stderr) + raise FileNotFoundError(f"Configuration file not found: {config_path}") + + print(f"--- Loading settings from '{config_path}' ---") + try: + with open(config_path, "r", encoding="utf-8") as f: + config_data = yaml.safe_load(f) + if config_data is None: + config_data = {} + + settings = AppSettings(**config_data) + print("--- Settings loaded and validated successfully ---") + return settings + + except yaml.YAMLError as e: + print(f"Error parsing YAML file '{config_path}':\n {e}", file=sys.stderr) + raise + except ValidationError as e: + print(f"Error validating configuration from '{config_path}':\n{e}", file=sys.stderr) + raise + except Exception as e: + print(f"An unexpected error occurred while loading settings from '{config_path}': {e}", file=sys.stderr) + raise diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py index ef3b374..be5bd28 100644 --- a/generic_rag/parsers/parser.py +++ b/generic_rag/parsers/parser.py @@ -32,7 +32,7 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None: The URL's will be fetched and split into chunks of text with the provided chunk size. """ logger.info("Web sources to the vector store.") - + all_splits = [] for url in urls: if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0: @@ -87,7 +87,6 @@ def add_pdf_files( The PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap. 
""" logger.info("Adding PDF files to the vector store.") - pdf_files = get_all_local_pdf_files(file_paths) logger.info(f"Found {len(pdf_files)} PDF files to add to the vector store.") @@ -100,8 +99,8 @@ def add_pdf_files( if len(new_pdfs) == 0: return - - logger.info(f"{len(new_pdfs)} PDF's to add to the vector store.") + + logger.info(f"{len(new_pdfs)} PDF(s) to add to the vector store.") loaded_document = [] for file in new_pdfs: From 9777c2ff7432e39df2b446e1792f994343bf534d Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Wed, 16 Apr 2025 11:06:56 +0200 Subject: [PATCH 02/12] =?UTF-8?q?=F0=9F=9A=9A=20Rename=20everything=20to?= =?UTF-8?q?=20google=20vertex?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generic_rag/backend/models.py | 32 +++++++++++++++++++++----------- generic_rag/parsers/config.py | 7 +++---- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/generic_rag/backend/models.py b/generic_rag/backend/models.py index a5a261f..2065bd5 100644 --- a/generic_rag/backend/models.py +++ b/generic_rag/backend/models.py @@ -56,15 +56,17 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel: return ChatOpenAI(model=settings.openai.chat_model, openai_api_key=settings.openai.api_key.get_secret_value()) if settings.chat_backend == ChatBackend.google_vertex: - if not settings.google: - raise ValueError("Google Vertex chat backend selected, but 'google' configuration section is missing.") - if settings.google.chat_model: + if not settings.google_vertex: + raise ValueError( + "Google Vertex chat backend selected, but 'google_vertex' configuration section is missing." + ) + if settings.google_vertex.chat_model: model_name = settings.google.chat_model logger.info(f"Using Google Vertex model: {model_name}") return ChatVertexAI( - model_name=settings.google.chat_model, - project=settings.google.project_id, - location=settings.google.location, + model_name=settings.google_vertex.chat_model, + project=settings.google_vertex.project_id, + location=settings.google_vertex.location, ) if settings.chat_backend == ChatBackend.aws: @@ -135,14 +137,22 @@ def get_embedding_model(settings: AppSettings) -> Embeddings: return OpenAIEmbeddings(model=model_name, openai_api_key=settings.openai.api_key.get_secret_value()) if settings.emb_backend == EmbeddingBackend.google_vertex: - if not settings.google: - raise ValueError("Google Vertex embedding backend selected, but 'google' configuration section is missing.") + if not settings.google_vertex: + raise ValueError( + "Google Vertex embedding backend selected, but 'google_vertex' configuration section is missing." 
+ ) model_name = "textembedding-gecko@001" # Example default - if settings.google.emb_model: - model_name = settings.google.emb_model + if ( + not settings.google_vertex.emb_model + or not settings.google_vertex.project_id + or not settings.google_vertex.location + ): + raise ValueError("Google Vertex configuration requires 'emb_model', 'project_id', and 'location'.") logger.info(f"Using Google Vertex embedding model: {model_name}") return VertexAIEmbeddings( - model_name=model_name, project=settings.google.project_id, location=settings.google.location + model_name=settings.google_vertex.emb_model, + project=settings.google_vertex.project_id, + location=settings.google_vertex.location, ) if settings.emb_backend == EmbeddingBackend.aws: diff --git a/generic_rag/parsers/config.py b/generic_rag/parsers/config.py index af5c9b0..1815937 100644 --- a/generic_rag/parsers/config.py +++ b/generic_rag/parsers/config.py @@ -52,10 +52,9 @@ class OpenAISettings(BaseModel): api_key: Optional[SecretStr] = None -class GoogleSettings(BaseModel): - """Google specific settings (Vertex AI or GenAI).""" +class GoogleVertexSettings(BaseModel): + """Google Vertex specific settings.""" - api_key: Optional[SecretStr] = None project_id: Optional[str] = None location: Optional[str] = None chat_model: Optional[str] = None @@ -124,7 +123,7 @@ class AppSettings(BaseModel): # --- Provider-specific settings --- azure: Optional[AzureSettings] = None openai: Optional[OpenAISettings] = None - google: Optional[GoogleSettings] = None + google_vertex: Optional[GoogleVertexSettings] = None aws: Optional[AwsSettings] = None local: Optional[LocalSettings] = None huggingface: Optional[HuggingFaceSettings] = None # Separate HF config if needed From 498b06bb3bb1078310eea61a375e0db6dc6c431b Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Wed, 16 Apr 2025 11:07:30 +0200 Subject: [PATCH 03/12] =?UTF-8?q?=E2=9E=96=20Remove=20dotenv=20dependency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 1 - requirements.txt | 2 -- 2 files changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a0af033..bd09387 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,6 @@ requires-python = ">=3.12,<3.13" dependencies = [ "beautifulsoup4>=4.13.3", "chainlit>=2.3.0", - "dotenv>=0.9.9", "langchain>=0.3.20", "langchain-aws>=0.2.15", "langchain-chroma>=0.2.2", diff --git a/requirements.txt b/requirements.txt index cf3ffe8..46f5596 100644 --- a/requirements.txt +++ b/requirements.txt @@ -117,8 +117,6 @@ distro==1.9.0 # posthog docstring-parser==0.16 # via google-cloud-aiplatform -dotenv==0.9.9 - # via sogeti-generic-rag-demo (pyproject.toml) durationpy==0.9 # via kubernetes effdet==0.4.1 From f933ee0ccf0c8f85545473dca4a27750716d6930 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Wed, 16 Apr 2025 11:11:56 +0200 Subject: [PATCH 04/12] =?UTF-8?q?=E2=9C=A8=20Add=20config.yaml=20example?= =?UTF-8?q?=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.example.yaml | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 config.example.yaml diff --git a/config.example.yaml b/config.example.yaml new file mode 100644 index 0000000..2b778ce --- /dev/null +++ b/config.example.yaml @@ -0,0 +1,61 @@ +# Define your application settings here. 
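+#
+# Note: the application reads `config.yaml` (which is gitignored), so copy this
+# example file to `config.yaml` first and fill in only the sections for the
+# backends you select below; unused provider sections can be left out.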
+ +chat_backend: local # Select the primary chat backend (azure, openai, google_vertex, aws, local) +emb_backend: local # Select the primary embedding backend (azure, openai, google_vertex, aws, local, huggingface) + +use_conditional_graph: false # Use a conditional RAG model with historical chat context, or a non-conditional model without access to the current conversation + +# --- Provider Specific Settings --- + +azure: + openai_api_key: "your_openai_api_key" + llm_endpoint: "https://example.openai.azure.com" + llm_deployment_name: "gpt-4o-mini" + llm_api_version: "2025-01-01-preview" + emb_endpoint: "https://example.openai.azure.com" # Can be same as LLM endpoint + emb_deployment_name: "text-embedding-3-large" + emb_api_version: "2023-05-15" + +openai: + openai_api_key: "your_openai_api_key" + chat_model: "gpt-4o-mini" + emb_model: "text-embedding-3-large" + +google_vertex: + project_id: "your_gcp_project_id" + location: "europe-west4" + chat_model: "gemini-pro" + emb_model: "textembedding-gecko@001" + +aws: + region: "us-east-1" + credentials: "PATH_TO_YOUR_CREDENTIALS_FILE.json" + +local: # Settings for local models (e.g., Ollama) + chat_model: "llama3.1:8b" + emb_model: "llama3.1:8b" + +huggingface: # Settings specific to HuggingFace embedding backend + emb_model: "sentence-transformers/paraphrase-MiniLM-L12-v2" + +# --- Data Processing Settings --- + +pdf: + # List of paths to PDF files or folders containing PDFs. + # Pydantic converts these strings to pathlib.Path objects. + data: + - "C:/path/folder" + unstructured: false # Use the unstructured PDF loader? + chunk_size: 1000 + chunk_overlap: 200 + add_start_index: false + +web: + # List of URLs to scrape for data. + data: + - "https://www.example.nl/subdomain" + chunk_size: 200 + +chroma_db: + location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db') + reset: True # Reset the database on startup? (default: false) From 572a278a7d5761e16b3980d3374af40857c70068 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Wed, 16 Apr 2025 11:12:29 +0200 Subject: [PATCH 05/12] =?UTF-8?q?=F0=9F=93=9D=20Update=20readme=20with=20n?= =?UTF-8?q?ew=20config=20file?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 93 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 0457da3..c94f3fa 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,52 @@ # generic-RAG-demo -A Sogeti Nederland generic RAG demo +A generic Retrieval Augmented Generation (RAG) demo from Sogeti Netherlands built in Python. This project demonstrates how to integrate and run different backends, from cloud providers to local models, to parse and process your PDFs, web data, or other text sources. + +## Table of Contents + +- [generic-RAG-demo](#generic-rag-demo) + - [Table of Contents](#table-of-contents) + - [Features](#features) + - [Getting started](#getting-started) + - [Installation of system dependencies](#installation-of-system-dependencies) + - [Unstructered PDF loader (optional)](#unstructered-pdf-loader-optional) + - [Local LLM (optional)](#local-llm-optional) + - [Running generic RAG demo](#running-generic-rag-demo) + - [config.yaml file](#configyaml-file) + - [Chainlit starters](#chainlit-starters) + - [Dev details](#dev-details) + - [Linting](#linting) + +## Features + +- **Multi-backend Support:** Easily switch between cloud-based and local LLMs. 
+- **Flexible Data Input:** Supports both PDFs and web data ingestion.
+- **Configurable Workflows:** Customize settings via a central `config.yaml` file.
 
 ## Getting started
 
 ### Installation of system dependencies
 
+This project leverages a modern packaging method defined in `pyproject.toml`. After cloning the repository, you can install the project along with its dependencies. You have two options:
+1. Using uv
+
+If you're using uv, simply run:
+```bash
+uv sync
+```
+
+2. Using a Python Virtual Environment
+
+Alternatively, set up a virtual environment and install the project:
+```bash
+python -m venv .venv  # Create a new virtual environment named ".venv"
+source .venv/bin/activate  # Activate the virtual environment (use ".venv\Scripts\activate" on Windows)
+pip install .  # Install the project and its dependencies
+```
+
 #### Unstructered PDF loader (optional)
 
-If you would like to run the application using the unstructered PDF loader (`--unstructured-pdf` flag) you need to install two system dependencies.
+If you would like to run the application using the unstructured PDF loader (`pdf.unstructured` setting) you need to install two system dependencies.
 
 - [poppler-utils](https://launchpad.net/ubuntu/jammy/amd64/poppler-utils)
 - [tesseract-ocr](https://github.com/tesseract-ocr/tesseract?tab=readme-ov-file#installing-tesseract)
@@ -21,18 +59,19 @@ sudo apt install poppler-utils tesseract-ocr
 ```
 
 #### Local LLM (optional)
 
-If you would like to run the application using a local LLM backend (`-b local` flag), you need to install Ollama.
+If you would like to run the application using a local LLM backend (`local` settings), you need to install Ollama.
 
 ```bash
 curl -fsSL https://ollama.com/install.sh | sh # install Ollama
 ollama pull llama3.1:8b # fetch and download as model
 ```
 
-Include the downloaded model in the `.env` file:
+Include the downloaded model in the `config.yaml` file:
 
-```text
-LOCAL_CHAT_MODEL="llama3.1:8b"
-LOCAL_EMB_MODEL="llama3.1:8b"
+```yaml
+local:
+  chat_model: "llama3.1:8b"
+  emb_model: "llama3.1:8b"
 ```
 
 >For more information on installing Ollama, please refer to the Langchain Local LLM documentation, specifically the [Quickstart section](https://python.langchain.com/docs/how_to/local_llms/#quickstart).
@@ -52,14 +91,19 @@ python generic_rag/app.py -p data # will work and parsers all pdf files in ./da
 python generic_rag/app.py --help # will work and prints command line options
 ```
 
-Please configure your `.env` file with your cloud provider (backend) of choice and set the `--backend` flag accordingly.
+Please configure your `config.yaml` file with your cloud provider (backend) of choice. See the `config.example.yaml` file as a starting point that holds all possible options.
 
-### .env file
+### config.yaml file
 
-A .env file needs to be populated to configure API end-points or local back-ends using environment variables.
-Currently all required environment variables are defined in code at [backend/models.py](generic_rag/backend/models.py)
-with the exception of the API key variables itself.
-More information about configuring API endpoints for langchain can be found at the following locations.
+A config.yaml file is required to specify your API endpoints, local backends, and environment variables. Use the provided config.yaml.example as a starting point. Update the file according to your backend settings and project requirements.
+
+Key configuration points include:
+- Chat Backend: Choose among azure, openai, google_vertex, aws, or local.
+- Embedding Backend: Configure the embedding models similarly. +- Data Processing Settings: Define PDF and web data sources, chunk sizes, and overlap. +- Vector Database: Customize the path and reset behavior. + +For more information on configuring Langchain endpoints and models, please see: - [langchain cloud chat model doc](https://python.langchain.com/docs/integrations/chat/) - [langchain local chat model doc](https://python.langchain.com/docs/how_to/local_llms/) @@ -67,27 +111,6 @@ More information about configuring API endpoints for langchain can be found at t > for local models we currently use Ollama -An `.env` example is as followed. - -```text -# only one backend (azure, google, local, etc) is required. Please addjust the --backend flag accordingly - -AZURE_OPENAI_API_KEY="" -AZURE_LLM_ENDPOINT="https://.openai.azure.com" -AZURE_LLM_DEPLOYMENT_NAME="gpt-4" -AZURE_LLM_API_VERSION="2025-01-01-preview" -AZURE_EMB_ENDPOINT="https://.openai.azure.com" -AZURE_EMB_DEPLOYMENT_NAME="text-embedding-3-large" -AZURE_EMB_API_VERSION="2023-05-15" - -LOCAL_CHAT_MODEL="llama3.1:8b" -LOCAL_EMB_MODEL="llama3.1:8b" - -# google vertex AI does not use API keys but a seperate authentication method -GOOGLE_GENAI_CHAT_MODEL="gemini-2.0-flash" -GOOGLE_GENAI_EMB_MODEL="models/text-embedding-004" -``` - ### Chainlit starters Chainlit suggestions (starters) can be set with the `CHAINLIT_STARTERS` environment variable. @@ -102,4 +125,4 @@ CHAINLIT_STARTERS=[{"label":"Label 1","message":"Message one."},{"label":"Label ### Linting -Currently [Ruff](https://github.com/astral-sh/ruff) is used as Python linter. It is included in the [pyproject.toml](pyproject.toml) as `dev` dependency if your IDE needs that. However, for VS Code a [Ruff extension](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) excists. +Currently [Ruff](https://github.com/astral-sh/ruff) is used as Python linter. It is included in the [pyproject.toml](pyproject.toml) as `dev` dependency if your IDE needs that. However, for VS Code a [Ruff extension](https://marketplace.visualstudio.com/items?itemName=charliermarsh.ruff) exists. 
\ No newline at end of file From 770f341c1f6d812bb210711033ce1f272af955b7 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Wed, 16 Apr 2025 16:06:58 +0200 Subject: [PATCH 06/12] =?UTF-8?q?=F0=9F=8E=A8=20Cleanup=20model=20init=20?= =?UTF-8?q?=E2=9C=A8=20Add=20huggingface=20chat=20model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.example.yaml | 3 ++ generic_rag/backend/models.py | 97 ++++++++++++++++++----------------- generic_rag/parsers/config.py | 10 ++-- 3 files changed, 59 insertions(+), 51 deletions(-) diff --git a/config.example.yaml b/config.example.yaml index 2b778ce..12bd131 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -28,6 +28,8 @@ google_vertex: emb_model: "textembedding-gecko@001" aws: + chat_model: "amazon.titan-llm-v1" + emb_model: "amazon.titan-embed-text-v1" region: "us-east-1" credentials: "PATH_TO_YOUR_CREDENTIALS_FILE.json" @@ -36,6 +38,7 @@ local: # Settings for local models (e.g., Ollama) emb_model: "llama3.1:8b" huggingface: # Settings specific to HuggingFace embedding backend + chat_model: "meta-llama/Llama-2-7b-chat-hf" emb_model: "sentence-transformers/paraphrase-MiniLM-L12-v2" # --- Data Processing Settings --- diff --git a/generic_rag/backend/models.py b/generic_rag/backend/models.py index 2065bd5..e8a3063 100644 --- a/generic_rag/backend/models.py +++ b/generic_rag/backend/models.py @@ -2,14 +2,13 @@ import logging from generic_rag.parsers.config import AppSettings, ChatBackend, EmbeddingBackend -# Langchain imports from langchain_core.embeddings import Embeddings from langchain_core.language_models.chat_models import BaseChatModel -from langchain_aws import BedrockEmbeddings, ChatBedrock # Import ChatBedrock -from langchain_google_vertexai import VertexAIEmbeddings, ChatVertexAI # Import ChatVertexAI -from langchain_huggingface import HuggingFaceEmbeddings +from langchain_aws import BedrockEmbeddings, ChatBedrock +from langchain_google_vertexai import VertexAIEmbeddings, ChatVertexAI +from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, HuggingFacePipeline from langchain_ollama import ChatOllama, OllamaEmbeddings -from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, ChatOpenAI, OpenAIEmbeddings # Import ChatOpenAI +from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, ChatOpenAI, OpenAIEmbeddings logger = logging.getLogger(__name__) @@ -52,7 +51,6 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel: raise ValueError("OpenAI chat backend selected, but 'openai' configuration section is missing.") if not settings.openai.api_key or not settings.openai.chat_model: raise ValueError("OpenAI configuration requires 'api_key' and 'chat_model'.") - logger.info(f"Using OpenAI model: {model_name}") return ChatOpenAI(model=settings.openai.chat_model, openai_api_key=settings.openai.api_key.get_secret_value()) if settings.chat_backend == ChatBackend.google_vertex: @@ -60,9 +58,12 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel: raise ValueError( "Google Vertex chat backend selected, but 'google_vertex' configuration section is missing." 
) - if settings.google_vertex.chat_model: - model_name = settings.google.chat_model - logger.info(f"Using Google Vertex model: {model_name}") + if ( + not settings.google_vertex.chat_model + or not settings.google_vertex.project_id + or not settings.google_vertex.location + ): + raise ValueError("Google Vertex configuration requires 'chat_model' and 'project_id'.") return ChatVertexAI( model_name=settings.google_vertex.chat_model, project=settings.google_vertex.project_id, @@ -72,22 +73,35 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel: if settings.chat_backend == ChatBackend.aws: if not settings.aws: raise ValueError("AWS Bedrock chat backend selected, but 'aws' configuration section is missing.") - model_name = "anthropic.claude-v2" # Example default - if hasattr(settings.aws, "chat_model") and settings.aws.chat_model: - model_name = settings.aws.chat_model - logger.info(f"Using AWS Bedrock model: {model_name}") + if not settings.aws.chat_model or not settings.aws.region_name: + raise ValueError("AWS Bedrock configuration requires 'chat_model' and 'region_name'") return ChatBedrock( - model_id=model_name, + model_id=settings.aws.chat_model, region_name=settings.aws.region_name, ) if settings.chat_backend == ChatBackend.local: - if not settings.local or not settings.local.chat_model: - raise ValueError("Local chat backend selected, but 'local.chat_model' is missing in config.") - logger.info(f"Using Local Ollama model: {settings.local.chat_model}") - # Base URL can also be configured, e.g., base_url=config.local.ollama_base_url + if not settings.local: + raise ValueError("Local chat backend selected, but 'local' configuration section is missing.") + if not settings.local.chat_model: + raise ValueError("Local configuration requires 'chat_model'") return ChatOllama(model=settings.local.chat_model) + if settings.chat_backend == ChatBackend.huggingface: + if not settings.huggingface: + raise ValueError("Huggingface chat backend selected, but 'huggingface' configuration section is missing.") + if not settings.huggingface.chat_model: + raise ValueError("Huggingface configuration requires 'chat_model'") + llm = HuggingFacePipeline.from_model_id( + model_id=settings.huggingface.chat_model, + task="text-generation", + pipeline_kwargs=dict( + max_new_tokens=512, + do_sample=False, + repetition_penalty=1.03, + ), + ) + return ChatHuggingFace(llm=llm) # This should not be reached if all Enum members are handled raise ValueError(f"Unknown or unhandled chat backend type: {settings.chat_backend}") @@ -130,25 +144,21 @@ def get_embedding_model(settings: AppSettings) -> Embeddings: raise ValueError("OpenAI embedding backend selected, but 'openai' configuration section is missing.") if not settings.openai.api_key: raise ValueError("OpenAI configuration requires 'api_key'.") - model_name = "text-embedding-ada-002" # Example default - if hasattr(settings.openai, "emb_model") and settings.openai.emb_model: - model_name = settings.openai.emb_model - logger.info(f"Using OpenAI embedding model: {model_name}") - return OpenAIEmbeddings(model=model_name, openai_api_key=settings.openai.api_key.get_secret_value()) + return OpenAIEmbeddings( + model=settings.openai.emb_model, openai_api_key=settings.openai.api_key.get_secret_value() + ) if settings.emb_backend == EmbeddingBackend.google_vertex: if not settings.google_vertex: raise ValueError( "Google Vertex embedding backend selected, but 'google_vertex' configuration section is missing." 
) - model_name = "textembedding-gecko@001" # Example default if ( not settings.google_vertex.emb_model or not settings.google_vertex.project_id or not settings.google_vertex.location ): raise ValueError("Google Vertex configuration requires 'emb_model', 'project_id', and 'location'.") - logger.info(f"Using Google Vertex embedding model: {model_name}") return VertexAIEmbeddings( model_name=settings.google_vertex.emb_model, project=settings.google_vertex.project_id, @@ -158,33 +168,24 @@ def get_embedding_model(settings: AppSettings) -> Embeddings: if settings.emb_backend == EmbeddingBackend.aws: if not settings.aws: raise ValueError("AWS Bedrock embedding backend selected, but 'aws' configuration section is missing.") - model_name = "amazon.titan-embed-text-v1" # Example default - if hasattr(settings.aws, "emb_model") and settings.aws.emb_model: - model_name = settings.aws.emb_model - logger.info(f"Using AWS Bedrock embedding model: {model_name}") - return BedrockEmbeddings(model_id=model_name, region_name=settings.aws.region_name) + if not settings.aws.emb_model or not settings.aws.region_name: + raise ValueError("AWS Bedrock configuration requires 'emb_model' and 'region_name'") + return BedrockEmbeddings(model_id=settings.aws.emb_model, region_name=settings.aws.region_name) if settings.emb_backend == EmbeddingBackend.local: - if not settings.local or not settings.local.emb_model: - raise ValueError("Local embedding backend selected, but 'local.emb_model' is missing in config.") - logger.info(f"Using Local Ollama embedding model: {settings.local.emb_model}") + if not settings.local: + raise ValueError("Local embedding backend selected, but 'local' configuration section is missing.") + if not settings.local.emb_model: + raise ValueError("Local configuration requires 'emb_model'") return OllamaEmbeddings(model=settings.local.emb_model) if settings.emb_backend == EmbeddingBackend.huggingface: - if not settings.huggingface or not settings.huggingface.emb_model: - if settings.local and settings.local.emb_model: - logger.warning( - "HuggingFace backend selected, but 'huggingface.emb_model' missing. Using 'local.emb_model'." - ) - model_name = settings.local.emb_model - else: - raise ValueError( - "HuggingFace embedding backend selected, but 'huggingface.emb_model' (or 'local.emb_model') is missing in config." - ) - else: - model_name = settings.huggingface.emb_model - - logger.info(f"Using HuggingFace embedding model: {model_name}") - return HuggingFaceEmbeddings(model_name=model_name) + if not settings.huggingface: + raise ValueError( + "HuggingFace embedding backend selected, but 'huggingface' configuration section is missing." 
+ ) + if not settings.huggingface.emb_model: + raise ValueError("HuggingFace configuration requires 'emb_model'.") + return HuggingFaceEmbeddings(model_name=settings.huggingface.emb_model) raise ValueError(f"Unknown or unhandled embedding backend type: {settings.emb_backend}") diff --git a/generic_rag/parsers/config.py b/generic_rag/parsers/config.py index 1815937..6ce2402 100644 --- a/generic_rag/parsers/config.py +++ b/generic_rag/parsers/config.py @@ -17,6 +17,7 @@ class ChatBackend(str, Enum): google_vertex = "google_vertex" aws = "aws" local = "local" + huggingface = "huggingface" def __str__(self): return self.value @@ -50,6 +51,8 @@ class OpenAISettings(BaseModel): """OpenAI specific settings.""" api_key: Optional[SecretStr] = None + chat_model: Optional[str] = None + emb_model: Optional[str] = None class GoogleVertexSettings(BaseModel): @@ -64,9 +67,9 @@ class GoogleVertexSettings(BaseModel): class AwsSettings(BaseModel): """AWS specific settings (e.g., for Bedrock).""" - access_key_id: Optional[SecretStr] = None - secret_access_key: Optional[SecretStr] = None - region_name: Optional[str] = None + chat_model: Optional[str] = None + emb_model: Optional[str] = None + region: Optional[str] = None class LocalSettings(BaseModel): @@ -79,6 +82,7 @@ class LocalSettings(BaseModel): class HuggingFaceSettings(BaseModel): """HuggingFace specific settings (if different from local embeddings).""" + chat_model: Optional[str] = None emb_model: Optional[str] = None api_token: Optional[SecretStr] = None From a58ef2f365c59c06e06e24dabe792a1480a6138f Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Thu, 17 Apr 2025 08:32:38 +0200 Subject: [PATCH 07/12] =?UTF-8?q?=F0=9F=8E=A8=20Use=20project=20root=20ins?= =?UTF-8?q?tead=20of=20relative=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generic_rag/app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/generic_rag/app.py b/generic_rag/app.py index d17b211..d682d1a 100644 --- a/generic_rag/app.py +++ b/generic_rag/app.py @@ -24,7 +24,8 @@ system_prompt = ( "If you don't know the answer, say that you don't know." ) -CONFIG_FILE_PATH = Path("config.yaml") +PROJECT_ROOT = Path(__file__).resolve().parent.parent +CONFIG_FILE_PATH = PROJECT_ROOT / "config.yaml" try: settings: AppSettings = load_settings(CONFIG_FILE_PATH) From 9935d9e8d379e3fd035a21afc1cc3d5fadfdfbc9 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Thu, 17 Apr 2025 13:06:27 +0200 Subject: [PATCH 08/12] =?UTF-8?q?=E2=9C=A8=20Change=20config=20file=20loca?= =?UTF-8?q?tion=20using=20argument=20parser?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generic_rag/app.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/generic_rag/app.py b/generic_rag/app.py index d682d1a..d1383b7 100644 --- a/generic_rag/app.py +++ b/generic_rag/app.py @@ -1,3 +1,4 @@ +import argparse import json import logging import os @@ -17,6 +18,8 @@ from generic_rag.parsers.parser import add_pdf_files, add_urls logger = logging.getLogger("sogeti-rag") logger.setLevel(logging.DEBUG) +PROJECT_ROOT = Path(__file__).resolve().parent.parent + system_prompt = ( "You are an assistant for question-answering tasks. " "If the question is in Dutch, answer in Dutch. If the question is in English, answer in English." @@ -24,13 +27,21 @@ system_prompt = ( "If you don't know the answer, say that you don't know." 
) -PROJECT_ROOT = Path(__file__).resolve().parent.parent -CONFIG_FILE_PATH = PROJECT_ROOT / "config.yaml" +parser = argparse.ArgumentParser(description="A Sogeti Netherlands Generic RAG demo.") +parser.add_argument( + "-c", + "--config", + type=Path, + default=PROJECT_ROOT / "config.yaml", + help="Path to configuration file (YAML format). Defaults to 'config.yaml' in project root.", +) + +args = parser.parse_args() try: - settings: AppSettings = load_settings(CONFIG_FILE_PATH) + settings: AppSettings = load_settings(args.config) except (FileNotFoundError, Exception) as e: - logger.error(f"Failed to load configuration from {CONFIG_FILE_PATH}. Exiting.") + logger.error(f"Failed to load configuration from {args.config}. Exiting.") sys.exit(1) embedding_function = get_embedding_model(settings) From adcaceab8a6017745cdd6db34e8e73b1e8e49517 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Thu, 17 Apr 2025 13:16:41 +0200 Subject: [PATCH 09/12] =?UTF-8?q?=F0=9F=8E=A8=20Fix=20import=20order?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- generic_rag/backend/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/generic_rag/backend/models.py b/generic_rag/backend/models.py index e8a3063..cf21a3e 100644 --- a/generic_rag/backend/models.py +++ b/generic_rag/backend/models.py @@ -1,7 +1,5 @@ import logging -from generic_rag.parsers.config import AppSettings, ChatBackend, EmbeddingBackend - from langchain_core.embeddings import Embeddings from langchain_core.language_models.chat_models import BaseChatModel from langchain_aws import BedrockEmbeddings, ChatBedrock @@ -10,6 +8,8 @@ from langchain_huggingface import HuggingFaceEmbeddings, ChatHuggingFace, Huggin from langchain_ollama import ChatOllama, OllamaEmbeddings from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings, ChatOpenAI, OpenAIEmbeddings +from generic_rag.parsers.config import AppSettings, ChatBackend, EmbeddingBackend + logger = logging.getLogger(__name__) From 6b4dfa13b6cd26084333856ba3449ffdecb24087 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Thu, 17 Apr 2025 13:47:36 +0200 Subject: [PATCH 10/12] =?UTF-8?q?=F0=9F=93=9D=20Add=20separate=20project?= =?UTF-8?q?=20env=20setup=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c94f3fa..9980400 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ A generic Retrieval Augmented Generation (RAG) demo from Sogeti Netherlands buil - [Table of Contents](#table-of-contents) - [Features](#features) - [Getting started](#getting-started) + - [Project Environment Setup](#project-environment-setup) - [Installation of system dependencies](#installation-of-system-dependencies) - [Unstructered PDF loader (optional)](#unstructered-pdf-loader-optional) - [Local LLM (optional)](#local-llm-optional) @@ -25,7 +26,7 @@ A generic Retrieval Augmented Generation (RAG) demo from Sogeti Netherlands buil ## Getting started -### Installation of system dependencies +### Project Environment Setup This project leverages a modern packaging method defined in `pyproject.toml`. After cloning the repository, you can install the project along with its dependencies. You have two options: 1. Using uv @@ -44,6 +45,10 @@ source .venv/bin/activate # Activate the virtual environment (use ".venv\Scrip pip install . 
# Install the project and its dependencies ``` +### Installation of system dependencies + +Some optional features require additional system applications to be installed. + #### Unstructered PDF loader (optional) If you would like to run the application using the unstructered PDF loader (`pdf.unstructured` setting) you need to install two system dependencies. From 26b374cf41152d000c5e78878ef359e97da6625a Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Thu, 17 Apr 2025 13:59:09 +0200 Subject: [PATCH 11/12] =?UTF-8?q?=E2=8F=AA=20Add=20back=20.env=20package?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 1 + requirements.txt | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index bd09387..a0af033 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.12,<3.13" dependencies = [ "beautifulsoup4>=4.13.3", "chainlit>=2.3.0", + "dotenv>=0.9.9", "langchain>=0.3.20", "langchain-aws>=0.2.15", "langchain-chroma>=0.2.2", diff --git a/requirements.txt b/requirements.txt index 46f5596..cf3ffe8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -117,6 +117,8 @@ distro==1.9.0 # posthog docstring-parser==0.16 # via google-cloud-aiplatform +dotenv==0.9.9 + # via sogeti-generic-rag-demo (pyproject.toml) durationpy==0.9 # via kubernetes effdet==0.4.1 From 74dd3b6947230e41f671956d1d53eedc17c44ee1 Mon Sep 17 00:00:00 2001 From: Ruben Lucas Date: Fri, 18 Apr 2025 11:42:40 +0200 Subject: [PATCH 12/12] =?UTF-8?q?=F0=9F=8E=A8=20Add=20.env=20only=20for=20?= =?UTF-8?q?API=20keys?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 5 +--- README.md | 14 ++++++++-- config.example.yaml => config.yaml | 11 +++----- generic_rag/backend/models.py | 45 +++++++++++++++++++----------- generic_rag/parsers/config.py | 4 --- 5 files changed, 46 insertions(+), 33 deletions(-) rename config.example.yaml => config.yaml (80%) diff --git a/.gitignore b/.gitignore index 6578b8f..d7ad83d 100644 --- a/.gitignore +++ b/.gitignore @@ -166,7 +166,4 @@ chainlit.md .files/ # Chroma DB -.chroma_db/ - -# Settings -config.yaml \ No newline at end of file +.chroma_db/ \ No newline at end of file diff --git a/README.md b/README.md index 9980400..d560fde 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ A generic Retrieval Augmented Generation (RAG) demo from Sogeti Netherlands buil - [Local LLM (optional)](#local-llm-optional) - [Running generic RAG demo](#running-generic-rag-demo) - [config.yaml file](#configyaml-file) + - [.env file](#env-file) - [Chainlit starters](#chainlit-starters) - [Dev details](#dev-details) - [Linting](#linting) @@ -96,11 +97,11 @@ python generic_rag/app.py -p data # will work and parsers all pdf files in ./da python generic_rag/app.py --help # will work and prints command line options ``` -Please configure your `config.yaml` file with your cloud provider (backend) of choice. See the `config.example.yaml` file as a starting point that holds all possible options. +Please configure your `config.yaml` and `.env` file with your cloud provider (backend) of choice. See the sections below for more details. ### config.yaml file -A config.yaml file is required to specify your API endpoints, local backends, and environment variables. Use the provided config.yaml.example as a starting point. Update the file according to your backend settings and project requirements. 
+A config.yaml file is required to specify your API endpoints and local backends. Use the provided `config.yaml` as a starting point. Update the file according to your backend settings and project requirements.
 
 Key configuration points include:
 - Chat Backend: Choose among azure, openai, google_vertex, aws, or local.
@@ -116,6 +117,15 @@ For more information on configuring Langchain endpoints and models, please see:
 
 > for local models we currently use Ollama
 
+### .env file
+
+Set the API keys for your chosen cloud provider (backend) in this file so that the application can authenticate with those services.
+
+```text
+AZURE_OPENAI_API_KEY=your_azure_api_key
+OPENAI_API_KEY=your_openai_api_key
+```
+
 ### Chainlit starters
 
 Chainlit suggestions (starters) can be set with the `CHAINLIT_STARTERS` environment variable.
diff --git a/config.example.yaml b/config.yaml
similarity index 80%
rename from config.example.yaml
rename to config.yaml
index 12bd131..4c877d8 100644
--- a/config.example.yaml
+++ b/config.yaml
@@ -8,16 +8,14 @@ use_conditional_graph: false # Use a conditional RAG model with historical chat
 # --- Provider Specific Settings ---
 azure:
-  openai_api_key: "your_openai_api_key"
   llm_endpoint: "https://example.openai.azure.com"
   llm_deployment_name: "gpt-4o-mini"
   llm_api_version: "2025-01-01-preview"
-  emb_endpoint: "https://example.openai.azure.com" # Can be same as LLM endpoint
+  emb_endpoint: "https://example.openai.azure.com"
   emb_deployment_name: "text-embedding-3-large"
   emb_api_version: "2023-05-15"
 
 openai:
-  openai_api_key: "your_openai_api_key"
   chat_model: "gpt-4o-mini"
   emb_model: "text-embedding-3-large"
@@ -31,15 +29,14 @@ aws:
   chat_model: "amazon.titan-llm-v1"
   emb_model: "amazon.titan-embed-text-v1"
   region: "us-east-1"
-  credentials: "PATH_TO_YOUR_CREDENTIALS_FILE.json"
 
 local: # Settings for local models (e.g., Ollama)
   chat_model: "llama3.1:8b"
   emb_model: "llama3.1:8b"
 
-huggingface: # Settings specific to HuggingFace embedding backend
+huggingface:
   chat_model: "meta-llama/Llama-2-7b-chat-hf"
-  emb_model: "sentence-transformers/paraphrase-MiniLM-L12-v2"
+  emb_model: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 
 
 # --- Data Processing Settings ---
@@ -61,4 +58,4 @@ web:
 
 chroma_db:
   location: "/app/data/vector_database" # Override default DB path (default: '.chroma_db')
-  reset: True # Reset the database on startup? (default: false)
+  reset: false # Reset the database on startup? (default: false)
diff --git a/generic_rag/backend/models.py b/generic_rag/backend/models.py
index cf21a3e..3f2d0d6 100644
--- a/generic_rag/backend/models.py
+++ b/generic_rag/backend/models.py
@@ -1,4 +1,5 @@
 import logging
+import os
 
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models.chat_models import BaseChatModel
@@ -39,19 +40,26 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel:
             raise ValueError(
                 "Azure configuration requires 'llm_endpoint', 'llm_deployment_name', and 'llm_api_version'."
             )
+        if "AZURE_OPENAI_API_KEY" not in os.environ:
+            raise ValueError(
+                "The environment variable 'AZURE_OPENAI_API_KEY' is missing. Please set the variable in your '.env' file before running the script."
+            )
         return AzureChatOpenAI(
             azure_endpoint=settings.azure.llm_endpoint,
             azure_deployment=settings.azure.llm_deployment_name,
             openai_api_version=settings.azure.llm_api_version,
-            openai_api_key=settings.azure.openai_api_key.get_secret_value() if settings.azure.openai_api_key else None,
         )
 
     if settings.chat_backend == ChatBackend.openai:
         if not settings.openai:
             raise ValueError("OpenAI chat backend selected, but 'openai' configuration section is missing.")
-        if not settings.openai.api_key or not settings.openai.chat_model:
-            raise ValueError("OpenAI configuration requires 'api_key' and 'chat_model'.")
-        return ChatOpenAI(model=settings.openai.chat_model, openai_api_key=settings.openai.api_key.get_secret_value())
+        if not settings.openai.chat_model:
+            raise ValueError("OpenAI configuration requires 'chat_model'.")
+        if "OPENAI_API_KEY" not in os.environ:
+            raise ValueError(
+                "The environment variable 'OPENAI_API_KEY' is missing. Please set the variable in your '.env' file before running the script."
+            )
+        return ChatOpenAI(model=settings.openai.chat_model)
 
     if settings.chat_backend == ChatBackend.google_vertex:
         if not settings.google_vertex:
@@ -63,7 +71,7 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel:
             or not settings.google_vertex.project_id
             or not settings.google_vertex.location
         ):
-            raise ValueError("Google Vertex configuration requires 'chat_model' and 'project_id'.")
+            raise ValueError("Google Vertex configuration requires 'chat_model', 'project_id' and 'location'.")
         return ChatVertexAI(
             model_name=settings.google_vertex.chat_model,
             project=settings.google_vertex.project_id,
@@ -74,10 +82,10 @@ def get_chat_model(settings: AppSettings) -> BaseChatModel:
         if not settings.aws:
             raise ValueError("AWS Bedrock chat backend selected, but 'aws' configuration section is missing.")
-        if not settings.aws.chat_model or not settings.aws.region_name:
-            raise ValueError("AWS Bedrock configuration requires 'chat_model' and 'region_name'")
+        if not settings.aws.chat_model or not settings.aws.region:
+            raise ValueError("AWS Bedrock configuration requires 'chat_model' and 'region'.")
         return ChatBedrock(
             model_id=settings.aws.chat_model,
-            region_name=settings.aws.region_name,
+            region_name=settings.aws.region,
         )
 
     if settings.chat_backend == ChatBackend.local:
@@ -132,21 +140,26 @@ def get_embedding_model(settings: AppSettings) -> Embeddings:
             raise ValueError(
                 "Azure configuration requires 'emb_endpoint', 'emb_deployment_name', and 'emb_api_version'."
             )
+        if "AZURE_OPENAI_API_KEY" not in os.environ:
+            raise ValueError(
+                "The environment variable 'AZURE_OPENAI_API_KEY' is missing. Please set the variable in your '.env' file before running the script."
+            )
         return AzureOpenAIEmbeddings(
             azure_endpoint=settings.azure.emb_endpoint,
             azure_deployment=settings.azure.emb_deployment_name,
             openai_api_version=settings.azure.emb_api_version,
-            openai_api_key=settings.azure.openai_api_key.get_secret_value() if settings.azure.openai_api_key else None,
         )
 
     if settings.emb_backend == EmbeddingBackend.openai:
         if not settings.openai:
             raise ValueError("OpenAI embedding backend selected, but 'openai' configuration section is missing.")
-        if not settings.openai.api_key:
-            raise ValueError("OpenAI configuration requires 'api_key'.")
-        return OpenAIEmbeddings(
-            model=settings.openai.emb_model, openai_api_key=settings.openai.api_key.get_secret_value()
-        )
+        if not settings.openai.emb_model:
+            raise ValueError("OpenAI configuration requires 'emb_model'.")
+        if "OPENAI_API_KEY" not in os.environ:
+            raise ValueError(
+                "The environment variable 'OPENAI_API_KEY' is missing. Please set the variable in your '.env' file before running the script."
+            )
+        return OpenAIEmbeddings(model=settings.openai.emb_model)
 
     if settings.emb_backend == EmbeddingBackend.google_vertex:
         if not settings.google_vertex:
@@ -168,9 +181,9 @@ def get_embedding_model(settings: AppSettings) -> Embeddings:
     if settings.emb_backend == EmbeddingBackend.aws:
         if not settings.aws:
             raise ValueError("AWS Bedrock embedding backend selected, but 'aws' configuration section is missing.")
-        if not settings.aws.emb_model or not settings.aws.region_name:
-            raise ValueError("AWS Bedrock configuration requires 'emb_model' and 'region_name'")
-        return BedrockEmbeddings(model_id=settings.aws.emb_model, region_name=settings.aws.region_name)
+        if not settings.aws.emb_model or not settings.aws.region:
+            raise ValueError("AWS Bedrock configuration requires 'emb_model' and 'region'.")
+        return BedrockEmbeddings(model_id=settings.aws.emb_model, region_name=settings.aws.region)
 
     if settings.emb_backend == EmbeddingBackend.local:
         if not settings.local:
diff --git a/generic_rag/parsers/config.py b/generic_rag/parsers/config.py
index 6ce2402..7ef2dee 100644
--- a/generic_rag/parsers/config.py
+++ b/generic_rag/parsers/config.py
@@ -6,7 +6,6 @@ from pydantic import (
     BaseModel,
     Field,
     ValidationError,
-    SecretStr,
 )
 import sys
@@ -38,7 +37,6 @@ class EmbeddingBackend(str, Enum):
 class AzureSettings(BaseModel):
     """Azure specific settings."""
 
-    openai_api_key: Optional[SecretStr] = None
     llm_endpoint: Optional[str] = None
     llm_deployment_name: Optional[str] = None
     llm_api_version: Optional[str] = None
@@ -50,7 +48,6 @@ class AzureSettings(BaseModel):
 class OpenAISettings(BaseModel):
     """OpenAI specific settings."""
 
-    api_key: Optional[SecretStr] = None
     chat_model: Optional[str] = None
     emb_model: Optional[str] = None
@@ -84,7 +81,6 @@ class HuggingFaceSettings(BaseModel):
 
     chat_model: Optional[str] = None
     emb_model: Optional[str] = None
-    api_token: Optional[SecretStr] = None
 
 
 class PdfSettings(BaseModel):
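
Taken together, patches 11 and 12 split configuration into two layers: non-secret settings live in `config.yaml` and are validated by the Pydantic models in `generic_rag/parsers/config.py`, while API keys live in an untracked `.env` file and are read through `os.environ`. The snippet below is a minimal sketch of the resulting startup flow, assuming `python-dotenv` (installed via the `dotenv` dependency added in patch 11) is what populates the environment; the series adds the dependency but does not show a `load_dotenv()` call site, so that line is an assumption rather than code from the patches.

```python
# Minimal sketch of the startup flow after this series; illustrative, not code from the patches.
from pathlib import Path

from dotenv import load_dotenv  # provided by python-dotenv, pulled in via the "dotenv" dependency

from generic_rag.backend.models import get_chat_model, get_embedding_model
from generic_rag.parsers.config import load_settings

# Secrets: .env -> os.environ (e.g. AZURE_OPENAI_API_KEY, OPENAI_API_KEY).
# Assumption: the app (or Chainlit itself) performs an equivalent load; the patches do not show it.
load_dotenv()

# Non-secret settings: config.yaml -> validated AppSettings instance.
settings = load_settings(Path("config.yaml"))

# The factories read the validated settings and fail fast with a ValueError
# when a required setting or environment variable is missing.
chat_model = get_chat_model(settings)
embedding_model = get_embedding_model(settings)
```

Keeping the key lookup in `os.environ` rather than in the Pydantic settings is what allows patch 12 to commit `config.yaml` and drop it from `.gitignore`: the file no longer contains anything secret.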