From 2ba3eadec4751033bb2a51f6332d7d426f1cd1f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nielson=20Jann=C3=A9?= Date: Mon, 17 Mar 2025 14:25:39 +0100 Subject: [PATCH] Add some doc-strings --- generic_rag/parsers/parser.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py index eaf885a..36f0866 100644 --- a/generic_rag/parsers/parser.py +++ b/generic_rag/parsers/parser.py @@ -27,6 +27,11 @@ def code_handler(element: Tag) -> str: def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None: + """ + Adds a list of URLs as vector documents to the provided vector store. + + The URLs will be fetched and split into chunks of text with the provided chunk size. + """ all_splits = [] for url in urls: if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0: @@ -72,6 +77,11 @@ def add_pdf_files( add_start_index: bool, unstructerd: bool, ) -> None: + """ + Adds a list of PDF files as vector documents to the provided vector store. + + Each PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap. + """ pdf_files = get_all_local_pdf_files(file_paths) new_pdfs = []