diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py index eaf885a..36f0866 100644 --- a/generic_rag/parsers/parser.py +++ b/generic_rag/parsers/parser.py @@ -27,6 +27,11 @@ def code_handler(element: Tag) -> str: def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None: + """ + Adds a list of URLs as vector documents to the provided vector store. + + The URLs will be fetched and split into chunks of text with the provided chunk size. + """ all_splits = [] for url in urls: if len(vector_store.get(where={"source": url}, limit=1)["ids"]) > 0: @@ -72,6 +77,11 @@ def add_pdf_files( add_start_index: bool, unstructerd: bool, ) -> None: + """ + Adds a list of PDF files as vector documents to the provided vector store. + + Each PDF file will be parsed per page and split into chunks of text with the provided chunk size and overlap. + """ pdf_files = get_all_local_pdf_files(file_paths) new_pdfs = []