From e79d7b9867927391fd36ea8ffef5c5138b2f5180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nielson=20Jann=C3=A9?= Date: Mon, 17 Mar 2025 11:57:10 +0100 Subject: [PATCH] Adds a structured pdf parser --- generic_rag/app.py | 8 ++++++++ generic_rag/parsers/parser.py | 10 ++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/generic_rag/app.py b/generic_rag/app.py index 4010184..8e8a7c8 100644 --- a/generic_rag/app.py +++ b/generic_rag/app.py @@ -38,6 +38,14 @@ parser.add_argument( "If a path is a file, only that file will be used. " "If the path is relative it will be relative to the current working directory.", ) +parser.add_argument( + "--unstructured-pdf", + action="store_true", + help="Use an unstructured PDF parser. " + "An unstructured PDF parser might be useful for PDF files " + "that contain a lot of images, tables or text as images. " + "Please use '-r' when switching parsers on already indexed data.", +) parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.") parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.") parser.add_argument( diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py index ac59d4b..eaf885a 100644 --- a/generic_rag/parsers/parser.py +++ b/generic_rag/parsers/parser.py @@ -4,6 +4,7 @@ from pathlib import Path import requests from bs4 import BeautifulSoup, Tag from langchain_chroma import Chroma +from langchain_community.document_loaders import PyMuPDFLoader from langchain_community.vectorstores.utils import filter_complex_metadata from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter from langchain_unstructured import UnstructuredLoader @@ -64,7 +65,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None: def add_pdf_files( - vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, 
add_start_index: bool + vector_store: Chroma, + file_paths: list[Path], + chunk_size: int, + chunk_overlap: int, + add_start_index: bool, + unstructerd: bool, ) -> None: pdf_files = get_all_local_pdf_files(file_paths) @@ -80,7 +86,7 @@ def add_pdf_files( loaded_document = [] for file in new_pdfs: - loader = UnstructuredLoader(file_path=file, strategy="hi_res") + loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file) for document in loader.lazy_load(): loaded_document.append(document)