Adds a structured pdf parser

2025-03-17 11:57:10 +01:00 · 2025-03-17 11:57:10 +01:00 · e79d7b9867
commit e79d7b9867
parent cfbfe5f609
2 changed files with 16 additions and 2 deletions
--- a/generic_rag/app.py
+++ b/generic_rag/app.py
@@ -38,6 +38,14 @@ parser.add_argument(
    "If a path is a file, only that file will be used. "
    "If the path is relative it will be relative to the current working directory.",
 )
+parser.add_argument(
+    "--unstructured-pdf",
+    action="store_true",
+    help="Use an unstructured PDF parser. "
+    "An unstructured PDF parser might be useful for PDF files "
+    "that contain a lot of images, tables or text as images. "
+    "Please use '-r' when switching parsers on already indexed data.",
+)
 parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.")
 parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.")
 parser.add_argument(
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
+from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
@@ -64,7 +65,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:


 def add_pdf_files(
-    vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructerd: bool,
 ) -> None:
    pdf_files = get_all_local_pdf_files(file_paths)

@@ -80,7 +86,7 @@ def add_pdf_files(

    loaded_document = []
    for file in new_pdfs:
-        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
+        loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
        for document in loader.lazy_load():
            loaded_document.append(document)