From e79d7b9867927391fd36ea8ffef5c5138b2f5180 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?= <nielsonj@gmail.com>
Date: Mon, 17 Mar 2025 11:57:10 +0100
Subject: [PATCH] Adds a structured pdf parser

---
 generic_rag/app.py            |  8 ++++++++
 generic_rag/parsers/parser.py | 10 ++++++++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/generic_rag/app.py b/generic_rag/app.py
index 4010184..8e8a7c8 100644
--- a/generic_rag/app.py
+++ b/generic_rag/app.py
@@ -38,6 +38,14 @@ parser.add_argument(
     "If a path is a file, only that file will be used. "
     "If the path is relative it will be relative to the current working directory.",
 )
+parser.add_argument(
+    "--unstructured-pdf",
+    action="store_true",
+    help="Use an unstructured PDF parser. "
+    "An unstructured PDF parser might be useful for PDF files "
+    "that contain a lot of images, tables or text as images. "
+    "Please use '-r' when switching parsers on already indexed data.",
+)
 parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.")
 parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.")
 parser.add_argument(
diff --git a/generic_rag/parsers/parser.py b/generic_rag/parsers/parser.py
index ac59d4b..eaf885a 100644
--- a/generic_rag/parsers/parser.py
+++ b/generic_rag/parsers/parser.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import requests
 from bs4 import BeautifulSoup, Tag
 from langchain_chroma import Chroma
+from langchain_community.document_loaders import PyMuPDFLoader
 from langchain_community.vectorstores.utils import filter_complex_metadata
 from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
 from langchain_unstructured import UnstructuredLoader
@@ -64,7 +65,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
 
 
 def add_pdf_files(
-    vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
+    vector_store: Chroma,
+    file_paths: list[Path],
+    chunk_size: int,
+    chunk_overlap: int,
+    add_start_index: bool,
+    unstructerd: bool,
 ) -> None:
     pdf_files = get_all_local_pdf_files(file_paths)
 
@@ -80,7 +86,7 @@ def add_pdf_files(
 
     loaded_document = []
     for file in new_pdfs:
-        loader = UnstructuredLoader(file_path=file, strategy="hi_res")
+        loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
         for document in loader.lazy_load():
             loaded_document.append(document)