Adds a structured pdf parser

This commit is contained in:
Nielson Janné 2025-03-17 11:57:10 +01:00
parent cfbfe5f609
commit e79d7b9867
2 changed files with 16 additions and 2 deletions

View File

@ -38,6 +38,14 @@ parser.add_argument(
"If a path is a file, only that file will be used. " "If a path is a file, only that file will be used. "
"If the path is relative it will be relative to the current working directory.", "If the path is relative it will be relative to the current working directory.",
) )
# Opt-in switch for PDF parsing. In this change set, the ingestion code picks
# UnstructuredLoader(strategy="hi_res") when its flag is true and PyMuPDFLoader
# otherwise; presumably this CLI option is what feeds that flag (the wiring is
# not visible here -- confirm at the call site).
parser.add_argument(
    "--unstructured-pdf",
    action="store_true",  # boolean flag: absent -> False, present -> True
    help="Use an unstructured PDF parser. "
    "An unstructured PDF parser might be useful for PDF files "
    "that contain a lot of images, tables or text as images. "
    "Please use '-r' when switching parsers on already indexed data.",
)
parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.") parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.")
parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.") parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.")
parser.add_argument( parser.add_argument(

View File

@ -4,6 +4,7 @@ from pathlib import Path
import requests import requests
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from langchain_chroma import Chroma from langchain_chroma import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores.utils import filter_complex_metadata from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
from langchain_unstructured import UnstructuredLoader from langchain_unstructured import UnstructuredLoader
@ -64,7 +65,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
def add_pdf_files( def add_pdf_files(
vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool vector_store: Chroma,
file_paths: list[Path],
chunk_size: int,
chunk_overlap: int,
add_start_index: bool,
unstructerd: bool,
) -> None: ) -> None:
pdf_files = get_all_local_pdf_files(file_paths) pdf_files = get_all_local_pdf_files(file_paths)
@ -80,7 +86,7 @@ def add_pdf_files(
loaded_document = [] loaded_document = []
for file in new_pdfs: for file in new_pdfs:
loader = UnstructuredLoader(file_path=file, strategy="hi_res") loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
for document in loader.lazy_load(): for document in loader.lazy_load():
loaded_document.append(document) loaded_document.append(document)