forked from AI_team/Philosophy-RAG-demo
Adds a structured pdf parser
This commit is contained in:
parent
cfbfe5f609
commit
e79d7b9867
@ -38,6 +38,14 @@ parser.add_argument(
|
||||
"If a path is a file, only that file will be used. "
|
||||
"If the path is relative it will be relative to the current working directory.",
|
||||
)
|
||||
# Boolean opt-in switch for the unstructured PDF parser; `store_true` means the
# value is False unless the flag is passed on the command line.
parser.add_argument(
    "--unstructured-pdf",
    action="store_true",
    help="Use an unstructured PDF parser. "
    "An unstructured PDF parser might be useful for PDF files "
    "that contain a lot of images, tables or text as images. "
    "Please use '-r' when switching parsers on already indexed data.",
)
|
||||
# PDF chunking options. The original mixed hyphen/underscore spellings are kept
# as the FIRST option string so argparse still derives the same attribute names
# (`pdf_chunk_size`, `pdf_chunk_overlap`); the fully hyphenated spellings are
# added as aliases so the conventional form is accepted as well.
parser.add_argument(
    "--pdf-chunk_size",
    "--pdf-chunk-size",
    type=int,
    default=1000,
    help="The size of the chunks to split the text into.",
)
parser.add_argument(
    "--pdf-chunk_overlap",
    "--pdf-chunk-overlap",
    type=int,
    default=200,
    help="The overlap between the chunks.",
)
|
||||
parser.add_argument(
|
||||
|
||||
@ -4,6 +4,7 @@ from pathlib import Path
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
from langchain_chroma import Chroma
|
||||
from langchain_community.document_loaders import PyMuPDFLoader
|
||||
from langchain_community.vectorstores.utils import filter_complex_metadata
|
||||
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
@ -64,7 +65,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
|
||||
|
||||
|
||||
def add_pdf_files(
|
||||
vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
|
||||
vector_store: Chroma,
|
||||
file_paths: list[Path],
|
||||
chunk_size: int,
|
||||
chunk_overlap: int,
|
||||
add_start_index: bool,
|
||||
unstructerd: bool,
|
||||
) -> None:
|
||||
pdf_files = get_all_local_pdf_files(file_paths)
|
||||
|
||||
@ -80,7 +86,7 @@ def add_pdf_files(
|
||||
|
||||
loaded_document = []
|
||||
for file in new_pdfs:
|
||||
loader = UnstructuredLoader(file_path=file, strategy="hi_res")
|
||||
loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
|
||||
for document in loader.lazy_load():
|
||||
loaded_document.append(document)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user