forked from AI_team/Philosophy-RAG-demo
Adds a structured pdf parser
This commit is contained in:
parent
cfbfe5f609
commit
e79d7b9867
@ -38,6 +38,14 @@ parser.add_argument(
|
|||||||
"If a path is a file, only that file will be used. "
|
"If a path is a file, only that file will be used. "
|
||||||
"If the path is relative it will be relative to the current working directory.",
|
"If the path is relative it will be relative to the current working directory.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--unstructured-pdf",
|
||||||
|
action="store_true",
|
||||||
|
help="Use an unstructered PDF parser. "
|
||||||
|
"An unstructured PDF parser might be usefull for PDF files "
|
||||||
|
"that contain a lot of images, tables or text as images. "
|
||||||
|
"Please use '-r' when switching parsers on already indexed data.",
|
||||||
|
)
|
||||||
parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.")
|
parser.add_argument("--pdf-chunk_size", type=int, default=1000, help="The size of the chunks to split the text into.")
|
||||||
parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.")
|
parser.add_argument("--pdf-chunk_overlap", type=int, default=200, help="The overlap between the chunks.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
@ -4,6 +4,7 @@ from pathlib import Path
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
from langchain_chroma import Chroma
|
from langchain_chroma import Chroma
|
||||||
|
from langchain_community.document_loaders import PyMuPDFLoader
|
||||||
from langchain_community.vectorstores.utils import filter_complex_metadata
|
from langchain_community.vectorstores.utils import filter_complex_metadata
|
||||||
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
|
from langchain_text_splitters import HTMLSemanticPreservingSplitter, RecursiveCharacterTextSplitter
|
||||||
from langchain_unstructured import UnstructuredLoader
|
from langchain_unstructured import UnstructuredLoader
|
||||||
@ -64,7 +65,12 @@ def add_urls(vector_store: Chroma, urls: list[str], chunk_size: int) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def add_pdf_files(
|
def add_pdf_files(
|
||||||
vector_store: Chroma, file_paths: list[Path], chunk_size: int, chunk_overlap: int, add_start_index: bool
|
vector_store: Chroma,
|
||||||
|
file_paths: list[Path],
|
||||||
|
chunk_size: int,
|
||||||
|
chunk_overlap: int,
|
||||||
|
add_start_index: bool,
|
||||||
|
unstructerd: bool,
|
||||||
) -> None:
|
) -> None:
|
||||||
pdf_files = get_all_local_pdf_files(file_paths)
|
pdf_files = get_all_local_pdf_files(file_paths)
|
||||||
|
|
||||||
@ -80,7 +86,7 @@ def add_pdf_files(
|
|||||||
|
|
||||||
loaded_document = []
|
loaded_document = []
|
||||||
for file in new_pdfs:
|
for file in new_pdfs:
|
||||||
loader = UnstructuredLoader(file_path=file, strategy="hi_res")
|
loader = UnstructuredLoader(file_path=file, strategy="hi_res") if unstructerd else PyMuPDFLoader(file_path=file)
|
||||||
for document in loader.lazy_load():
|
for document in loader.lazy_load():
|
||||||
loaded_document.append(document)
|
loaded_document.append(document)
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user