Fix pdf source retrieval information

This commit is contained in:
Nielson Janné 2025-03-28 15:08:32 +01:00
parent d1e9b3d8cf
commit cd14c8add2

View File

@ -1,3 +1,4 @@
from pathlib import Path
from typing import Any, Union
from langchain import hub
@ -53,15 +54,29 @@ class RetGenLangGraph:
pdf_sources = {}
for context in self.last_invoke["context"]:
try:
if context.metadata["filetype"] == "application/pdf":
source = context.metadata["source"]
page_number = context.metadata["page_number"]
if source in pdf_sources:
pdf_sources[source].add(page_number)
else:
pdf_sources[source] = {page_number}
Path(context.metadata["source"]).suffix == ".pdf"
except KeyError:
continue
else:
source = context.metadata["source"]
if source not in pdf_sources:
pdf_sources[source] = set()
# The page numbers are in the `page_number` and `page` fields.
try:
page_number = context.metadata["page_number"]
except KeyError:
pass
else:
pdf_sources[source].add(page_number)
try:
page_number = context.metadata["page"]
except KeyError:
pass
else:
pdf_sources[source].add(page_number)
return pdf_sources
@ -75,9 +90,10 @@ class RetGenLangGraph:
web_sources = set()
for context in self.last_invoke["context"]:
try:
if context.metadata["filetype"] == "web":
web_sources.add(context.metadata["source"])
context.metadata["filetype"] == "web"
except KeyError:
pass
continue
else:
web_sources.add(context.metadata["source"])
return web_sources