Fix pdf source retrieval information

2025-03-28 15:08:32 +01:00 · 2025-03-28 15:08:32 +01:00 · cd14c8add2
commit cd14c8add2
parent d1e9b3d8cf
1 changed files with 29 additions and 13 deletions
--- a/generic_rag/graphs/ret_gen.py
+++ b/generic_rag/graphs/ret_gen.py
@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Any, Union

 from langchain import hub
@ -53,15 +54,29 @@ class RetGenLangGraph:
        pdf_sources = {}
        for context in self.last_invoke["context"]:
            try:
-                if context.metadata["filetype"] == "application/pdf":
-                    source = context.metadata["source"]
-                    page_number = context.metadata["page_number"]
-                    if source in pdf_sources:
-                        pdf_sources[source].add(page_number)
-                    else:
-                        pdf_sources[source] = {page_number}
+                Path(context.metadata["source"]).suffix == ".pdf"
+            except KeyError:
+                continue
+            else:
+                source = context.metadata["source"]
+
+            if source not in pdf_sources:
+                pdf_sources[source] = set()
+
+            # The page number may be stored under either the `page_number` or the `page` metadata key.
+            try:
+                page_number = context.metadata["page_number"]
            except KeyError:
                pass
+            else:
+                pdf_sources[source].add(page_number)
+
+            try:
+                page_number = context.metadata["page"]
+            except KeyError:
+                pass
+            else:
+                pdf_sources[source].add(page_number)

        return pdf_sources

@ -75,9 +90,10 @@ class RetGenLangGraph:
        web_sources = set()
        for context in self.last_invoke["context"]:
            try:
-                if context.metadata["filetype"] == "web":
-                    web_sources.add(context.metadata["source"])
+                context.metadata["filetype"] == "web"
            except KeyError:
-                pass
+                continue
+            else:
+                web_sources.add(context.metadata["source"])

        return web_sources