From cfbfe5f609833a12416b726100e7bf4ce7582377 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nielson=20Jann=C3=A9?=
Date: Sat, 15 Mar 2025 15:35:29 +0100
Subject: [PATCH] Add pdf/web source and viewer to chainlit

---
 generic_rag/app.py | 56 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/generic_rag/app.py b/generic_rag/app.py
index 76272d8..4010184 100644
--- a/generic_rag/app.py
+++ b/generic_rag/app.py
@@ -11,6 +11,7 @@ from langchain import hub
 from langchain_chroma import Chroma
 from langchain_core.documents import Document
 from langgraph.graph import START, StateGraph
+from langgraph.pregel.io import AddableValuesDict
 from parsers.parser import add_pdf_files, add_urls
 from typing_extensions import List, TypedDict
 
@@ -106,7 +107,60 @@ async def on_chat_start():
 async def on_message(message: cl.Message):
     graph = cl.user_session.get("graph")
     response = graph.invoke({"question": message.content})
-    await cl.Message(content=response).send()
+
+    # Append the consulted sources below the generated answer.
+    answer = response["answer"] + "\n\n"
+
+    pdf_sources = get_pdf_sources(response)
+    web_sources = get_web_sources(response)
+
+    elements = []
+    if pdf_sources:
+        answer += "The following PDF sources were consulted:\n"
+        for source, page_numbers in pdf_sources.items():
+            pages = sorted(page_numbers)
+            # display="side" does not seem to be supported by chainlit for PDFs, so we use "inline" instead
+            elements.append(cl.Pdf(name="pdf", display="inline", path=source, page=pages[0]))
+            answer += f"'{source}' on page(s): {pages}\n"
+
+    if web_sources:
+        answer += f"The following web sources were consulted: {web_sources}\n"
+
+    await cl.Message(content=answer, elements=elements).send()
+
+
+def get_pdf_sources(response: AddableValuesDict) -> dict[str, set[int]]:
+    """Collect the PDF sources referenced by a response, with their page numbers.
+
+    Context documents whose metadata lacks "filetype", "source" or "page_number" are skipped.
+
+    Returns:
+        A mapping from each PDF source to the set of page numbers found for it.
+    """
+    pdf_sources: dict[str, set[int]] = {}
+    for context in response["context"]:
+        metadata = context.metadata
+        if metadata.get("filetype") != "application/pdf":
+            continue
+        if "source" in metadata and "page_number" in metadata:
+            pdf_sources.setdefault(metadata["source"], set()).add(metadata["page_number"])
+    return pdf_sources
+
+
+def get_web_sources(response: AddableValuesDict) -> set:
+    """Collect the web sources referenced by a response.
+
+    Context documents whose metadata lacks "filetype" or "source" are skipped.
+
+    Returns:
+        The set of sources of all context documents with filetype "web".
+    """
+    web_sources = set()
+    for context in response["context"]:
+        metadata = context.metadata
+        if metadata.get("filetype") == "web" and "source" in metadata:
+            web_sources.add(metadata["source"])
+    return web_sources
 
 
 @cl.set_starters