# This is to demonstrate the core logic for the project # 1. Get the link to PDF # 2. Read the content of the PDF # 3. Iterate: # 3.1 Create a chunk (set of pages) # 3.2 Create summary by combining partial summary & chunk ### 1. Import the libraries import streamlit as st import time import os from dotenv import load_dotenv from langchain.prompts import PromptTemplate # from langchain_community.llms import HuggingFaceHub from langchain_community.llms import HuggingFaceEndpoint from langchain_community.document_loaders import PyPDFLoader # This is to simplify local development # Without this you will need to copy/paste the API key with every change try: # CHANGE the location of the file load_dotenv('C:\\Users\\raj\\.jupyter\\.env1') # Add the API key to the session - use it for populating the interface if os.getenv('HUGGINGFACEHUB_API_TOKEN'): st.session_state['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN') else: st.session_state['HUGGINGFACEHUB_API_TOKEN'] = '' except: print("Environment file not found !! Copy & paste your HuggingFace API key.") # Prompt to be used template = """ extend the abstractive summary below with the new content. Keep total size of the extended summary around 3000 words. summary: {summary} new content: {content} extended summary: """ prompt_template = PromptTemplate( input_variables = ['summary', 'content'], template = template ) # Model for summarization model_id = 'mistralai/Mistral-7B-Instruct-v0.2' CONTEXT_WINDOW_SIZE=32000 MAX_TOKENS=2000 if 'SUMMARY' not in st.session_state: st.session_state['SUMMARY'] = '' if 'HUGGINGFACEHUB_API_TOKEN' not in st.session_state: st.session_state['HUGGINGFACEHUB_API_TOKEN'] = '' # function to generate the summary def generate_summary(): # Create an LLM llm = HuggingFaceEndpoint( repo_id=model_id, max_new_tokens=MAX_TOKENS, huggingfacehub_api_token = hugging_face_api_key ) # Show spinner, while we are waiting for the response with st.spinner('Invoking LLM ... '): # 1. Load the PDF file partial_summary = '' loader = PyPDFLoader(pdf_link) pages = loader.load() page_count = len(pages) print("Number of pages = ", page_count) # 2. Iterate to generate the summary next_page_index = 0 while next_page_index < len(pages): 'Processing chunk, starting with page index : ',next_page_index # Holds the chunk = a set of contenated pages new_content = '' # Loop to create chunk for i, doc in enumerate(pages[next_page_index : ]): last_i = i if len(partial_summary) + len(new_content) + len(doc.page_content) + MAX_TOKENS < CONTEXT_WINDOW_SIZE : new_content = new_content + doc.page_content else: break # Initialize the new content and next page index next_page_index = next_page_index + last_i + 1 # Pass the current summary and new content to LLM for summarization query = prompt_template.format(summary=partial_summary, content=new_content) partial_summary = llm.invoke(query) st.session_state['SUMMARY'] = partial_summary # Title st.title('PDF Summarizer') if 'HUGGINGFACEHUB_API_TOKEN' in st.session_state: hugging_face_api_key = st.sidebar.text_input('HuggingFace API key',value=st.session_state['HUGGINGFACEHUB_API_TOKEN']) else: hugging_face_api_key = st.sidebar.text_input('HuggingFace API key',placeholder='copy & paste your API key') # draw the box for query pdf_link = st.text_input('Link to PDF document', placeholder='copy/paste link to the PDF', value='https://sgp.fas.org/crs/misc/R47644.pdf') # button st.button("Generate sumary", on_click=generate_summary) st.text_area('Response', value = st.session_state['SUMMARY'], height=800)