import os from langchain.chains.llm import LLMChain from langchain.chat_models import ChatOpenAI from langchain.prompts import PromptTemplate from langchain.document_loaders import PDFPlumberLoader from langchain.text_splitter import CharacterTextSplitter from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain from langchain.chains.combine_documents.stuff import StuffDocumentsChain os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY") llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview") def get_summ(path): loader = PDFPlumberLoader(path) docs = loader.load() # Map map_template = """The following is a set of documents {docs} Based on this list of docs, please identify the main themes and determine the genes relevant or irrelevant to the discussed disease followed by any associated p-values if available. Helpful Answer:""" map_prompt = PromptTemplate.from_template(map_template) map_chain = LLMChain(llm=llm, prompt=map_prompt) # Reduce reduce_template = """The following is set of summaries: {doc_summaries} Take these and distill it into a final, consolidated summary of the main themes. Helpful Answer:""" reduce_prompt = PromptTemplate.from_template(reduce_template) # Run chain reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt) # Takes a list of documents, combines them into a single string, and passes this to an LLMChain combine_documents_chain = StuffDocumentsChain( llm_chain=reduce_chain, document_variable_name="doc_summaries" ) # Combines and iteravely reduces the mapped documents reduce_documents_chain = ReduceDocumentsChain( # This is final chain that is called. combine_documents_chain=combine_documents_chain, # If documents exceed context for `StuffDocumentsChain` collapse_documents_chain=combine_documents_chain, # The maximum number of tokens to group documents into. token_max=100000, ) # Combining documents by mapping a chain over them, then combining results map_reduce_chain = MapReduceDocumentsChain( # Map chain llm_chain=map_chain, # Reduce chain reduce_documents_chain=reduce_documents_chain, # The variable name in the llm_chain to put the documents in document_variable_name="docs", # Return the results of the map steps in the output return_intermediate_steps=False, ) text_splitter = CharacterTextSplitter.from_tiktoken_encoder( chunk_size=100000, chunk_overlap=0 ) split_docs = text_splitter.split_documents(docs) return map_reduce_chain.run(split_docs)