Spaces:

KalbeDigitalLab
/

NutriGenMePE

Sleeping

App Files Files Community

NutriGenMePE / summ.py

adhisetiawan

Update summ.py

bb6f5d9 verified 4 months ago

raw history blame

No virus

2.7 kB

	import os
	from langchain.chains.llm import LLMChain
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import PromptTemplate
	from langchain.document_loaders import PDFPlumberLoader
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
	from langchain.chains.combine_documents.stuff import StuffDocumentsChain

	os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")

	llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview")

	def get_summ(path):

	loader = PDFPlumberLoader(path)
	docs = loader.load()
	# Map
	map_template = """The following is a set of documents
	{docs}
	Based on this list of docs, please identify the main themes and determine the genes relevant or irrelevant to the discussed disease followed by any associated p-values if available.
	Helpful Answer:"""
	map_prompt = PromptTemplate.from_template(map_template)
	map_chain = LLMChain(llm=llm, prompt=map_prompt)

	# Reduce
	reduce_template = """The following is set of summaries:
	{doc_summaries}
	Take these and distill it into a final, consolidated summary of the main themes.
	Helpful Answer:"""
	reduce_prompt = PromptTemplate.from_template(reduce_template)

	# Run chain
	reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

	# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
	combine_documents_chain = StuffDocumentsChain(
	llm_chain=reduce_chain, document_variable_name="doc_summaries"
	)

	# Combines and iteravely reduces the mapped documents
	reduce_documents_chain = ReduceDocumentsChain(
	# This is final chain that is called.
	combine_documents_chain=combine_documents_chain,
	# If documents exceed context for `StuffDocumentsChain`
	collapse_documents_chain=combine_documents_chain,
	# The maximum number of tokens to group documents into.
	token_max=100000,
	)

	# Combining documents by mapping a chain over them, then combining results
	map_reduce_chain = MapReduceDocumentsChain(
	# Map chain
	llm_chain=map_chain,
	# Reduce chain
	reduce_documents_chain=reduce_documents_chain,
	# The variable name in the llm_chain to put the documents in
	document_variable_name="docs",
	# Return the results of the map steps in the output
	return_intermediate_steps=False,
	)

	text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
	chunk_size=100000, chunk_overlap=0
	)
	split_docs = text_splitter.split_documents(docs)

	return map_reduce_chain.run(split_docs)