from dotenv import load_dotenv
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

from load import parse_document

# Load API keys (e.g. OPENAI_API_KEY) from the local .env file.
load_dotenv()

# Directory containing the raw PDFs to summarize.
DOCUMENT_PATH = "data/raw/cixiidae"
# Directory where generated summaries are written.
OUTPUT_PATH = "data/processed/cixiidae"

# temperature=0 for deterministic, reproducible summaries.
llm = OpenAI(temperature=0)


def summarize(raw_documents):
    """Summarize parsed documents with a map-reduce summarization chain.

    Args:
        raw_documents: LangChain ``Document`` objects, e.g. as returned by
            ``parse_document``.

    Returns:
        The combined summary text produced by the chain.
    """
    # Split on paragraph/newline boundaries; the 300-char overlap preserves
    # context across chunk edges.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=6000, chunk_overlap=300
    )
    docs = text_splitter.split_documents(raw_documents)
    num_docs = len(docs)
    num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)
    print(
        f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens"
    )
    summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    return summary_chain.run(docs)


def main():
    """Parse one PDF, summarize it, and write the summary to disk."""
    name = "Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker"
    # BUG FIX: the original used JS-style "${name}" interpolation, which in
    # an f-string left a literal "$" in the input path — and the output path
    # was not an f-string at all, so files were written to a path containing
    # the literal text "${name}".
    raw_documents = parse_document(f"{DOCUMENT_PATH}/{name}.pdf")
    output = summarize(raw_documents)
    print(output)
    # Explicit UTF-8: the filename and summary may contain non-ASCII text
    # (e.g. "Löcker"), which could fail under a platform-default encoding.
    with open(f"{OUTPUT_PATH}/{name}-summary.txt", "w", encoding="utf-8") as f:
        f.write(output)


if __name__ == "__main__":
    main()