Spaces:
Runtime error
Runtime error
from dotenv import load_dotenv | |
from langchain import OpenAI | |
from langchain.chains.summarize import load_summarize_chain | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from load import parse_document | |
# Load variables from a local .env file into the process environment.
# NOTE(review): this runs before the client below is built, presumably so the
# OpenAI wrapper can pick up its API key from the environment — confirm.
load_dotenv()

# Directory containing the raw source documents.
DOCUMENT_PATH = "data/raw/cixiidae"

# Module-wide LLM client, constructed with temperature=0.
llm = OpenAI(temperature=0)
def summarize(raw_documents):
    """Summarize a list of parsed documents with a map-reduce LLM chain.

    The input documents are re-split into chunks (``chunk_size=6000``,
    ``chunk_overlap=300``), breaking on blank lines first and on single
    newlines second. A short diagnostic line reporting the chunk count and
    the token count of the first chunk is printed, then the chunks are run
    through a ``map_reduce`` summarize chain backed by the module-level
    ``llm``.

    Args:
        raw_documents: Documents accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The value returned by the summarize chain's ``run`` call.

    Raises:
        ValueError: If splitting produces no chunks, i.e. there is nothing
            to summarize.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=6000, chunk_overlap=300
    )
    docs = text_splitter.split_documents(raw_documents)

    # Fail early with a clear message instead of an IndexError from docs[0]
    # in the diagnostic below, and before any LLM call is made.
    if not docs:
        raise ValueError("No text chunks to summarize: the input documents are empty")

    num_docs = len(docs)
    num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)
    print(
        f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens"
    )

    summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
    return summary_chain.run(docs)
def main():
    """Summarize one hard-coded PDF and save the summary as a text file.

    Reads ``<DOCUMENT_PATH>/<name>.pdf`` via ``parse_document``, passes the
    result to ``summarize``, prints the summary, and writes it to
    ``data/processed/cixiidae/<name>-summary.txt``.
    """
    name = "Fulgoroidea2008-FulgoromorphaSeychellesPreliminaryChecklis-Holzinger-LöckerLöcker"

    # Python f-strings interpolate with "{name}"; the previous "${name}" left a
    # literal "$" in the input path.
    raw_documents = parse_document(f"{DOCUMENT_PATH}/{name}.pdf")
    output = summarize(raw_documents)
    print(output)

    # The output path must be an f-string too, otherwise the file is literally
    # named "${name}-summary.txt". UTF-8 is set explicitly so non-ASCII
    # characters in the summary do not depend on the platform default encoding.
    with open(
        f"data/processed/cixiidae/{name}-summary.txt",
        "w",
        encoding="utf-8",
    ) as f:
        f.write(output)


if __name__ == "__main__":
    main()