# https://huggingface.co/spaces/micknikolic/enron
"""Retrieval-QA Gradio app over a subset of the Enron e-mail dataset.

Pipeline: load a CSV sample of Enron messages, strip escaped newlines,
chunk the texts, embed them with OpenAI embeddings into a Chroma vector
store, and expose a RetrievalQA chain through a Gradio text interface.
"""

# here are the imports
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain import OpenAI, VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import re
import io
import warnings

import nltk
import pytesseract
import pandas as pd
import numpy as np
import gradio

# Show full frames when debugging interactively.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', None,
)
warnings.filterwarnings('ignore')

# Read the API key once; both the embedder and the LLM use the same key.
# NOTE(review): env var name is lowercase "openai_api_key" — confirm it is
# set that way in the Space's secrets.
OPENAI_API_KEY = os.getenv("openai_api_key")

# here is the code

# Data loading.
# I am using a subset of the Enron dataset, as it would be computationally
# very expensive to work with over 500k observations locally.
data = pd.read_csv('subset_enron.csv', encoding='utf-8')
data = data.sample(frac=0.01, random_state=12)  # (5174, 2)

# Text pre-processing: remove literal "\n" / "\\n" escape sequences left
# over from serialization. Compile the pattern once instead of re-resolving
# it for every row.
_ESCAPED_NEWLINE = re.compile(r'\\{1,2}n')
cleaned_message = data["message"].apply(lambda x: _ESCAPED_NEWLINE.sub('', x))
content = cleaned_message.tolist()


class Document:
    """Minimal stand-in for langchain's Document.

    Carries the raw text (`page_content`) plus an optional metadata dict,
    which is all the text splitter and vector store need.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Fresh dict per instance — never share a mutable default.
        self.metadata = metadata if metadata is not None else {}


documents = [Document(page_content) for page_content in content]

# Chunk each e-mail into <=1000-character pieces with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks and index them in an in-memory Chroma store.
openAI_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vStore = Chroma.from_documents(documents=texts, embedding=openAI_embeddings)

# Retrieval QA: "stuff" chain simply concatenates retrieved chunks into
# the prompt. Low temperature/top_p keep answers close to the sources.
model_retrieval = RetrievalQA.from_chain_type(
    llm=OpenAI(
        openai_api_key=OPENAI_API_KEY,
        temperature=0.2,
        top_p=0.2,
        max_tokens=2000,
    ),
    chain_type="stuff",
    retriever=vStore.as_retriever(),
)


# Building Gradio based app. The Retrieval model.
def get_answer(question):
    """Return the model's answer to *question*.

    Args:
        question (str): end-user's input.

    Returns:
        str: the RetrievalQA chain's answer grounded in the Enron
        e-mails corpus.
    """
    response = model_retrieval.run(question)
    return response


iface = gradio.Interface(
    fn=get_answer,
    inputs=gradio.Textbox(label="Enter your question here"),
    outputs=[gradio.Textbox(label="Answer")],
    title="Retrieval QA for the subset of the Enron dataset",
    examples=[
        "Who are the receivers of the emails from this corpus of emails?",
        "What's at the center of these emails?",
    ],
)

iface.launch()