# enron / app.py
# micknikolic's picture
# Update app.py
# d5f344a verified
# raw
# history blame contribute delete
# No virus
# 2.86 kB
# https://huggingface.co/spaces/micknikolic/enron
# here are the imports

# Standard library.
import io
import os
import re
import warnings

# Third-party.
import gradio
import nltk
import numpy as np
import pandas as pd
import pytesseract

# LangChain components for loading, splitting, embedding, and retrieval QA.
from langchain import OpenAI, VectorDBQA
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Show full dataframes when inspecting data (pairs of option-name, value).
pd.set_option('display.max_columns', None,
              'display.max_rows', None,
              'display.max_colwidth', None
              )

# Silence noisy library warnings in the app logs.
warnings.filterwarnings('ignore')
# here is the code
# Data loading.
# Only a subset of the Enron dataset is used; working with the full 500k+
# messages locally would be computationally very expensive.
data = pd.read_csv('subset_enron.csv', encoding='utf-8')
# Keep a seeded 1% random sample for reproducibility -> roughly (5174, 2).
data = data.sample(frac=0.01, random_state=12)

# Text pre-processing: strip literal "\n" / "\\n" escape artifacts
# left over in the message bodies.
_escaped_newline = re.compile(r'\\{1,2}n')
cleaned_message = data["message"].apply(lambda msg: _escaped_newline.sub('', msg))
content = cleaned_message.tolist()
class Document:
    """Minimal document wrapper: raw text plus an optional metadata dict."""

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Avoid a shared mutable default: each instance gets its own dict.
        self.metadata = {} if metadata is None else metadata
# Wrap each cleaned email body in a Document, chunk into <=1000-character
# pieces, embed with OpenAI, and index everything in a Chroma vector store.
docs = [Document(body) for body in content]
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_documents(docs)
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("openai_api_key"))
vStore = Chroma.from_documents(documents=chunks, embedding=embeddings)
# Retrieval QA: a "stuff" chain that feeds retrieved chunks to an OpenAI LLM.
_llm = OpenAI(
    openai_api_key=os.getenv("openai_api_key"),
    temperature=0.2,
    top_p=0.2,
    max_tokens=2000,
)
model_retrieval = RetrievalQA.from_chain_type(
    llm=_llm,
    chain_type="stuff",
    retriever=vStore.as_retriever(),
)
# Building Gradio based app. The Retrieval model.
def get_answer(question):
    """Answer a question using retrieval QA over the Enron email subset.

    Args:
        question (string): end-user's input.

    Returns:
        The model's answer based on the enron emails dataset.
    """
    return model_retrieval.run(question)
# Wire the QA function into a simple one-box Gradio UI and start serving.
iface = gradio.Interface(
    fn=get_answer,
    inputs=gradio.Textbox(label="Enter your question here"),
    outputs=[gradio.Textbox(label="Answer")],
    title="Retrieval QA for the subset of the Enron dataset",
    examples=[
        "Who are the receivers of the emails from this corpus of emails?",
        "What's at the center of these emails?",
    ],
)
iface.launch()