Spaces:
Runtime error
Runtime error
# -*- coding: utf-8 -*- | |
"""assessment3_Maria_Maraki.ipynb | |
Automatically generated by Colab. | |
Original file is located at | |
https://colab.research.google.com/drive/1jm_hI8O4Y0HgNNdWLnkLBIjlzSaGwwBS | |
""" | |
########################################################################################################################################################### | |
# The provided code has undergone minor adjustments from its original source (Colab environment) to ensure its compatibility with the Hugging Face ecosystem.
########################################################################################################################################################### | |
"""Since the dataset **emails.csv** in the [Enron Email Dataset](https://www.kaggle.com/datasets/wcukierski/enron-email-dataset/code) was too big, I split the original dataset into smaller .csv files and then chose one of the split files: ***emails_subset.csv*** | |
This is the code I used: | |
``` | |
import os | |
import pandas as pd | |
``` | |
``` | |
def split_csv(input_file, output_folder, chunk_size): | |
os.makedirs(output_folder, exist_ok=True) | |
reader = pd.read_csv(input_file, chunksize=chunk_size) | |
for i, chunk in enumerate(reader): | |
chunk.to_csv(os.path.join(output_folder, f"output_{i}.csv"), index=False) | |
``` | |
``` | |
input_file = 'emails.csv' | |
output_folder = 'split_files' | |
``` | |
``` | |
target_size = 1000000 | |
chunk_size = 500000 # Start with a reasonable default | |
total_rows = sum(1 for line in open(input_file)) # Count total number of rows | |
rows_per_chunk = max(1, total_rows * target_size // os.path.getsize(input_file)) | |
split_csv(input_file, output_folder, rows_per_chunk) | |
``` | |
P.S. I did not run this step in this notebook, because I am working in Google Colab and could not upload the original file to my Google Drive due to its size.
""" | |
import pandas as pd | |
# Set each pandas display limit (columns, rows, column width) to None.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Load the chosen chunk of the split e-mail dataset.
email_data = pd.read_csv('emails_subset.csv')
# Notebook leftover: the preview is computed but its result is discarded in a script.
email_data.head()
"""# Embeddings of the email dataset stored in a ChromaDB database""" | |
import email | |
import openai | |
import os | |
import numpy as np | |
import chromadb | |
import nltk | |
import pytesseract | |
import gradio as gr | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain.vectorstores import Chroma | |
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter | |
from langchain.chains import RetrievalQA | |
from langchain import OpenAI, VectorDBQA | |
from langchain.document_loaders import DirectoryLoader | |
import warnings | |
warnings.filterwarnings('ignore') | |
# Embedding client; the API key is read from the OPENAI_API_KEY environment variable.
openAI_embeddings = OpenAIEmbeddings(openai_api_key=os.environ.get('OPENAI_API_KEY'))

# Collect the body text of every message in the `message` column, stripped of
# line breaks and of the "> >>> > >" quoted-reply marker.
content = []
for item in email_data.message:
    text = email.message_from_string(item)
    if text.is_multipart():
        # For multipart mail get_payload() returns a list of sub-messages, not a
        # string, so chaining .replace() on it would raise AttributeError.
        # Join the text leaf parts instead.
        message = "".join(
            part.get_payload()
            for part in text.walk()
            if not part.is_multipart() and part.get_content_maintype() == 'text'
        )
    else:
        message = text.get_payload()
    cleaned_message = message.replace("\n","").replace("\r","").replace("> >>> > >","")
    content.append(cleaned_message)
class Document:
    """Minimal document record: a text body plus a metadata mapping."""

    def __init__(self, page_content, metadata=None):
        """Store the text and its metadata.

        Args:
            page_content: The document text.
            metadata: Optional metadata object; when it is ``None`` a new empty
                dict is created for this instance.
        """
        if metadata is None:
            metadata = {}
        self.metadata = metadata
        self.page_content = page_content
# Wrap every cleaned body in a Document (with empty metadata).
documents = list(map(Document, content))

# Split the documents with chunk_size=1000 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
final_text = text_splitter.split_documents(documents)

# Build the Chroma collection from the chunks using the OpenAI embedding client.
collection = Chroma.from_documents(
    documents=final_text,
    embedding=openAI_embeddings,
)
"""# Fine-tuning a Language Model on the Dataset | |
The fine-tuning task kept crashing my notebook and forcing me to restart, so I moved it into a separate notebook.
""" | |
"""# Gradio Interface that answers questions related to the case""" | |
# Question-answering chain: an OpenAI LLM combined with a retriever over the
# Chroma collection, using the 'stuff' chain type.
email_data_retrieval = RetrievalQA.from_chain_type(
    llm=OpenAI(
        openai_api_key=os.environ.get('OPENAI_API_KEY'),
        max_tokens=500,
        top_p=0.5,
        temperature=0.6,
    ),
    chain_type='stuff',
    retriever=collection.as_retriever(),
)
def qa_retrieval(question):
    """Run ``question`` through the retrieval QA chain and return its answer."""
    return email_data_retrieval.run(question)
# Web UI: one question textbox in, one answer textbox out, served by qa_retrieval.
# The library is imported as `gr`; the bare name `gradio` is never bound, so
# `gradio.Interface` raised NameError before the app could start.
iface = gr.Interface(
    fn=qa_retrieval,
    inputs=gr.Textbox(label="Write your question regarding the Enron Case here:"),
    outputs=gr.Textbox(label="Answer of the question:"),
    title="QA Retrieval - Case Specific: Enron Email Dataset"
)
iface.launch()