# -*- coding: utf-8 -*-
"""assessment3_Maria_Maraki.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1jm_hI8O4Y0HgNNdWLnkLBIjlzSaGwwBS
"""

###########################################################################################################################################################
# The provided code has undergone minor adjustments from its original source (Colab environment) to ensure its compatibility with the Hugging Face ecosystem.
###########################################################################################################################################################

"""Since the dataset **emails.csv** in the [Enron Email Dataset](https://www.kaggle.com/datasets/wcukierski/enron-email-dataset/code) was too big, I split the original dataset into smaller .csv files and then chose one of the split files: ***emails_subset.csv***

This is the code I used:

```
import os
import pandas as pd
```

```
def split_csv(input_file, output_folder, chunk_size):
    os.makedirs(output_folder, exist_ok=True)
    reader = pd.read_csv(input_file, chunksize=chunk_size)
    for i, chunk in enumerate(reader):
        chunk.to_csv(os.path.join(output_folder, f"output_{i}.csv"), index=False)
```

```
input_file = 'emails.csv'
output_folder = 'split_files'
```

```
target_size = 1000000
chunk_size = 500000  # Start with a reasonable default
total_rows = sum(1 for line in open(input_file))  # Count total number of rows
rows_per_chunk = max(1, total_rows * target_size // os.path.getsize(input_file))
split_csv(input_file, output_folder, rows_per_chunk)
```

P.S. I didn't do this in this notebook, because I'm working in Google Colab and I couldn't upload the original file - due to its size - to my Google Drive.
"""

import email
import os
import warnings

import chromadb
import gradio as gr
import nltk
import numpy as np
import openai
import pandas as pd
import pytesseract
from langchain import OpenAI, VectorDBQA
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

warnings.filterwarnings('ignore')

# Show full frames when inspecting the data (no column/row/width truncation).
pd.set_option('display.max_columns', None,
              'display.max_rows', None,
              'display.max_colwidth', None)

email_data = pd.read_csv('emails_subset.csv')
email_data.head()  # notebook leftover: displays nothing when run as a script

"""# Embeddings of the email dataset stored in a ChromaDB database"""

# Read the key once; the same value is used for the embeddings and the LLM.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

openAI_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


def _extract_body(raw_message):
    """Return the cleaned body text of one raw RFC 822 e-mail string.

    Line breaks and the quoted-reply marker ``> >>> > >`` are stripped from
    the payload. For multipart messages ``get_payload()`` returns a list of
    sub-messages rather than a string, so the string payloads of all leaf
    parts are concatenated instead of calling ``.replace`` on the list.
    """
    parsed = email.message_from_string(raw_message)
    if parsed.is_multipart():
        leaf_payloads = (
            part.get_payload()
            for part in parsed.walk()
            if not part.is_multipart()
        )
        body = "".join(p for p in leaf_payloads if isinstance(p, str))
    else:
        body = parsed.get_payload()
    return body.replace("\n", "").replace("\r", "").replace("> >>> > >", "")


# Rows with a missing message cannot be parsed as e-mail text, so skip them.
content = [_extract_body(raw) for raw in email_data.message.dropna()]


class Document:
    """Minimal document record exposing ``page_content`` and ``metadata``.

    These are the two attributes this script hands to the text splitter.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # A fresh dict per instance avoids sharing one mutable default.
        self.metadata = metadata if metadata is not None else {}


documents = [Document(page_content) for page_content in content]

# Split every e-mail body into chunks of at most 1000 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
final_text = text_splitter.split_documents(documents)

collection = Chroma.from_documents(
    documents=final_text,
    embedding=openAI_embeddings)

"""# Fine-tuning a Language Model on the Dataset

The fine-tuning task kept crashing my notebook and I had to restart, so I stored it in a different notebook.
"""

"""# Gradio Interface that answers questions related to the case"""

email_data_retrieval = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=OPENAI_API_KEY,
               temperature=0.6,
               top_p=0.5,
               max_tokens=500),
    chain_type='stuff',
    retriever=collection.as_retriever())


def qa_retrieval(question):
    """Run *question* through the retrieval-QA chain and return its answer."""
    answer = email_data_retrieval.run(question)
    return answer


# The module is imported as ``gr``; the bare name ``gradio`` is never bound,
# so ``gradio.Interface`` would raise NameError.
iface = gr.Interface(
    fn=qa_retrieval,
    inputs=gr.Textbox(label="Write your question regarding the Enron Case here:"),
    outputs=gr.Textbox(label="Answer of the question:"),
    title="QA Retrieval - Case Specific: Enron Email Dataset"
)

iface.launch()