# -*- coding: utf-8 -*-
"""Copy of assessment3_Elina_Hemink.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1xhBZL_ztniX37QTt8SK_mV7nZKO_UrwW

## Create embeddings of the email dataset and store in a chromadb database
"""

import chromadb
from chromadb.utils import embedding_functions
import pandas as pd
import email
from sklearn.model_selection import train_test_split

# Loading email.csv dataset
emails = pd.read_csv('emails.csv')
print(emails.head())

# Parse each raw RFC-822 message and keep only its payload (body text),
# stripping newline artifacts and quoted-reply markers.
content_text = []
for item in emails.message:
    text = email.message_from_string(item)
    message = text.get_payload()
    cleaned_message = message.replace("\n", "").replace("\r", "").replace("> >>> > >", "")
    content_text.append(cleaned_message)

# Taking a sample of the dataset (~0.1%) — the full corpus is too large
# to embed in a reasonable time.
train, test = train_test_split(content_text, train_size=0.001)

# IDs for the ChromaDB collection: 'id1', 'id2', ...
# (comprehension instead of an append loop; avoids shadowing builtin `id`)
ids = ['id' + str(i + 1) for i in range(len(train))]

# Creating collection
client = chromadb.Client()
collection = client.create_collection(name="Enron_emails")
collection.add(
    documents=train,
    ids=ids,
)

"""## Fine-tune a Language Model on the Dataset"""

from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Load pre-trained GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# GPT-2 ships without a padding token; register one and resize the model's
# embedding matrix so the new token id is covered (BUG FIX: the original
# added the token but never resized the embeddings).
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# BUG FIX: TextDataset tokenizes the file contents itself, so the file must
# contain the RAW email text. The original wrote space-separated integer
# token IDs here, which trained the model on strings of digits instead of
# English — one email per line, same artifact filename kept.
with open('tokenized_emails.txt', 'w') as f:
    for doc in train:
        f.write(doc + '\n')

# Initialize TextDataset with the file path
dataset = TextDataset(tokenizer=tokenizer, file_path='tokenized_emails.txt', block_size=128)

# Define data collator (causal LM: no masked-language-modeling objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./output',
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model (relative path: the original "/fine_tuned_model"
# wrote to the filesystem root)
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

"""## Create a Gradio Interface"""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Initialize fine-tuned model and tokenizer
model_dir = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

# Create a text generation pipeline
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)


def question_answer(question):
    """Generate an answer to `question` with the fine-tuned model.

    The pipeline already returns fully decoded text, so we only strip the
    echoed prompt. (BUG FIX: the original split the decoded text on
    whitespace and called int() on each word to re-decode it as token IDs,
    which raised ValueError on any natural-language output.)
    """
    generated = text_gen(question, max_length=200, num_return_sequences=1)
    answer = generated[0]['generated_text'].replace(question, "").strip()
    return answer


# Set up gradio interface
iface = gr.Interface(fn=question_answer,
                     inputs='text',
                     outputs='text',
                     title='Fine-tuned Enron Question Answering',
                     description='Ask a question regarding the Enron case')
iface.launch()

"""## Deploy the Gradio Interface in a Huggingface Space"""