seemapatil committed on
Commit 690a0b1 · 1 Parent(s): 1237635

Create text_tagging.py

Files changed (1)
  1. text_tagging.py +47 -0
text_tagging.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ import subprocess
+ import sys
+
+ import evaluate
+ import numpy as np
+ from datasets import Dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
+
+ # Read the requirements.txt file
+ with open('requirements.txt', 'r') as req_file:
+     requirements = req_file.read().splitlines()
+
+ # Install the required libraries (one option, as suggested: pip via subprocess)
+ for requirement in requirements:
+     subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])
+
+ # Load the IMDB dataset in JSON format
+ with open('IMDB Dataset.json', 'r') as json_file:
+     imdb_data = json.load(json_file)
+
+ # Keep only the first 30 words of each review
+ preprocessed_data = []
+ for entry in imdb_data:
+     words = entry['text'].split()[:30]
+     preprocessed_data.append({
+         'text': ' '.join(words),
+         'label': entry['label'],
+     })
+
+ # Convert the preprocessed records to a Hugging Face Dataset
+ dataset = Dataset.from_list(preprocessed_data)
+
+ # Tokenize the dataset
+ tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ # Fine-tune the Bloom model for binary sentiment classification
+ model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloom-560m", num_labels=2)
+
+ training_args = TrainingArguments(output_dir="test_trainer")
+
+ metric = evaluate.load("accuracy")
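The file stops right after loading the accuracy metric, before the metric, model, and training arguments are wired together. A minimal sketch of how the usual Trainer-API continuation could look, assuming a held-out split and a compute_metrics helper (both are assumptions, not part of this commit):

# Continuation sketch (not in this commit): the train/test split and the
# compute_metrics name are illustrative assumptions, not the author's code.
import numpy as np
from transformers import Trainer

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Hold out 20% of the tokenized data for evaluation
split = tokenized_datasets.train_test_split(test_size=0.2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    compute_metrics=compute_metrics,
)
trainer.train()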