seemapatil committed on
Commit 690a0b1 · 1 Parent(s): 1237635

Create text_tagging.py

Files changed (1)
  1. text_tagging.py +47 -0
text_tagging.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ import subprocess
+ import sys
+
+ import evaluate
+ import numpy as np
+ from datasets import Dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
+
+ # Read the requirements.txt file
+ with open('requirements.txt', 'r') as req_file:
+     requirements = req_file.read().splitlines()
+
+ # Install the required libraries (one option, as suggested: pip via subprocess)
+ for requirement in requirements:
+     subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])
+
+ # Load the IMDB dataset in JSON format
+ with open('IMDB Dataset.json', 'r') as json_file:
+     imdb_data = json.load(json_file)
+
+ # Keep only the first 30 words of each review
+ preprocessed_data = []
+ for entry in imdb_data:
+     words = entry['text'].split()[:30]
+     preprocessed_data.append({
+         'text': ' '.join(words),
+         'label': entry['label'],
+     })
+
+ # Convert the preprocessed records to a Hugging Face Dataset
+ dataset = Dataset.from_list(preprocessed_data)
+
+ # Tokenize the dataset
+ tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ # Fine-tune the Bloom model for binary sentiment classification
+ model = AutoModelForSequenceClassification.from_pretrained("bigscience/bloom-560m", num_labels=2)
+
+ training_args = TrainingArguments(output_dir="test_trainer")
+
+ metric = evaluate.load("accuracy")
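The file stops right after loading the accuracy metric, before the metric, model, and training arguments are wired together. A minimal sketch of how the usual Trainer-API continuation could look, assuming a held-out split and a compute_metrics helper (both are assumptions, not part of this commit):

# Continuation sketch (not in this commit): the train/test split and the
# compute_metrics name are illustrative assumptions, not the author's code.
import numpy as np
from transformers import Trainer

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Hold out 20% of the tokenized data for evaluation
split = tokenized_datasets.train_test_split(test_size=0.2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split["train"],
    eval_dataset=split["test"],
    compute_metrics=compute_metrics,
)
trainer.train()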