Commit b536abf by juliensimon (HF staff)
Parent: ead8a38

Training in progress, epoch 3

Files changed (2):
  1. pytorch_model.bin +1 -1
  2. train-xlm.py +114 -0
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b99978a873521a77c36ee174badbc6198d2fbd242c9618c12a524f45b64a14d2
+ oid sha256:d62f1c5ab88bf2f7b3820b4b411f1b51a423796b4c6ad6fa37f8e21629d5c28d
  size 3114359925
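
Because pytorch_model.bin is tracked with Git LFS, the diff above only touches the pointer file: each saved epoch swaps in a new sha256 oid while the payload size stays at 3114359925 bytes (about 3.1 GB). As a minimal sketch, this exact checkpoint could be pulled down with huggingface_hub; the repo id below is an assumption inferred from the TrainingArguments output directory in train-xlm.py.

# Hypothetical fetch of the checkpoint added in this commit.
# repo_id is an assumption (taken from the training script's output
# directory); revision pins the commit shown at the top of this page.
from huggingface_hub import hf_hub_download

checkpoint_path = hf_hub_download(
    repo_id="juliensimon/xlm-v-base-language-id",  # assumed repo id
    filename="pytorch_model.bin",
    revision="b536abf",
)
print(checkpoint_path)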
train-xlm.py ADDED
@@ -0,0 +1,114 @@
+ import evaluate
+ import numpy as np
+ from datasets import load_dataset
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForSequenceClassification,
+     Trainer,
+     TrainingArguments,
+ )
+
+ dataset_id = "google/fleurs"
+ model_id = "facebook/xlm-v-base"
+ metric_name = "accuracy"
+
+ # Keep only the raw transcription and the language id (which we'll use as label)
+ columns_to_remove = [
+     "audio",
+     "id",
+     "num_samples",
+     "path",
+     "transcription",
+     "gender",
+     "language",
+     "lang_group_id",
+ ]
+
+ train, val = load_dataset(
+     dataset_id, "all", split=["train", "validation"], ignore_verifications=True
+ )
+
+ # Build the label2id and id2label dictionaries
+
+ unique_langs = set()
+ label2id = {}
+ id2label = {}
+ for lang, lang_id in zip(val["language"], val["lang_id"]):
+     if lang not in unique_langs:
+         unique_langs.add(lang)
+         id2label[lang_id] = lang
+         label2id[lang] = lang_id
+
+ id2label = dict(sorted(id2label.items(), key=lambda item: item[0]))
+ label2id = dict(sorted(label2id.items(), key=lambda item: item[1]))
+
+ train = train.remove_columns(columns_to_remove)
+ val = val.remove_columns(columns_to_remove)
+ train = train.rename_column("raw_transcription", "text")
+ val = val.rename_column("raw_transcription", "text")
+ train = train.rename_column("lang_id", "label")
+ val = val.rename_column("lang_id", "label")
+
+ train = train.shuffle(seed=42)
+ val = val.shuffle(seed=42)
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+
+ def preprocess(data):
+     return tokenizer(data["text"], truncation=True)
+
+
+ processed_train = train.map(preprocess, batched=True)
+ processed_val = val.map(preprocess, batched=True)
+
+ print(processed_train)
+ print(processed_val)
+
+ # Fine-tune the model
+
+ model = AutoModelForSequenceClassification.from_pretrained(
+     model_id,
+     num_labels=len(id2label),
+     label2id=label2id,
+     id2label=id2label,
+     ignore_mismatched_sizes=True,
+ )
+
+ args = TrainingArguments(
+     "xlm-v-base-language-id",
+     learning_rate=3e-5,
+     warmup_ratio=0.1,
+     per_device_train_batch_size=16,
+     gradient_accumulation_steps=4,
+     per_device_eval_batch_size=16,
+     num_train_epochs=5,
+     load_best_model_at_end=True,
+     metric_for_best_model=metric_name,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     logging_steps=10,
+     fp16=True,
+     push_to_hub=True,
+ )
+
+ metric = evaluate.load(metric_name)
+
+
+ def compute_metrics(eval_pred):
+     predictions = np.argmax(eval_pred.predictions, axis=1)
+     return metric.compute(predictions=predictions, references=eval_pred.label_ids)
+
+
+ trainer = Trainer(
+     model,
+     args,
+     train_dataset=processed_train,
+     eval_dataset=processed_val,
+     tokenizer=tokenizer,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
+
+ trainer.save_model("./my_model")
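
The script saves and pushes a checkpoint at each epoch (push_to_hub=True with save_strategy="epoch"), which is what produced this "epoch 3" commit; note also that per_device_train_batch_size=16 with gradient_accumulation_steps=4 gives an effective batch size of 64 per device. A minimal usage sketch for the finished classifier, assuming the Hub repo id "juliensimon/xlm-v-base-language-id" (inferred from the output directory above) and that labels are the FLEURS language names:

# Hypothetical inference with the fine-tuned language-id model.
# The repo id is an assumption; the label strings come from the
# FLEURS "language" column used to build id2label above.
from transformers import pipeline

classifier = pipeline(
    "text-classification", model="juliensimon/xlm-v-base-language-id"
)
print(classifier("Bonjour, comment allez-vous ?"))
# -> [{"label": "<language name>", "score": <float>}]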