# wav2vec2-base-mal / push_to_hub.py
# Hub page header: aoxo, commit 1a7c2c6 (verified), "Create push_to_hub.py"
from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
import json
# Upload a locally trained Wav2Vec2 CTC model to the Hugging Face Hub together
# with a processor (feature extractor + tokenizer) built from a local vocab.

# Path to your local model directory and vocab file
local_model_path = './wav2vec2-base-mal'  # Directory with model checkpoints
vocab_path = './vocab.json'  # Path to your vocab.json file

# Hugging Face model ID (replace with your username)
model_id = "aoxo/wav2vec2-base-mal"

# Load vocab.
# The encoding is pinned to UTF-8 so that any non-ASCII tokens decode the same
# way on every platform instead of depending on the locale's default codec.
# The parsed dict is not used further down in this script; reading it here
# makes a missing or malformed vocab file fail before the model is loaded or
# anything is uploaded.
with open(vocab_path, 'r', encoding='utf-8') as f:
    vocab_dict = json.load(f)

# Create custom tokenizer from the same vocab file
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Create feature extractor for single-channel audio sampled at 16 kHz
# NOTE(review): return_attention_mask=False presumably matches how the
# checkpoint was trained -- confirm against the training configuration.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# Create processor bundling the feature extractor and the tokenizer
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

# Load the model from the checkpoint directory
model = Wav2Vec2ForCTC.from_pretrained(local_model_path)

# Push to Hugging Face Hub
model.push_to_hub(model_id)
processor.push_to_hub(model_id)
# NOTE(review): the processor already wraps this tokenizer, so this separate
# push is probably redundant -- kept to preserve the original behavior.
tokenizer.push_to_hub(model_id)

print(f"Model, processor, and tokenizer successfully pushed to {model_id}")