no_speech_prob in pipeline?
#22
by
rizwanishaq
How can we get the no_speech_prob in the pipeline?
Hey! Currently, these outputs are not supported in the pipeline, mostly because they are Whisper-specific. I would recommend using the following code to get them:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_dataset
import torch

# Load a sample audio clip from the LibriSpeech dummy dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
input_speech = ds[40]["audio"]["array"]
# Initialize the model and the processor
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")
# Extract input features from the raw speech (LibriSpeech audio is sampled at 16 kHz)
input_features = processor(input_speech, sampling_rate=16000, return_tensors="pt")
# Generate with output_scores=True and return_dict_in_generate=True so the
# per-step scores are returned alongside the generated ids
output = model.generate(**input_features, output_scores=True, return_dict_in_generate=True)
# Stack the per-step scores and compute the probability of the target token
logits = torch.cat(output.scores)
probs = logits.softmax(dim=-1)
target_token = "<|nospeech|>"
target_token_id = processor.tokenizer.convert_tokens_to_ids(target_token)
target_token_prob = probs[:, target_token_id]
print(f"Probability of {target_token}: {target_token_prob}")
print(processor.decode(output.sequences[0]))
# '<|startoftranscript|><|en|><|startoflm|> A man said to the universe, Sir, I exist.<|endoftext|>'
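Note that output.scores holds the per-step logits after the logits processors have run, so a softmax is needed to turn them into probabilities. For reference, OpenAI's implementation reads the no-speech probability from the distribution predicted right after <|startoftranscript|>, which corresponds to the first entry, output.scores[0], here.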
Is there a way to set the compression ratio in the pipeline as well?
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torch

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = librispeech_dummy[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

# Return the per-step scores alongside the generated ids
outputs = model.generate(
    input_features, output_scores=True, return_dict_in_generate=True, max_new_tokens=128
)

# Log-probability of each generated token given its prefix
transition_scores = model.compute_transition_scores(
    outputs.sequences, outputs.scores, normalize_logits=True
)

pred_text = processor.batch_decode(outputs.sequences, skip_special_tokens=True)
# The token right after <|startoftranscript|> is the language token
pred_language = processor.batch_decode(outputs.sequences[:, 1:2], skip_special_tokens=False)
# Its probability is the exponential of the first transition log-probability
lang_prob = torch.exp(transition_scores[:, 0])

print(pred_text)
print(pred_language)
print(lang_prob)
I am using this code to get the language probability, but how to get the no_speech probability is still not clear.
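If it helps, here is a minimal sketch of how the no-speech probability could be read off your snippet above (reusing model, processor, and input_features). It assumes a recent transformers version that supports output_logits=True in generate, which returns the raw logits; otherwise the suppress_tokens logits processor may already have set the <|nospeech|> logit to -inf in outputs.scores:

no_speech_token = "<|nospeech|>"  # assumption: older checkpoints name this token "<|nocaptions|>"
no_speech_id = processor.tokenizer.convert_tokens_to_ids(no_speech_token)
outputs = model.generate(
    input_features, output_logits=True, return_dict_in_generate=True, max_new_tokens=128
)
# OpenAI's reference implementation reads the no-speech probability at the first
# decoding step, i.e. from the distribution predicted right after <|startoftranscript|>
no_speech_prob = outputs.logits[0].softmax(dim=-1)[:, no_speech_id]
print(no_speech_prob)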
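As for the compression ratio: one option is to compute it yourself from the decoded text. This sketch mirrors the zlib-based definition used in the openai/whisper repo (highly repetitive output compresses well, so a high ratio flags likely hallucination):

import zlib

def compression_ratio(text):
    # Ratio of raw byte length to zlib-compressed length, as in openai/whisper
    text_bytes = text.encode("utf-8")
    return len(text_bytes) / len(zlib.compress(text_bytes))

print(compression_ratio(pred_text[0]))

Recent transformers releases also document a compression_ratio_threshold kwarg for Whisper's long-form generation (used together with the temperature fallback), which should be forwardable through the pipeline's generate_kwargs, but I would double-check that your installed version supports it.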