no_speech_prob in pipeline?

#22
by rizwanishaq - opened

How can we get the no_speech_prob in the pipeline?

Hey! Currently, these outputs are not yet supported, mostly because they are Whisper-specific. I would recommend using the following code to get them:

from transformers import WhisperTokenizerFast, WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_dataset
import datasets
import torch 

# Example: inspect the score of the Whisper "<|nospeech|>" token at every
# generation step, outside of the ASR pipeline.

# Load a sample utterance from the dummy LibriSpeech ASR dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
sample = ds[40]["audio"]
input_speech = sample["array"]

# Initialize the model and the processor
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")

# Extract input features from the raw speech. The audio is passed positionally,
# together with its sampling rate, so the processor routes it to the feature
# extractor.
input_features = processor(input_speech, sampling_rate=sample["sampling_rate"], return_tensors="pt")

# Generate with output_scores=True. return_dict_in_generate=True is required so
# that generate() returns an object exposing `.scores` and `.sequences` rather
# than a bare tensor of token ids.
output = model.generate(**input_features, output_scores=True, return_dict_in_generate=True)

# `output.scores` holds one tensor of per-token scores for each generated step;
# concatenating them yields one row per step for this single-sample batch.
last_logits = torch.cat(output.scores)

# The scores are unnormalized, so apply a softmax over the vocabulary axis to
# obtain probabilities.
# NOTE(review): `scores` are the processed scores; if the generation config
# suppresses the target token its probability will be 0 here -- confirm
# against the model's generation config.
step_probs = torch.softmax(last_logits.float(), dim=-1)

target_token = "<|nospeech|>"
target_token_id = processor.tokenizer.convert_tokens_to_ids(target_token)
target_token_prob = step_probs[:, target_token_id]
print(f"Probability of {target_token}: {target_token_prob}")
print(processor.tokenizer.decode(output.sequences[0]))
# Example output reported in the original post:
#  '<|startoftranscript|><|en|><|startoflm|> A man said to the universe, Sir, I exist.<|endoftext|>'

Hey! Currently, these outputs are not yet supported, mostly because they are Whisper-specific. I would recommend using the following code to get them:

from transformers import WhisperTokenizerFast, WhisperForConditionalGeneration, WhisperProcessor
from datasets import load_dataset
import datasets
import torch 

# Example: inspect the score of the Whisper "<|nospeech|>" token at every
# generation step, outside of the ASR pipeline.

# Load a sample utterance from the dummy LibriSpeech ASR dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
sample = ds[40]["audio"]
input_speech = sample["array"]

# Initialize the model and the processor
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")

# Extract input features from the raw speech. The audio is passed positionally,
# together with its sampling rate, so the processor routes it to the feature
# extractor.
input_features = processor(input_speech, sampling_rate=sample["sampling_rate"], return_tensors="pt")

# Generate with output_scores=True. return_dict_in_generate=True is required so
# that generate() returns an object exposing `.scores` and `.sequences` rather
# than a bare tensor of token ids.
output = model.generate(**input_features, output_scores=True, return_dict_in_generate=True)

# `output.scores` holds one tensor of per-token scores for each generated step;
# concatenating them yields one row per step for this single-sample batch.
last_logits = torch.cat(output.scores)

# The scores are unnormalized, so apply a softmax over the vocabulary axis to
# obtain probabilities.
# NOTE(review): `scores` are the processed scores; if the generation config
# suppresses the target token its probability will be 0 here -- confirm
# against the model's generation config.
step_probs = torch.softmax(last_logits.float(), dim=-1)

target_token = "<|nospeech|>"
target_token_id = processor.tokenizer.convert_tokens_to_ids(target_token)
target_token_prob = step_probs[:, target_token_id]
print(f"Probability of {target_token}: {target_token_prob}")
print(processor.tokenizer.decode(output.sequences[0]))
# Example output reported in the original post:
#  '<|startoftranscript|><|en|><|startoflm|> A man said to the universe, Sir, I exist.<|endoftext|>'

Is there a way to set the compression ratio in the pipeline as well?

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torch

# Example: transcribe one utterance with Whisper and report the probability the
# model assigned to the first generated token.

# Model and processor come from the same checkpoint.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# Take the first utterance of the dummy LibriSpeech validation split.
librispeech_dummy = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)
sample = librispeech_dummy[0]["audio"]

# Turn the waveform into model input features.
processed = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)
input_features = processed.input_features

# Ask generate() for a structured result that also carries per-step scores.
outputs = model.generate(
    input_features,
    output_scores=True,
    return_dict_in_generate=True,
    max_new_tokens=128,
)

# Normalized (log-space) score of each token that was actually selected.
transition_scores = model.compute_transition_scores(
    outputs.sequences,
    outputs.scores,
    normalize_logits=True,
)

pred_text = processor.batch_decode(outputs.sequences, skip_special_tokens=True)

# Position 1 of every sequence is decoded with special tokens kept;
# presumably this is the language token -- confirm against the tokenizer.
language_token_ids = outputs.sequences[:, 1:2]
pred_language = processor.batch_decode(language_token_ids, skip_special_tokens=False)

# Exponentiate the first transition score to turn it into a probability.
lang_prob = transition_scores[:, 0].exp()

for value in (pred_text, pred_language, lang_prob):
    print(value)

I am using this code to get the language probability, but how to get the no-speech probability is still not clear.

Sign up or log in to comment