Spaces: Running on Zero
Update app with tag and masking model
app.py CHANGED
@@ -9,39 +9,40 @@ import re
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 # Load model and processor
-processor = WhisperProcessor.from_pretrained("aiola/whisper-ner-v1")
-model = WhisperForConditionalGeneration.from_pretrained("aiola/whisper-ner-v1")
+processor = WhisperProcessor.from_pretrained("aiola/whisper-ner-tag-and-mask-v1")
+model = WhisperForConditionalGeneration.from_pretrained("aiola/whisper-ner-tag-and-mask-v1")
 model = model.to(device)

-
 examples = [
     [
         "audio/sports.wav",
-        "football-club, football-player, …
+        "football-club, football-player, referee",
+        False
     ],
     [
         "audio/entertainment.wav",
-        "movie, date, actor, tv-show, musician"
+        "movie, date, actor, tv-show, musician",
+        True
     ],
     [
         "audio/672-122797-0026.wav",
-        "biological-classification, desire, demographic-group, object-category, relationship-role, reflexive-pronoun, furniture-type"
+        "biological-classification, desire, demographic-group, object-category, relationship-role, reflexive-pronoun, furniture-type",
+        False
     ],
-    [
-        "audio/…
-        "action…
+    [
+        "audio/672-122797-0027.wav",
+        "action, emotional-resilience, comparative-path-characteristic, social-role",
+        True
     ],
     [
         "audio/672-122797-0024.wav",
-        "health-warning, importance-indicator, event, sentiment"
-    ],
-    [
-        "audio/672-122797-0027.wav",
-        "action, emotional-resilience, comparative-path-characteristic, social-role"
+        "health-warning, importance-indicator, event, sentiment",
+        False
     ],
     [
         "audio/672-122797-0048.wav",
-        "weapon, emotional-state, household-chore, atmosphere-quality"
+        "weapon, emotional-state, household-chore, atmosphere-quality",
+        False
     ],
 ]

@@ -54,8 +55,8 @@ def unify_ner_text(text, symbols_to_replace=("/", " ", ":", "_")):
     return text.lower()


-def extract_entities_and_clean_text_fixed(text):
-    entity_pattern = r"<(.*?)>(.*?)<\1>>"
+def extract_entities_and_clean_text_fixed(text, ner_mask=False):
+    entity_pattern = r"<(.*?)>(.*?)<\1>>" if not ner_mask else r"<(.*?)>>"
     entities = []
     clean_text = []
     current_pos = 0
@@ -66,7 +67,7 @@ def extract_entities_and_clean_text_fixed(text):
         clean_text.append(text[current_pos:match.start()])

         entity_type = match.group(1)
-        entity_text = match.group(2)
+        entity_text = "-" if ner_mask else match.group(2)
         start_pos = len("".join(clean_text))  # Start position in the clean text
         end_pos = start_pos + len(entity_text)

@@ -94,7 +95,7 @@ def extract_entities_and_clean_text_fixed(text):


 @spaces.GPU  # This decorator ensures your function can use GPU on Hugging Face Spaces
-def transcribe_and_recognize_entities(audio_file, prompt):
+def transcribe_and_recognize_entities(audio_file, prompt, ner_mask=False):
     target_sample_rate = 16000
     signal, sampling_rate = torchaudio.load(audio_file)
     resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=target_sample_rate)
@@ -108,6 +109,8 @@ def transcribe_and_recognize_entities(audio_file, prompt):
     ner_types = prompt.split(',')
     processed_ner_types = [unify_ner_text(ner_type.strip()) for ner_type in ner_types]
     prompt = ", ".join(processed_ner_types)
+    if ner_mask:
+        prompt = f"<|mask|>{prompt}"

     print(f"Prompt after unify_ner_text: {prompt}")
     prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt")
@@ -122,36 +125,44 @@ def transcribe_and_recognize_entities(audio_file, prompt):
     )
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-    clean_text_fixed, extracted_entities_fixed = extract_entities_and_clean_text_fixed(transcription)
+    clean_text_fixed, extracted_entities_fixed = extract_entities_and_clean_text_fixed(transcription, ner_mask=ner_mask)

     return transcription, {"text": clean_text_fixed, "entities": extracted_entities_fixed}


 with gr.Blocks(title="WhisperNER v1") as demo:
-
     gr.Markdown(
         """
-        # Whisper-NER: ASR with zero-shot NER
-
+        # 🔥 Whisper-NER: ASR with zero-shot NER
         WhisperNER is a unified model for automatic speech recognition (ASR) and named entity recognition (NER), with zero-shot capabilities.
         The WhisperNER model is designed as a strong base model for the downstream task of ASR with NER, and can be fine-tuned on specific datasets for improved performance.
+
+        The [aiola/whisper-ner-tag-and-mask-v1](https://huggingface.co/aiola/whisper-ner-tag-and-mask-v1) model was fine-tuned from
+        the [aiola/whisper-ner-v1](https://huggingface.co/aiola/whisper-ner-v1) checkpoint using the NuNER dataset to perform joint audio transcription and NER tagging or NER masking.
+        The model was not trained on PII-specific datasets, hence it can perform general and open-type entity masking.
+        It should be further fine-tuned in order to be used for PII detection. The model was trained and evaluated only on English data. Check out the paper for full details.

         ## Links
-
-        * …
-        * …
-        * Code: https://github.com/aiola-lab/whisper-ner
+        * 📄 Paper: [WhisperNER: Unified Open Named Entity and Speech Recognition](https://arxiv.org/abs/2409.08107)
+        * 🤗 [WhisperNER model collection](https://huggingface.co/collections/aiola/whisperner-6723f14506f3662cf3a73df2)
+        * 💻 Code: https://github.com/aiola-lab/whisper-ner
         """
     )

     with gr.Row() as row1:
         with gr.Column() as col1:
-            audio_input = gr.Audio(label="Audio Example", type="filepath")
+            audio_input = gr.Audio(value=examples[0][0], label="Audio Example", type="filepath")
         with gr.Column() as col2:
-            label_input = gr.Textbox(label="Entity Labels")
+            label_input = gr.Textbox(label="Entity Labels", value=examples[0][1])
+            ner_mask = gr.Checkbox(
+                value=examples[0][2],
+                label="Entity Mask",
+                info="Mask or tag entities in the transcription.",
+                scale=0,
+            )

     submit_btn = gr.Button("Submit")
-
+
     gr.Markdown("## Output")

     with gr.Row() as row3:
@@ -163,7 +174,7 @@ with gr.Blocks(title="WhisperNER v1") as demo:
     examples = gr.Examples(
         examples,
         fn=transcribe_and_recognize_entities,
-        inputs=[audio_input, label_input],
+        inputs=[audio_input, label_input, ner_mask],
         outputs=[transcript_output, highlighted_text_output],
         cache_examples=True,
         run_on_click=True,
@@ -172,12 +183,12 @@ with gr.Blocks(title="WhisperNER v1") as demo:
     # Submitting
     label_input.submit(
         fn=transcribe_and_recognize_entities,
-        inputs=[audio_input, label_input],
+        inputs=[audio_input, label_input, ner_mask],
         outputs=[transcript_output, highlighted_text_output],
     )
     submit_btn.click(
         fn=transcribe_and_recognize_entities,
-        inputs=[audio_input, label_input],
+        inputs=[audio_input, label_input, ner_mask],
         outputs=[transcript_output, highlighted_text_output],
     )
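For reference, the parsing change above expects the model to emit entities inline in the transcription: as <type>entity text<type>> spans when tagging, or as bare <type>> markers when the <|mask|> prompt prefix is used. The snippet below is a minimal, self-contained sketch of that post-processing, reusing the regexes from the diff; the function name, the sample transcription strings, and the player/club names are illustrative assumptions, not verbatim WhisperNER output.

import re

def extract_entities(text, ner_mask=False):
    # Same patterns as in app.py: tagged spans look like <type>entity<type>>,
    # masked spans collapse to a bare <type>> marker.
    pattern = r"<(.*?)>(.*?)<\1>>" if not ner_mask else r"<(.*?)>>"
    entities, clean_parts, pos = [], [], 0
    for m in re.finditer(pattern, text):
        clean_parts.append(text[pos:m.start()])
        entity_type = m.group(1)
        entity_text = "-" if ner_mask else m.group(2)  # masked entities keep only a placeholder
        start = len("".join(clean_parts))
        # Dict keys follow Gradio's HighlightedText format; the app's own helper may differ in detail.
        entities.append({"entity": entity_type, "start": start, "end": start + len(entity_text)})
        clean_parts.append(entity_text)
        pos = m.end()
    clean_parts.append(text[pos:])
    return "".join(clean_parts), entities

# Illustrative inputs only -- not verbatim model transcriptions.
tagged = "<football-player>Messi<football-player>> plays for <football-club>Inter Miami<football-club>>"
masked = "<football-player>> plays for <football-club>>"
print(extract_entities(tagged, ner_mask=False))
print(extract_entities(masked, ner_mask=True))

With masking enabled, the cleaned text comes back with each entity replaced by a placeholder, which is the behaviour the <|mask|> prompt prefix is meant to trigger in the model itself; with tagging, the entity text is kept and only the markers are stripped.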