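"""Gradio Space: multilingual speech transcription with optional translation,
plus summarization and Q&A over the resulting transcript."""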
import os
import gradio as gr
import torch
from transformers import pipeline
import logging
import sys
import spaces
from chunkedTranscriber import ChunkedTranscriber
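
# Log to stdout so messages show up in the Space's container logs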
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
def load_qa_model():
    """Load question-answering model with support for long input contexts."""
    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"

        # Load the tokenizer and make sure it accepts long inputs
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.getenv("HF_TOKEN"))
        tokenizer.model_max_length = 8192  # Ensure the tokenizer can handle 8192 tokens

        # Load the model with dynamic RoPE scaling for long contexts
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            rope_scaling={
                "type": "dynamic",  # Ensure compatibility with long contexts
                "factor": 8.0
            },
            token=os.getenv("HF_TOKEN")
        )

        # Build the text-generation pipeline
        qa_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=4096,  # Adjust as needed for your use case
        )
        return qa_pipeline
    except Exception as e:
        logger.error(f"Failed to load Q&A model: {str(e)}")
        return None
def load_summarization_model():
    """Load summarization model"""
    try:
        summarizer = pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6",
            device=0 if torch.cuda.is_available() else -1
        )
        return summarizer
    except Exception as e:
        logger.error(f"Failed to load summarization model: {str(e)}")
        return None
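
# The `spaces` import is only used through this decorator: on ZeroGPU Spaces,
# @spaces.GPU requests a GPU for the duration of the call. (Assumption: this
# Space targets ZeroGPU hardware; drop the decorator otherwise.)
@spaces.GPU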
def process_audio(audio_file, translate=False):
    """Transcribe an audio file in overlapping chunks, optionally translating to English."""
    transcriber = ChunkedTranscriber(chunk_size=5, overlap=1)
    # Honor the UI's translate flag instead of hard-coding translate=True
    translation, full_text = transcriber.transcribe_audio(audio_file, translate=translate)
    return translation, full_text
def summarize_text(text):
    """Summarize text"""
    try:
        summarizer = load_summarization_model()
        if summarizer is None:
            return "Summarization model could not be loaded."
        logger.info("Successfully loaded summarization model")
        # Truncate inputs that exceed the model's maximum input length
        summary = summarizer(text, max_length=150, min_length=50, do_sample=False, truncation=True)[0]['summary_text']
        return summary
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}")
        return "Error occurred during summarization."
def answer_question(context, question):
    """Answer questions about the transcribed text"""
    try:
        qa_pipeline = load_qa_model()
        if qa_pipeline is None:
            return "Q&A model could not be loaded."
        if not question:
            return "Please enter a question."
        # Chat-style prompt: a detailed intelligence-analysis system message,
        # followed by the user's question grounded in the transcript
        messages = [
            {"role": "system", "content": """
Analyze a translated transcript of a conversation that may contain multiple speakers and summarize the information in a structured intelligence document.
The input format will include word-level or sentence-level timestamps, each indicating the speaker ID, language, and translated text.

# Input Format Overview

Word-Level Timestamps Example:
```
[Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Word>"
```
Example:
```
0.01-0.02 - Speaker 1 - Language: English - Translated Text: "Proceed"
0.02-0.025 - Speaker 1 - Language: English - Translated Text: "with"
0.025-0.032 - Speaker 2 - Language: English - Translated Text: "caution"
```
Optional Sentence-Level Structure Example:
```
[Start Time - End Time] - Speaker <ID> - Language: <Translated Language> - Translated Text: "<Sentence>"
```
Example with Sentence Grouping:
```
0.01-0.05 - Speaker 1 - Language: English - Translated Text: "Proceed with caution."
0.06-0.12 - Speaker 2 - Language: English - Translated Text: "All systems are ready."
```

# Intelligence Summary Document Structure

Use the format below to create a structured summary for each conversation transcript received:

### 1. Top-Level Status & Assessment:
- **Threat Level Assessment**:
  - Choose one:
    - Completely Innocuous
    - Likely Innocuous
    - Unclear - Requires Investigation
    - Likely Dangerous - Immediate Action
    - Likely Dangerous - Delayed Action
    - 100% Dangerous - Immediate Action
    - 100% Dangerous - Delayed Action
- **Humanitarian Alert**: Identify any indications of distress, coercion, or need for assistance, such as signs of duress or requests for help.

### 2. Basic Metadata:
- **Number of Speakers**: Total and unique speakers detected.
- **Languages**: List of languages used, with indication of who spoke which language.
- **Location**: Actual or inferred locations of participants.
- **Communication Medium**: Identify the method of interaction (e.g., phone call, direct conversation).

### 3. Conversation Overview:
- **Summary**: Concise breakdown of the main points and context.
- **Alarming Keywords**: Identify any concerning words, including but not limited to keywords like "kill," "attack," "weapon," etc.
- **Suspicious or Cryptic Phrases**: Statements that appear coded or unclear in the context of the discussion.

### 4. In-Depth Analysis:
- **Network Connections**: Identify mentions of additional individuals or groups involved.
- **Intent & Emotional Tone Detection**: Analyze emotional cues (e.g., anger, fear, calmness, urgency). Identify signs of deception or tension.
- **Behavioral Patterns**: Highlight repeated themes, phrases, or signals of planning and coordination.
- **Code Words & Cryptic Language**: Detect terms that may indicate hidden or covert meaning.
- **Geolocation References**: Point out any inferences regarding regional language or place names.
- **Sentiment on Strategic Issues**: Identify any indication of radical, dissenting, or anti-national views that could imply unrest or extremism.

### 5. Resource Mentions & Operational Logistics:
- **Resource & Asset Mentions**: List any mention of tools, weapons, vehicles, or supply logistics.
- **Behavioral Deviations**: Identify shifts in tone, speech, or demeanor suggesting stress, coercion, urgency, or preparation.

### 6. Prioritization, Recommendations & Actionables:
- **High-Risk Alert Priority**: Identify whether the conversation should be flagged for further attention.
- **Recommended Actions**:
  - **Surveillance**: Suggest surveillance if concerning patterns or keywords are detected.
  - **Intervention**: Recommend intervention for urgent/high-risk cases.
  - **Humanitarian Assistance**: Suggest immediate support for any signs of distress.
  - **Follow-Up Analysis**: Identify statements that need deeper review for clarity or to understand potential hidden meanings.

# Steps

1. Analyze the input conversation for participant information and context.
2. Fill in each section of the Intelligence Summary Document structure.
3. Ensure all details, especially those related to potential risk factors or alerts, are captured and highlighted clearly.

# Output Format

Provide one structured Intelligence Summary Document for the conversation in either plain text format or structured JSON.

# JSON Format Example:
```json
{
  "Top-Level Status & Assessment": {
    "Threat Level Assessment": "Unclear - Requires Investigation",
    "Humanitarian Alert": "No distress signals detected."
  },
  "Basic Metadata": {
    "Number of Speakers": 2,
    "Languages": {
      "Speaker 1": "English",
      "Speaker 2": "English"
    },
    "Location": "Unknown",
    "Communication Medium": "Direct conversation"
  },
  "Conversation Overview": {
    "Summary": "A cautious approach was suggested by Speaker 1, followed by an assurance from Speaker 2 that systems are ready.",
    "Alarming Keywords": [],
    "Suspicious or Cryptic Phrases": []
  },
  "In-Depth Analysis": {
    "Network Connections": "None identified",
    "Intent & Emotional Tone Detection": "Calm, precautionary tone",
    "Behavioral Patterns": "Speaker 1 expressing concern, Speaker 2 providing assurance",
    "Code Words & Cryptic Language": [],
    "Geolocation References": [],
    "Sentiment on Strategic Issues": "No radical or dissenting sentiment detected"
  },
  "Resource Mentions & Operational Logistics": {
    "Resource & Asset Mentions": [],
    "Behavioral Deviations": "None noted"
  },
  "Prioritization, Recommendations & Actionables": {
    "High-Risk Alert Priority": "Low",
    "Recommended Actions": {
      "Surveillance": "No further surveillance needed.",
      "Intervention": "Not required.",
      "Humanitarian Assistance": "Not required.",
      "Follow-Up Analysis": "No unusual phrases detected requiring review."
    }
  }
}
```

# Notes
- Ensure that you mark any ambiguous segments as requiring further investigation.
- Pay attention to emotional tone shifts or sudden changes in behavior.
- If any direct or implied threat is detected, prioritize appropriately using the provided classifications.
- Err on the side of caution: if there is even a remote possibility that something requires human attention, flag it.
"""}, | |
{"role": "user", "content": f"Context: {text}\n\nQuestion: {question}"} | |
] | |
        # With chat-style messages, the pipeline returns the whole conversation in
        # 'generated_text'; the assistant's answer is the last message's content
        output = qa_pipeline(messages, max_new_tokens=256)
        response = output[0]['generated_text'][-1]['content']
        return response
    except Exception as e:
        logger.error(f"Q&A failed: {str(e)}")
        return f"Error occurred during Q&A process: {str(e)}"
# Create Gradio interface
with gr.Blocks() as iface:
    gr.Markdown("# Automatic Speech Recognition for Indic Languages")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath")
            translate_checkbox = gr.Checkbox(label="Enable Translation")
            process_button = gr.Button("Process Audio")
        with gr.Column():
            full_text_output = gr.Textbox(label="Full Text", lines=5)
            translation_output = gr.Textbox(label="Transcription/Translation", lines=10)
    with gr.Row():
        with gr.Column():
            summarize_button = gr.Button("Summarize")
            summary_output = gr.Textbox(label="Summary", lines=3)
        with gr.Column():
            question_input = gr.Textbox(label="Ask a question about the transcription")
            answer_button = gr.Button("Get Answer")
            answer_output = gr.Textbox(label="Answer", lines=3)

    # Set up event handlers
    process_button.click(
        process_audio,
        inputs=[audio_input, translate_checkbox],
        outputs=[translation_output, full_text_output]
    )
    summarize_button.click(
        summarize_text,
        inputs=[translation_output],
        outputs=[summary_output]
    )
    answer_button.click(
        answer_question,
        inputs=[full_text_output, question_input],
        outputs=[answer_output]
    )
    # Add system information
    gr.Markdown(f"""
    ## System Information
    - Device: {"CUDA" if torch.cuda.is_available() else "CPU"}
    - CUDA Available: {"Yes" if torch.cuda.is_available() else "No"}

    ## Features
    - Automatic language detection
    - High-quality transcription using MMS
    - Optional translation to English
    - Text summarization
    - Question answering
    """)
if __name__ == "__main__":
    iface.launch()