File size: 4,939 Bytes
fb3c53c d9346bd fb3c53c 857507d fb3c53c 2c8e4b5 9d6e60d fb3c53c 2c8e4b5 fb3c53c 2c8e4b5 fb3c53c 9d6e60d fb3c53c 2c8e4b5 97bfe69 2c8e4b5 97bfe69 1b10bcc 97bfe69 2c8e4b5 8f8089e 2c8e4b5 f37ce91 2c8e4b5 bc091a0 2c8e4b5 0264b46 2c8e4b5 8f8089e f37ce91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import os

# HF_HOME has to be exported *before* any Hugging Face-backed library is
# imported: huggingface_hub (pulled in by faster_whisper) resolves its cache
# directories when it is first imported, so assigning the variable afterwards
# does not redirect the model cache.
os.environ["HF_HOME"] = "/app/.cache"

import chainlit as cl  # noqa: E402
from faster_whisper import WhisperModel  # noqa: E402
from openai import AsyncOpenAI  # noqa: E402

# Model path for the fine-tuned Whisper model
model_path = "jacktol/whisper-medium.en-fine-tuned-for-ATC-faster-whisper"

# Initialize the Whisper model and OpenAI client once at import time so every
# chat session shares the same instances.
whisper_model = WhisperModel(model_path, device="cpu", compute_type="float32")
client = AsyncOpenAI()

# System prompt for converting transcript to standard ATC syntax
system_prompt = """Convert the provided transcript into standard pilot-ATC syntax without altering the content.
Ensure that all runway and heading numbers are formatted correctly (e.g., '11L' for 'one one left'). Use standard
aviation phraseology wherever applicable. Maintain the segmentation of the transcript as provided, but exclude the timestamps.
Based on the context and segmentation of each transmission, label it as either 'ATC' or 'Pilot'. At the very beginning of your
response place a horizontal div with "---" and then line-break, and then add a H2 which says "Transcription", and then
proceed with the transcription."""
# Function to transcribe audio and return the concatenated transcript with segment info
def transcribe_audio(file_path):
    """Transcribe an audio file into a timestamped, line-per-segment transcript.

    Args:
        file_path: Path of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        A string with one line per segment in the form
        ``[<start>s -> <end>s] <text>`` (times with two decimals), joined by
        newlines and stripped of leading/trailing whitespace. Empty when the
        model yields no segments.
    """
    # The second element of the result is not needed for the transcript.
    segments, _info = whisper_model.transcribe(file_path, beam_size=5)
    lines = [
        f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}"
        for seg in segments
    ]
    return "\n".join(lines).strip()
# Full welcome message shown at the start of every chat session
_WELCOME_MESSAGE = """
## Welcome to the **ATC Transcription Assistant**
---
### What is this tool for?
This tool transcribes **Air Traffic Control (ATC)** audio using OpenAI’s **Whisper medium.en** model, fine-tuned for ATC communications. Developed as part of a research project, the fine-tuned **Whisper medium.en** model offers significant improvements in transcription accuracy for ATC audio.
---
### Performance
- **Fine-tuned Whisper medium.en WER**: 15.08%
- **Non fine-tuned Whisper medium.en WER**: 94.59%
- **Relative improvement**: 84.06%
While the fine-tuned model performs better, **we cannot guarantee the accuracy of the transcriptions**. For more details, see the [blog post](https://jacktol.net/posts/fine-tuning_whisper_on_atc_data), or check out the [project repository](https://github.com/jack-tol/fine-tuning-whisper-on-atc-data). Feel free to contact me at [contact@jacktol.net](mailto:contact@jacktol.net).
---
### How to Use
1. **Upload an ATC audio file**: Upload an audio file in **MP3** or **WAV** format containing ATC communications.
2. **View the transcription**: The tool will transcribe the audio and display the text on the screen.
3. **Transcribe another audio**: Click **New Chat** in the top-right to start a new transcription.
---
To get started, upload the audio below.
"""


# Start chat session
@cl.on_chat_start
async def on_chat_start():
    """Greet the user, collect one audio file, and stream its ATC transcript.

    Flow:
        1. Initialise the per-session ``transcription_counter`` if unset.
        2. Send the welcome message and ask for a WAV/MP3 upload.
        3. Transcribe the upload with ``transcribe_audio``.
        4. Send the timestamped transcript to the chat model together with
           ``system_prompt`` and stream the reply into a single message.

    Any exception is logged with ``print`` and reported to the user; it is
    never re-raised out of this handler.
    """
    try:
        # Initialize the session data
        if cl.user_session.get("transcription_counter") is None:
            cl.user_session.set("transcription_counter", 0)

        await cl.Message(content=_WELCOME_MESSAGE).send()

        # Prompt the user to upload an audio file
        files = await cl.AskFileMessage(
            content="",
            accept={
                "audio/wav": [".wav"],
                "audio/mpeg": [".mp3"],
            },
            max_size_mb=50,
            timeout=3600,
        ).send()
        if not files:
            # Nothing was uploaded (e.g. the prompt timed out): nothing to do.
            return

        audio_file = files[0]

        # Whisper inference is synchronous and CPU-bound; run it off the event
        # loop so other sessions stay responsive while this file is processed.
        transcription = await cl.make_async(transcribe_audio)(audio_file.path)

        if not transcription:
            # Do not ask the LLM to "reformat" an empty transcript.
            await cl.Message(
                content="No speech could be transcribed from the uploaded audio."
            ).send()
            return

        # Empty placeholder message that the streamed tokens are appended to
        msg = cl.Message(content="")
        await msg.send()

        # Process the transcription via the LLM
        stream = await client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": transcription},
            ],
            stream=True,
            model="gpt-4o",  # Use the appropriate model
            temperature=0,
        )

        # Stream the ATC-processed output
        async for part in stream:
            if not part.choices:
                # Some streamed chunks carry no choices; skip them instead of
                # raising IndexError.
                continue
            token = part.choices[0].delta.content or ""
            if token:
                await msg.stream_token(token)

        # Ensure the final token is sent and the message stream is complete
        await msg.send()
    except Exception as e:
        # Log any errors that occur during session initialization
        print(f"Error during on_chat_start: {str(e)}")
        # Tell the user as well, otherwise they are left with a blank or
        # hanging message. Reporting is best-effort and must not raise.
        try:
            await cl.Message(
                content=(
                    "Sorry, something went wrong while processing your audio. "
                    "Please start a new chat and try again."
                )
            ).send()
        except Exception as notify_error:
            print(f"Error while reporting failure to user: {str(notify_error)}")
# Stop chat session cleanup
@cl.on_stop
async def on_chat_stop():
    """Log that the stop hook ran.

    No session data or resources are actually released here yet; this is a
    placeholder for future cleanup.

    NOTE(review): this is registered with ``cl.on_stop``, while the log text
    talks about the session ending — confirm whether the stop hook or a
    session-end hook is the intended trigger.
    """
    notice = "Session ended, resources cleaned up."
    print(notice)