# Schema version for tracking updates to the schema format
schema-version: "v0.5.0"
# Component Information
component-name: odtp-pyannote-whisper
component-version: "v0.1.1"
component-license: Apache 2.0
component-type: ephemeral
component-description: Transcribe or translate audio files with Whisper, using pyannote for speaker diarization
component-authors:
- name: Carlos Vivar Rios
orcid: null
component-repository:
url: "https://github.com/odtp-org/odtp-pyannote-whisper"
doi: null
component-docker-image: null
tags:
- audio
- transcription
- translation
- whisper
- pyannote
# Tool Information
tools:
- tool-name: whisper
tool-authors:
- name: OpenAI
orcid: null
tool-version: latest
tool-repository:
url: "https://github.com/openai/whisper"
doi: null
tool-license: MIT
- tool-name: pyannote
tool-authors:
- name: Hervé Bredin
orcid: null
tool-version: latest
tool-repository:
url: "https://github.com/pyannote/pyannote-audio"
doi: null
tool-license: MIT
# Secrets (ENV variables)
secrets:
- name: HF_TOKEN
description: Hugging Face access token required to download the gated pyannote models
type: str
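# Illustrative only (not part of the schema): a component like this typically reads
# HF_TOKEN from the environment when loading a gated pyannote pipeline, e.g.:
#   import os
#   from pyannote.audio import Pipeline
#   pipeline = Pipeline.from_pretrained(
#       "pyannote/speaker-diarization-3.1",      # assumed checkpoint; see the repo
#       use_auth_token=os.environ["HF_TOKEN"],
#   )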
# Build Arguments (if any)
build-args: null
# Exposed Ports
ports: null
# Parameters for the Component
parameters:
- name: MODEL
default-value: large-v3
datatype: str
description: Whisper model to use for transcription/translation
parameter-bounds: null
options:
- tiny
- base
- small
- medium
- large
- large-v2
- large-v3
allow-custom-value: false
- name: TASK
default-value: transcribe
datatype: str
description: Task to perform (transcribe in the source language or translate into English)
parameter-bounds: null
options:
- transcribe
- translate
allow-custom-value: false
- name: LANGUAGE
default-value: auto
datatype: str
description: Source language code (use 'auto' for auto-detection)
parameter-bounds: null
options:
- auto
- en
- es
- fr
- de
- it
- pt
- nl
- ja
- zh
- ru
allow-custom-value: true
# Data Inputs
data-inputs:
- name: INPUT_FILE
type: .wav
path: /odtp/odtp-input
description: Input audio file in WAV format
naming-convention: null
# Data Outputs
data-outputs:
- name: OUTPUT_FILE
type: .srt
path: /odtp/odtp-output
description: Transcription/translation output in SRT format with speaker diarization
naming-convention: null
- name: OUTPUT_JSON_FILE
type: .json
path: /odtp/odtp-output
description: Transcription/translation output in JSON format with speaker diarization
naming-convention: null
- name: OUTPUT_AUDIO_FILE
type: .wav
path: /odtp/odtp-output
description: Audio file in WAV format
naming-convention: null
- name: OUTPUT_PARAGRAPHS_FILE
type: .json
path: /odtp/odtp-output
description: Paragraph-level transcription/translation output in JSON format with speaker diarization
naming-convention: null
- name: OUTPUT_MD_FILE
type: .md
path: /odtp/odtp-output
description: Markdown file with the speaker diarization and transcription/translation
naming-convention: null
- name: OUTPUT_PDF_FILE
type: .pdf
path: /odtp/odtp-output
description: PDF file with the speaker diarization and transcription/translation
naming-convention: null
# Validation Schemas (Future Development)
schema-input: null
schema-output: null
# Device Requirements
devices:
- type: gpu
required: true
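# Illustrative only (not part of the schema): a typical invocation, assuming the usual
# ODTP convention of passing secrets and parameters as environment variables and
# mounting the input/output folders declared above:
#   docker run --gpus all \
#     -e HF_TOKEN=<your-token> \
#     -e MODEL=large-v3 -e TASK=transcribe -e LANGUAGE=auto \
#     -e INPUT_FILE=recording.wav \
#     -v $(pwd)/input:/odtp/odtp-input \
#     -v $(pwd)/output:/odtp/odtp-output \
#     <image-name>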