# Spaces: katospiegel / odtp-pyannote-whisper (Running)
# File size: 3,625 Bytes

# Version of the schema format this file is written against,
# recorded so that updates to the schema format can be tracked.
schema-version: 'v0.5.0'

# Component metadata: name, version, license, type, description,
# authors, and where the source lives.
component-name: 'odtp-pyannote-whisper'
# Quoted so the version is always read as a string.
component-version: 'v0.1.1'
component-license: 'Apache 2.0'
component-type: 'ephemeral'
# Folded scalar: the two lines below are joined with a single space.
component-description: >-
  Transcribe or translate audio files using Whisper and Pyannote
  for speaker diarization
component-authors:
  - name: 'Carlos Vivar Rios'
    orcid: null
component-repository:
  url: 'https://github.com/odtp-org/odtp-pyannote-whisper'
  doi: null
# No docker image is declared (explicit null).
component-docker-image: null
# Keywords attached to the component.
tags:
  - 'audio'
  - 'transcription'
  - 'translation'
  - 'whisper'
  - 'pyannote'

# Third-party tools wrapped by this component, one list entry per tool.
# Each entry records the tool's authors, version, repository, and license.
tools:
  - tool-name: 'whisper'
    tool-authors:
      - name: 'OpenAI'
        orcid: null
    # No specific release is pinned; the value is the literal string "latest".
    tool-version: 'latest'
    tool-repository:
      url: 'https://github.com/openai/whisper'
      doi: null
    tool-license: 'MIT'

  - tool-name: 'pyannote'
    tool-authors:
      - name: 'Hervé Bredin'
        orcid: null
    # No specific release is pinned; the value is the literal string "latest".
    tool-version: 'latest'
    tool-repository:
      url: 'https://github.com/pyannote/pyannote-audio'
      doi: null
    tool-license: 'MIT'

# Secrets (ENV variables)
# Only the name, description, and datatype are listed here; no secret
# value is stored in this file.
secrets:
  - name: 'HF_TOKEN'
    description: 'Hugging Face API token for accessing pyannote models'
    type: 'str'

# Build Arguments (if any)
# None are declared: the value is an explicit null.
build-args: null

# Exposed Ports
# None are declared: the value is an explicit null.
ports: null

# Parameters for the Component
# Every entry carries a default, a datatype, a description, optional
# bounds, the list of offered options, and a flag stating whether values
# outside that list are allowed.
parameters:
  # Which Whisper model to run; restricted to the listed options.
  - name: 'MODEL'
    default-value: 'large-v3'
    datatype: 'str'
    description: 'Whisper model to use for transcription/translation'
    parameter-bounds: null
    options:
      - 'tiny'
      - 'base'
      - 'small'
      - 'medium'
      - 'large'
      - 'large-v2'
      - 'large-v3'
    allow-custom-value: false

  # Operation to run; restricted to the two listed options.
  - name: 'TASK'
    default-value: 'transcribe'
    datatype: 'str'
    description: 'Task to perform (transcribe or translate)'
    parameter-bounds: null
    options:
      - 'transcribe'
      - 'translate'
    allow-custom-value: false

  # Source language; values outside the list are allowed.
  # Codes are quoted on purpose: an unquoted code such as `no` would be
  # read as a boolean by YAML 1.1 parsers.
  - name: 'LANGUAGE'
    default-value: 'auto'
    datatype: 'str'
    description: "Source language code (use 'auto' for auto-detection)"
    parameter-bounds: null
    options:
      - 'auto'
      - 'en'
      - 'es'
      - 'fr'
      - 'de'
      - 'it'
      - 'pt'
      - 'nl'
      - 'ja'
      - 'zh'
      - 'ru'
    allow-custom-value: true

# Data Inputs
# Files the component expects, with their extension and location.
data-inputs:
  - name: 'INPUT_FILE'
    # Quoted so the leading dot is never subject to implicit typing.
    type: '.wav'
    path: '/odtp/odtp-input'
    description: 'Input audio file in WAV format'
    naming-convention: null

# Data Outputs
# Files the component produces. Every entry lists the file extension
# (`type`), the output directory (`path`), and a description that must
# agree with the declared extension. No naming convention is declared
# for any output (explicit null).
data-outputs:
  # Subtitle output.
  - name: OUTPUT_FILE
    type: .srt
    path: /odtp/odtp-output
    description: Transcription/translation output in SRT format with speaker diarization
    naming-convention: null

  # Same content as structured JSON.
  - name: OUTPUT_JSON_FILE
    type: .json
    path: /odtp/odtp-output
    description: Transcription/translation output in JSON format with speaker diarization
    naming-convention: null

  # Audio output.
  - name: OUTPUT_AUDIO_FILE
    type: .wav
    path: /odtp/odtp-output
    description: Audio in wav format
    naming-convention: null

  # Paragraph-level output; declared as JSON, distinct from the
  # Markdown output below.
  - name: OUTPUT_PARAGRAPHS_FILE
    type: .json
    path: /odtp/odtp-output
    description: JSON file with the paragraphs containing speaker diarization and transcription/translation
    naming-convention: null

  # Markdown rendering.
  - name: OUTPUT_MD_FILE
    type: .md
    path: /odtp/odtp-output
    description: Markdown file with the speaker diarization and transcription/translation
    naming-convention: null

  # PDF rendering.
  - name: OUTPUT_PDF_FILE
    type: .pdf
    path: /odtp/odtp-output
    description: PDF file with the speaker diarization and transcription/translation
    naming-convention: null

# Validation Schemas (Future Development)
# Both entries are explicit nulls: no input or output validation schema
# is defined in this file.
schema-input: null
schema-output: null

# Device Requirements
# Hardware devices the component declares, one list entry per device.
devices:
  - type: 'gpu'
    required: true  # the GPU is marked as required