File size: 3,625 Bytes
6934a38
 
 
 
 
aaba65a
c72eec6
6934a38
 
 
 
 
 
 
 
 
2a1a32a
6934a38
 
 
 
 
2a1a32a
6934a38
2a1a32a
6934a38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a1a32a
6934a38
2a1a32a
6934a38
 
 
2a1a32a
6934a38
 
2a1a32a
6934a38
 
2a1a32a
6934a38
2a1a32a
6934a38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a1a32a
6934a38
 
 
 
2a1a32a
6934a38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a1a32a
6934a38
 
 
 
 
 
 
 
 
 
 
 
 
2a1a32a
6934a38
 
 
 
 
2a1a32a
c72eec6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6934a38
 
 
2a1a32a
6934a38
2a1a32a
6934a38
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Version of the schema format, kept to track schema updates
schema-version: 'v0.5.0'

# Component Information
# Name, version, license, authorship and repository of this component.
component-name: 'odtp-pyannote-whisper'
component-version: 'v0.1.1'
component-license: 'Apache 2.0'
component-type: 'ephemeral'
component-description: >-
  Transcribe or translate audio files using Whisper and Pyannote
  for speaker diarization
component-authors:
  - name: 'Carlos Vivar Rios'
    orcid: null
component-repository:
  url: 'https://github.com/odtp-org/odtp-pyannote-whisper'
  doi: null
component-docker-image: null
tags: [audio, transcription, translation, whisper, pyannote]

# Tool Information
# Upstream tools listed for this component: authors, version,
# repository and license of each.
tools:
  - tool-name: 'whisper'
    tool-authors:
      - name: 'OpenAI'
        orcid: null
    tool-version: 'latest'
    tool-repository:
      url: 'https://github.com/openai/whisper'
      doi: null
    tool-license: 'MIT'

  - tool-name: 'pyannote'
    tool-authors:
      - name: 'Hervé Bredin'
        orcid: null
    tool-version: 'latest'
    tool-repository:
      url: 'https://github.com/pyannote/pyannote-audio'
      doi: null
    tool-license: 'MIT'

# Secrets (ENV variables)
# Declared by name only; no secret value is stored in this file.
secrets:
  - name: 'HF_TOKEN'
    description: 'Hugging Face API token for accessing pyannote models'
    type: 'str'

# Build Arguments (if any)
# Set to null: no build arguments are declared in this file.
build-args: null

# Exposed Ports
# Set to null: no exposed ports are declared in this file.
ports: null

# Parameters for the Component
# Every entry carries a default value, a datatype, a list of options
# and an allow-custom-value flag.
parameters:
  # Whisper model choice (allow-custom-value: false).
  - name: 'MODEL'
    default-value: 'large-v3'
    datatype: 'str'
    description: 'Whisper model to use for transcription/translation'
    parameter-bounds: null
    options: [tiny, base, small, medium, large, large-v2, large-v3]
    allow-custom-value: false

  # Task choice (allow-custom-value: false).
  - name: 'TASK'
    default-value: 'transcribe'
    datatype: 'str'
    description: 'Task to perform (transcribe or translate)'
    parameter-bounds: null
    options: [transcribe, translate]
    allow-custom-value: false

  # Source language (allow-custom-value: true).
  - name: 'LANGUAGE'
    default-value: 'auto'
    datatype: 'str'
    description: "Source language code (use 'auto' for auto-detection)"
    parameter-bounds: null
    options: [auto, en, es, fr, de, it, pt, nl, ja, zh, ru]
    allow-custom-value: true

# Data Inputs
# A single WAV input is declared, located under /odtp/odtp-input.
data-inputs:
  - name: 'INPUT_FILE'
    type: '.wav'
    path: '/odtp/odtp-input'
    description: 'Input audio file in WAV format'
    naming-convention: null

# Data Outputs
# Every output below is declared under /odtp/odtp-output; each entry's
# description should agree with its declared `type` extension.
data-outputs:
  - name: OUTPUT_FILE
    type: .srt
    path: /odtp/odtp-output
    description: Transcription/translation output in SRT format with speaker diarization
    naming-convention: null

  - name: OUTPUT_JSON_FILE
    type: .json
    path: /odtp/odtp-output
    description: Transcription/translation output in JSON format with speaker diarization
    naming-convention: null

  - name: OUTPUT_AUDIO_FILE
    type: .wav
    path: /odtp/odtp-output
    description: Audio in wav format
    naming-convention: null

  # Declared as .json, so the description says JSON rather than Markdown
  # (the Markdown output is the separate OUTPUT_MD_FILE entry).
  - name: OUTPUT_PARAGRAPHS_FILE
    type: .json
    path: /odtp/odtp-output
    description: JSON file with the paragraphs containing speaker diarization and transcription/translation
    naming-convention: null

  - name: OUTPUT_MD_FILE
    type: .md
    path: /odtp/odtp-output
    description: Markdown file with the speaker diarization and transcription/translation
    naming-convention: null

  - name: OUTPUT_PDF_FILE
    type: .pdf
    path: /odtp/odtp-output
    description: PDF file with the speaker diarization and transcription/translation
    naming-convention: null

# Validation Schemas (Future Development)
# Both set to null: no input or output validation schema is declared.
schema-input: null
schema-output: null

# Device Requirements
# One device entry is declared: a GPU, marked as required.
devices:
  - type: 'gpu'
    required: true