from transformers import pipeline
import gradio as gr
from video_downloader import download_video, download_video1
from moviepy.editor import AudioFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence

pipe = pipeline("automatic-speech-recognition", model="Artanis1551/whisper_romanian3")
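# The model is a Whisper checkpoint fine-tuned for Romanian; the pipeline accepts a filename,
# raw audio bytes, or a NumPy array and returns a dict with a "text" key.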


def process_video1(from_date, to_date):
    video_path = download_video1(from_date, to_date)

    # Extract audio from the video
    audio_path = f"audio_{from_date}_{to_date}.wav"
    AudioFileClip(video_path).write_audiofile(audio_path)

    # Split the audio into chunks
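    # (min_silence_len is in milliseconds; silence_thresh is in dBFS)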
    audio = AudioSegment.from_wav(audio_path)
    chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)

    # Transcribe each chunk
    transcription = ""
    for i, chunk in enumerate(chunks):
        # Export the chunk to a temporary WAV file, read it back as bytes, and transcribe it.
        chunk.export(f"chunk{i}.wav", format="wav")
        with open(f"chunk{i}.wav", "rb") as audio_file:
            chunk_bytes = audio_file.read()
        transcription += pipe(chunk_bytes)["text"] + "\n "
        os.remove(f"chunk{i}.wav")

    # Remove the audio file
    os.remove(audio_path)

    return video_path, transcription


def process_video(date):
    # Download the video
    video_path = download_video(date)

    # Extract the first 30 seconds of the video
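    # (ffmpeg_extract_subclip takes start and end times in seconds)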
    short_video_path = f"short_{date}.mp4"
    ffmpeg_extract_subclip(video_path, 0, 30, targetname=short_video_path)

    # Extract audio from the short video
    audio_path = f"audio_{date}.wav"
    AudioFileClip(short_video_path).write_audiofile(audio_path)

    # Split the audio into chunks
    audio = AudioSegment.from_wav(audio_path)
    chunks = split_on_silence(audio, min_silence_len=500, silence_thresh=-40)

    # Transcribe each chunk
    transcription = ""
    for i, chunk in enumerate(chunks):
        # Export the chunk to a temporary WAV file, read it back as bytes, and transcribe it.
        chunk.export(f"chunk{i}.wav", format="wav")
        with open(f"chunk{i}.wav", "rb") as audio_file:
            chunk_bytes = audio_file.read()
        transcription += pipe(chunk_bytes)["text"] + " "
        os.remove(f"chunk{i}.wav")

    # Remove the audio file
    os.remove(audio_path)

    return short_video_path, transcription


# iface = gr.Interface(
#     fn=process_video1,
#     inputs=[
#         gr.Textbox(label="From date with format YYYY-MM-DD"),
#         gr.Textbox(label="To date with format YYYY-MM-DD"),
#     ],
#     outputs=[
#         gr.Video(),
#         gr.Textbox(lines=1000, max_lines=1000, interactive=True),
#     ],
#     title="Swedish Transcription Test",
# )

iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Date with format YYYYMMDD"),
    outputs=[
        gr.Video(),
        gr.Textbox(lines=1000, max_lines=1000, interactive=True),
    ],
    title="Romanian Transcription Test",
)

iface.launch()
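# launch() serves the interface locally; share=True can be passed for a temporary public link.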