Spaces:

WhisperTube
/

whispertube_backend

Runtime error

File size: 7,866 Bytes

import os
from abc import ABC, abstractmethod

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter, WebVTTFormatter


class Transcription(ABC):
    '''
    Abstract base shared by every transcription backend in this module.

    Stores the paths and subtitle format common to all backends and obliges
    subclasses to provide `generate_transcript` and `save_transcript`.
    '''

    def __init__(self, media_path, output_path, subtitle_format):
        '''
        Records where the media lives and where results should be written.

        Args:
            media_path: Path of the media file to transcribe.
            output_path: Output directory; joined onto the current working
                directory (an absolute path is therefore kept unchanged).
            subtitle_format: Subtitle format identifier, stored as given.
        '''
        # Resolving the Output Directory against the Working Directory
        resolved_output = os.path.join(os.getcwd(), output_path)

        # Dropping only the Extension; any directory part of media_path is kept
        stem, _extension = os.path.splitext(media_path)

        self.media_path = media_path
        self.output_path = resolved_output
        self.filename = stem
        self.subtitle_format = subtitle_format

    @abstractmethod
    def generate_transcript(self):
        '''
        Produces the transcript; must be implemented by subclasses
        '''

    @abstractmethod
    def save_transcript(self):
        '''
        Persists the transcript; must be implemented by subclasses
        '''

class YouTubeTranscriptAPI(Transcription):
    '''
    Fetches the captions already available on YouTube for a video (through
    youtube_transcript_api) and writes them out as SRT or WebVTT subtitles.
    '''

    def __init__(self, url, media_path, output_path, subtitle_format='srt', transcript_language='en'):
        '''
        Args:
            url: Address of the YouTube video; the video id is extracted from it.
            media_path: Path of the media file; without its extension it names
                the subtitle file.
            output_path: Output directory, resolved by the base class.
            subtitle_format: 'srt' or 'vtt' (case-insensitive).
            transcript_language: Language code requested when fetching.

        Raises:
            ValueError: If no video id can be found in `url`, or if
                `subtitle_format` is not one of the supported formats.
        '''
        super().__init__(media_path, output_path, subtitle_format)
        self.url = url
        self.video_id = self._extract_video_id(url)
        self.transcript_language = transcript_language
        self.supported_subtitle_formats = ['srt', 'vtt']

        # Normalising once so that validation and the later formatter lookup
        # agree on case ('SRT' used to pass validation and then fail on save)
        self.subtitle_format = self.subtitle_format.lower()

        # Raising instead of asserting: asserts are stripped under `python -O`
        if self.subtitle_format not in self.supported_subtitle_formats:
            raise ValueError(
                f'Unsupported subtitle format {subtitle_format!r}; '
                f'expected one of {self.supported_subtitle_formats}'
            )

    @staticmethod
    def _extract_video_id(url):
        '''
        Returns the video id found in a YouTube URL

        Handles `watch?v=<id>` (with or without further query parameters),
        `youtu.be/<id>` and `/embed/<id>`, `/shorts/<id>`, `/live/<id>`,
        `/v/<id>` paths. Raises ValueError when no id can be located.
        '''
        from urllib.parse import parse_qs, urlparse

        # Scheme-less input ('youtu.be/<id>') only parses into host + path
        # when it is marked as a network location
        normalised = url if '://' in url else '//' + url.lstrip('/')
        parsed = urlparse(normalised)

        # Standard watch URLs carry the id in the `v` query parameter
        from_query = parse_qs(parsed.query).get('v')
        if from_query:
            return from_query[0]

        path_parts = [part for part in parsed.path.split('/') if part]
        host = parsed.netloc.lower()
        if host in ('youtu.be', 'www.youtu.be') and path_parts:
            return path_parts[0]
        if len(path_parts) >= 2 and path_parts[0] in ('embed', 'shorts', 'live', 'v'):
            return path_parts[1]

        # Last resort: the previous `v=` text split, minus trailing parameters
        if 'v=' in url:
            legacy_id = url.split('v=')[1].split('&')[0]
            if legacy_id:
                return legacy_id

        raise ValueError(f'Could not find a video id in URL: {url!r}')

    def get_available_transcripts(self):
        '''
        Returns a dictionary of available transcripts & their info

        Keys are language codes; each value holds 'language', 'is_generated'
        and 'is_translatable' as reported by youtube_transcript_api.
        '''

        # Getting List of all Available Transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_id)

        # Converting the Available Transcripts to Dictionary
        return {
            transcript.language_code: {
                'language': transcript.language,
                'is_generated': transcript.is_generated,
                'is_translatable': transcript.is_translatable
            }
            for transcript in transcript_list
        }

    def generate_transcript(self):
        '''
        Fetches the transcript in the requested language and stores it on
        `self.transcript`
        '''
        self.transcript = YouTubeTranscriptApi.get_transcript(self.video_id, languages=[self.transcript_language])

    def save_transcript(self):
        '''
        Writes the transcript into file and returns the file path

        `generate_transcript` has to be called first so that
        `self.transcript` exists.

        Raises:
            ValueError: If `self.subtitle_format` has no matching formatter.
        '''

        # Getting the Formatter
        formatters = {'srt': SRTFormatter, 'vtt': WebVTTFormatter}
        formatter_class = formatters.get(self.subtitle_format.lower())
        if formatter_class is None:
            raise ValueError(f'Unsupported subtitle format {self.subtitle_format!r}')

        # Getting the Formatted Transcript
        formatted_transcript = formatter_class().format_transcript(self.transcript)

        # Writing the Formatted Transcript
        # NOTE(review): the file is written relative to `self.filename`, not
        # inside `self.output_path` -- confirm whether that is intended
        file_path = f'{self.filename}.{self.subtitle_format}'
        with open(file_path, 'w', encoding='utf-8') as transcript_file:
            transcript_file.write(formatted_transcript)
        return file_path


class Whisper(Transcription):
    '''
    Common constructor for the Whisper-based backends: stores the word-level
    switch and validates the subtitle format.
    '''

    def __init__(self, media_path, output_path, subtitle_format, word_level):
        '''
        Args:
            media_path: Path of the media file to transcribe.
            output_path: Output directory, resolved by the base class.
            subtitle_format: 'ass', 'srt' or 'vtt' (case-insensitive).
            word_level: Whether timestamps are produced per word rather than
                per segment.

        Raises:
            ValueError: If `subtitle_format` is not one of the supported formats.
        '''
        super().__init__(media_path, output_path, subtitle_format)
        self.word_level = word_level
        self.supported_subtitle_formats = ['ass', 'srt', 'vtt']

        # Storing the lowercase form so later exact comparisons against
        # 'ass' / 'srt' / 'vtt' match what was validated here
        self.subtitle_format = self.subtitle_format.lower()

        # Raising instead of asserting: asserts are stripped under `python -O`
        if self.subtitle_format not in self.supported_subtitle_formats:
            raise ValueError(
                f'Unsupported subtitle format {subtitle_format!r}; '
                f'expected one of {self.supported_subtitle_formats}'
            )


class FasterWhisper(Whisper):
    '''
    Transcription backend driven by a model whose `transcribe` call returns a
    `(segments, info)` pair (presumably a faster-whisper model -- confirm
    against the caller).
    '''

    def __init__(self, model, media_path, output_path, subtitle_format='srt', word_level=True):
        '''
        Args:
            model: Loaded model object used for transcription.
            media_path: Path of the media file to transcribe.
            output_path: Output directory, resolved by the base class.
            subtitle_format: 'ass', 'srt' or 'vtt'.
            word_level: Produce one entry per word when True, one per segment
                otherwise.
        '''
        super().__init__(media_path, output_path, subtitle_format, word_level)
        self.model = model

    @staticmethod
    def _timed_entry(text, start, end):
        '''
        Builds one transcript entry with timestamps rounded to 2 decimals
        '''
        return {
            'text': text,
            'start': round(start, 2),
            'end': round(end, 2)
        }

    def generate_transcript(self):
        '''
        Generates the transcript for the media file

        Sets `self.text`, `self.language` and `self.segments`, and returns
        them as a dictionary with the keys 'language', 'text' and 'segments'.
        '''

        if self.word_level:

            # Generating Word Level Transcript
            segments, info = self.model.transcribe(self.media_path, word_timestamps=True)

            # Converting to Dictionary (a segment without words contributes nothing)
            all_segments = [
                self._timed_entry(word.word, word.start, word.end)
                for segment in segments
                for word in (segment.words or [])
            ]

        else:

            # Generating Segment Level Transcript
            segments, info = self.model.transcribe(self.media_path, beam_size=5)

            # Converting to Dictionary
            all_segments = [
                self._timed_entry(segment.text, segment.start, segment.end)
                for segment in segments
            ]

        # Setting Transcript Properties
        # Pieces may carry their own leading whitespace (assumption about the
        # model output -- confirm); stripping avoids doubled spaces in the text
        self.text = ' '.join(entry['text'].strip() for entry in all_segments)
        self.language = info.language
        self.segments = all_segments

        # Returning Transcript Properties as Dictionary
        transcript_dict = {
            'language': self.language,
            'text': self.text,
            'segments': self.segments
        }
        return transcript_dict

    def save_transcript(self, transcript=None, output_file=None):
        '''
        Writes the transcript text into a UTF-8 file and returns the file path

        Both arguments are optional so the method can also be called without
        arguments, as the abstract base declares it.

        Args:
            transcript: Dictionary shaped like the return value of
                `generate_transcript` (its 'text' entry is written), or a
                plain string. Defaults to `self.text`.
            output_file: Destination path. Defaults to
                `<output_path>/<filename>.txt`.
        '''
        # Picking the Text to write
        if transcript is None:
            text = self.text
        elif isinstance(transcript, str):
            text = transcript
        else:
            text = transcript['text']

        # Resolving the Destination
        file_path = output_file or os.path.join(self.output_path, f'{self.filename}.txt')

        # Creating the Target Directory if it does not exist yet
        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Writing to TXT file in UTF-8 format
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text)
        return file_path

class StableWhisper(Whisper):
    '''
    Transcription backend driven by a model whose `transcribe` result offers
    `to_dict`, `to_ass` and `to_srt_vtt` (presumably a stable-ts model --
    confirm against the caller).
    '''

    def __init__(self, model, media_path, output_path, subtitle_format='srt', word_level=True):
        '''
        Args:
            model: Loaded model object used for transcription.
            media_path: Path of the media file to transcribe.
            output_path: Output directory, resolved by the base class.
            subtitle_format: 'ass', 'srt' or 'vtt'.
            word_level: Produce one entry per word when True, one per segment
                otherwise.
        '''
        super().__init__(media_path, output_path, subtitle_format, word_level)
        self.model = model

    @staticmethod
    def _timed_entry(text, start, end):
        '''
        Builds one transcript entry with timestamps rounded to 2 decimals
        '''
        return {
            'text': text,
            'start': round(start, 2),
            'end': round(end, 2)
        }

    @staticmethod
    def _ensure_parent_dir(file_path):
        '''
        Creates the directory that will contain `file_path` if it is missing
        '''
        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

    def generate_transcript(self):
        '''
        Generates the transcript for the media file

        Sets `self.result`, `self.resultdict`, `self.text`, `self.language`
        and `self.segments`, and returns a dictionary with the keys
        'language', 'text' and 'segments'.
        '''

        # Generating the Transcript (word timestamps only when requested)
        self.result = self.model.transcribe(self.media_path, word_timestamps=self.word_level)

        # Converting to Dictionary
        self.resultdict = self.result.to_dict()

        # Formatting Dictionary
        raw_segments = self.resultdict['segments']
        if self.word_level:
            all_segments = [
                self._timed_entry(word['word'], word['start'], word['end'])
                for segment in raw_segments
                for word in segment['words']
            ]
        else:
            all_segments = [
                self._timed_entry(segment['text'], segment['start'], segment['end'])
                for segment in raw_segments
            ]

        # Setting Transcript Properties
        self.text = self.resultdict['text']
        self.language = self.resultdict['language']
        self.segments = all_segments

        # Returning Transcript Properties as Dictionary
        transcript_dict = {
            'language': self.language,
            'text': self.text,
            'segments': self.segments
        }
        return transcript_dict

    def save_transcript(self):
        '''
        Writes the transcript text into a UTF-8 TXT file and returns its path

        `generate_transcript` has to be called first so that `self.text` exists.
        '''
        file_path = os.path.join(self.output_path, f'{self.filename}.txt')

        # Creating the Output Directory on demand instead of failing on open()
        self._ensure_parent_dir(file_path)

        # Writing to TXT file in UTF-8 format
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(self.text)
        return file_path

    def save_subtitles(self):
        '''
        Writes the subtitles into file and returns the file path

        `generate_transcript` has to be called first so that `self.result`
        exists.

        Raises:
            ValueError: If `self.subtitle_format` is not 'ass', 'srt' or 'vtt'
                (compared case-insensitively).
        '''
        file_path = os.path.join(self.output_path, f'{self.filename}.{self.subtitle_format}')

        # Comparing in lowercase: an exact-case comparison silently skipped
        # the write for e.g. 'SRT' while still returning a path
        subtitle_format = self.subtitle_format.lower()
        if subtitle_format not in ('ass', 'srt', 'vtt'):
            raise ValueError(f'Unsupported subtitle format {self.subtitle_format!r}')

        # Creating the Output Directory on demand
        self._ensure_parent_dir(file_path)

        # Writing according to the Format
        if subtitle_format == 'ass':
            self.result.to_ass(file_path, segment_level=True, word_level=self.word_level)
        else:
            self.result.to_srt_vtt(file_path, segment_level=True, word_level=self.word_level)
        return file_path