RamAnanth1 commited on
Commit
b9354c2
1 Parent(s): 3b07320

Upload with huggingface_hub

Browse files
dataset/hf_dataset.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from datasets import load_dataset, Dataset
3
+ from datasets.data_files import EmptyDatasetError
4
+
5
class HFDataset(ABC):
    """
    Abstract base for a Hugging Face dataset used to store YouTube transcripts.

    Subclasses implement ``generate_dataset`` to populate ``self.dataset``.

    Attributes:
        name: Hub repository id; "" means a purely local, empty dataset.
        dataset: the underlying ``datasets`` object.
        exist: True when the repository exists on the Hub.
        is_empty: True when no data has been loaded yet.
    """

    def __init__(self, name) -> None:
        self.name = name
        if name != "":
            self._init_dataset()
        else:
            # No repo id given: start from a local, empty dataset.
            self.dataset = Dataset.from_dict({})
            self.exist = False
            self.is_empty = True

    @abstractmethod
    def generate_dataset(self, *args, **kwargs):
        """Populate ``self.dataset``; must be provided by subclasses."""
        # NOTE: the original signature omitted ``self``, which would raise
        # TypeError if ever invoked through an instance.

    def _init_dataset(self):
        """Load ``self.name`` from the Hub, falling back to an empty dataset."""
        try:
            self.dataset = load_dataset(self.name)
            self.exist = True
            self.is_empty = False
        except EmptyDatasetError:
            # Repository exists on the Hub but contains no data yet.
            self.dataset = Dataset.from_dict({})
            self.exist = True
            self.is_empty = True
        except FileNotFoundError:
            # Repository does not exist on the Hub.
            self.dataset = Dataset.from_dict({})
            self.exist = False
            self.is_empty = True

    def upload(self):
        """Push the current dataset to the Hub under ``self.name``."""
        self.dataset.push_to_hub(self.name)
dataset/transcript_dataset.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ import validators
4
+ import pandas as pd
5
+ from downloader import WhisperPP, YoutubeDownloader
6
+ from interpreter import WhisperInterpreter
7
+ from datasets import load_dataset, concatenate_datasets, Dataset
8
+ from dataset.hf_dataset import HFDataset
9
+
10
class TranscriptDataset(HFDataset):
    """Dataset of YouTube transcripts, built from a URL or from local files."""

    def __init__(self, name) -> None:
        super().__init__(name)

    def generate_dataset(self, input, download_path, overwrite, whisper_config):
        """Build the dataset.

        Args:
            input: a URL, or a path to a file/directory of transcripts.
            download_path: where downloaded media is stored (URL mode).
            overwrite: when False, already-processed videos are skipped.
            whisper_config: options forwarded to the Whisper pipeline.
        """
        if validators.url(input):
            self.from_url(input, download_path, overwrite, **whisper_config)
        else:
            self.from_files(input, overwrite, **whisper_config)

    def from_url(self, url: str, download_path: str = "tmp/", overwrite: bool = False, **whisper_config: dict) -> None:
        """Download videos from *url*, transcribe them, and merge the results."""
        if self.is_empty:
            baseDataset = self.dataset
        else:
            baseDataset = self.dataset["train"]
        # Batch size before an intermediate push_to_hub; honor a caller-supplied
        # value instead of silently overriding it with 5.
        whisper_config.setdefault("number_videos", 5)
        whisperPP = WhisperPP(baseDataset, **whisper_config)
        downloader = YoutubeDownloader(download_path)
        if not overwrite:
            # Record already-processed video ids so yt-dlp skips them.
            downloader.config["download_archive"] = os.path.join(download_path, "video_record.txt")
            self._fill_archive(downloader.config["download_archive"])
        downloader.download(url, whisperPP)
        self._concatenate_datasets(whisperPP.get_data())

    def from_files(self, input: str, overwrite: bool = False, **whisper_config) -> None:
        """Build the dataset from local audio files or exported JSON transcripts."""
        if whisper_config.get("mode", None) is not None:
            # Run Whisper directly over the given path.
            interpreter = WhisperInterpreter(whisper_config.pop("model_size"))
            process = getattr(interpreter, whisper_config.pop("mode"))
            result = process(input, **whisper_config)
            if isinstance(result, list):
                dataset = Dataset.from_list(result)
            else:
                dataset = Dataset.from_list([result])
        else:
            # Load previously exported JSON transcripts. Glob the directory the
            # caller passed, not a hard-coded "tmp" folder.
            fileName = os.path.join(input, "*.json") if os.path.isdir(input) else input
            dataset = load_dataset("json", data_files=glob.glob(fileName), split="train")

        self._concatenate_datasets(dataset)

    def _fill_archive(self, archive_file):
        """Write existing video ids in yt-dlp archive format ("youtube <id>")."""
        if not self.is_empty:
            with open(archive_file, "w") as f:
                for id in self.dataset["train"]["id"]:
                    f.write(f"youtube {id}\n")

    def _concatenate_datasets(self, dataset):
        """Merge *dataset* into the stored one, de-duplicating by video id."""
        if not self.is_empty:
            selectedIDs = list(set(dataset["id"]) - set(self.dataset["train"]["id"]))
            filteredDataset = dataset.filter(lambda element: element["id"] in selectedIDs)
            self.dataset["train"] = concatenate_datasets([self.dataset["train"], filteredDataset])
        else:
            self.dataset = dataset
downloader/downloader.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
class Downloader(ABC):
    """Abstract downloader that fetches videos from an online platform.

    Concrete subclasses choose the platform and output format; the target
    directory is supplied via ``download_path``.
    """

    @abstractmethod
    def __init__(self, download_path):
        # Remember where downloaded media should be written.
        self.download_path = download_path

    @abstractmethod
    def download(self):
        """Fetch the media; implemented by subclasses."""
downloader/whisper_post_processor.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from interpreter import WhisperInterpreter
2
+ from utils import VIDEO_INFO, json_dump
3
+ from yt_dlp.postprocessor import PostProcessor
4
+ from datasets import Dataset
5
+ import re
6
+
7
class WhisperPP(PostProcessor):
    """yt-dlp post-processor that transcribes each downloaded video with
    Whisper and accumulates results into a list or a ``datasets.Dataset``."""

    def __init__(self, data, **whisper_options):
        super().__init__()
        self._options = whisper_options
        interpreter = WhisperInterpreter(self._options.pop("model_size", "base"))
        self.data = data
        # "transcribe" or "translate", resolved to the interpreter method.
        self._process = getattr(interpreter, self._options.pop("mode", "transcribe"))
        # Default to not writing JSON files instead of raising KeyError when
        # the option is absent (the other options all have defaults).
        self._write = self._options.pop("write", False)
        # Push the Dataset to the Hub every N processed videos; 0 disables.
        self.videos_to_process = self._options.pop("number_videos", 0)
        self.repoId = self._get_name()

    def run(self, info):
        """Transcribe one downloaded video and append the result."""
        self.to_screen(f"Processing Video {info['id']}")
        result = {key: info[key] for key in VIDEO_INFO}
        result.update(self._process(info["filepath"], **self._options))
        self.to_screen(f"Processed Video {info['id']} and appended results.")
        self._update_data(result)
        if self._write:
            # rsplit keeps everything before the extension even when the
            # path contains additional dots (split('.')[0] truncated there).
            json_dump(result, f"{info['filepath'].rsplit('.', 1)[0]}.json")
        return [], info

    def _update_data(self, record):
        """Append *record*; push to the Hub every ``videos_to_process`` rows."""
        if isinstance(self.data, list):
            self.data.append(record)
        else:
            self.data = self.data.add_item(record)
            # num_rows only exists on Dataset objects, so the intermediate
            # push can only happen in this branch.
            if self.videos_to_process != 0 and self.data.num_rows >= self.videos_to_process:
                self.data.push_to_hub(self.repoId)

    def get_data(self):
        """Return the accumulated data (list or Dataset)."""
        return self.data

    def _get_name(self):
        """Recover the Hub repo id ("user/name") from download checksum URLs.

        Returns "" when the data is a plain list or has no checksum info.
        """
        # Guard the Dataset-only attribute access: ``data`` may be a list
        # (see _update_data), which has no ``info``.
        if isinstance(self.data, Dataset) and self.data.info.download_checksums is not None:
            regex = r"(?<=datasets\/)(.*?)(?=\/resolve)"
            repoId = re.compile(regex)
            url = list(self.data.info.download_checksums.keys())[0]
            return repoId.findall(url)[0]
        return ""
downloader/youtube_downloader.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yt_dlp
3
+ from downloader import Downloader
4
+ from yt_dlp.postprocessor import PostProcessor
5
+ from utils import YT_OPTIONS
6
+
7
class YoutubeDownloader(Downloader):
    """Downloader that fetches YouTube media via yt-dlp."""

    def __init__(self, download_path: str) -> None:
        super().__init__(download_path)
        # Copy the shared option template so per-instance changes (outtmpl,
        # download_archive, ...) do not mutate the module-level YT_OPTIONS.
        self._ydl_options = dict(YT_OPTIONS)
        self._ydl_options["outtmpl"] = os.path.join(download_path, "%(id)s.%(ext)s")

    def download(self, url: str, CustomPP: PostProcessor, when: str = "post_process") -> None:
        """Download *url* and run *CustomPP* at the given yt-dlp stage."""
        with yt_dlp.YoutubeDL(self._ydl_options) as ydl:
            ydl.add_post_processor(CustomPP, when=when)
            ydl.download(url)

    @property
    def config(self):
        """The live yt-dlp options dict (mutable in place)."""
        return self._ydl_options

    @config.setter
    def config(self, options: dict) -> None:
        # A property setter receives exactly one value; the original
        # (key, value) signature could never be invoked through the property
        # protocol. Accept a mapping and merge it into the current options.
        self._ydl_options.update(options)
interpreter/interpreter.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
class Interpreter(ABC):
    """Abstract audio interpreter.

    Implementations perform audio operations to transcribe or translate the
    content of a video.
    """

    @abstractmethod
    def transcribe(self):
        """Produce a transcript of the audio; implemented by subclasses."""

    @abstractmethod
    def translate(self):
        """Produce a translated transcript; implemented by subclasses."""
interpreter/whisper_interpreter.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ from typing import Any, Optional
3
+ import whisper, os
4
+ from interpreter import Interpreter
5
+ from utils import SEGMENTS_INFO, AUDIO_FILES, json_dump
6
+
7
class WhisperInterpreter(Interpreter):
    """Interpreter backed by OpenAI Whisper models."""

    def __init__(self, model_size: str) -> None:
        # e.g. "tiny", "base", "small" — loaded once and reused per call.
        self.model = whisper.load_model(model_size)

    def transcribe(self, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Transcribe a single file, or every audio file in a directory."""
        return self._execute_task("transcribe", file_path, **kwargs)

    def translate(self, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Translate a single file, or every audio file in a directory."""
        return self._execute_task("translate", file_path, **kwargs)

    def _execute_task(self, mode: str, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Run Whisper in *mode* over a file or over a directory.

        Returns a single result dict for a file, or a list of dicts (each
        carrying its "filename") when *file_path* is a directory.
        """
        options = dict(task=mode)
        options.update(kwargs)

        if os.path.isdir(file_path):
            result = []
            # Only process files whose extension is a known audio format.
            files = [x for x in glob.glob(os.path.join(file_path, "*"))
                     if os.path.splitext(x)[1] in AUDIO_FILES]
            for file in files:
                file_processed = dict(filename=file)
                file_processed.update(self._file_extraction(file, **options))
                result.append(file_processed)
        else:
            result = self._file_extraction(file_path, **options)

        return result

    def _formatter_result(self, raw: dict) -> dict:
        """Keep only the transcript text and the whitelisted segment fields."""
        output = dict()
        output["text"] = raw["text"]
        output["segments"] = [{key: segment[key] for key in SEGMENTS_INFO} for segment in raw["segments"]]
        return output

    def _file_extraction(self, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Transcribe one file; optionally dump the result next to it as JSON."""
        write = kwargs.pop("write", False)
        result = self._formatter_result(
            self.model.transcribe(file_path, **kwargs)
        )
        if write:
            # splitext is robust to dots elsewhere in the path, unlike
            # split('.')[0] which truncated at the first dot.
            json_dump(result, f"{os.path.splitext(file_path)[0]}.json")

        return result