Spaces:
Runtime error
Runtime error
RamAnanth1
committed on
Commit
•
b9354c2
1
Parent(s):
3b07320
Upload with huggingface_hub
Browse files- dataset/hf_dataset.py +39 -0
- dataset/transcript_dataset.py +63 -0
- downloader/downloader.py +14 -0
- downloader/whisper_post_processor.py +46 -0
- downloader/youtube_downloader.py +26 -0
- interpreter/interpreter.py +15 -0
- interpreter/whisper_interpreter.py +48 -0
dataset/hf_dataset.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
from datasets import load_dataset, Dataset
|
3 |
+
from datasets.data_files import EmptyDatasetError
|
4 |
+
|
5 |
+
class HFDataset(ABC):
    """
    Base class for a Hugging Face dataset used to store Youtube transcripts.

    Loads the dataset *name* from the Hub when it exists; otherwise starts
    from an empty in-memory ``Dataset``.  Two flags track state:
    ``exist``   — the repo exists on the Hub;
    ``is_empty``— no rows are stored yet.
    """

    def __init__(self, name) -> None:
        # name: Hub repo id ("user/dataset"); "" means purely in-memory.
        self.name = name
        if name != "":
            self._init_dataset()
        else:
            self.dataset = Dataset.from_dict({})
            self.exist = False
            self.is_empty = True

    @abstractmethod
    def generate_dataset(self):
        # Fix: abstract method was declared without `self`.
        """Populate ``self.dataset``; implemented by concrete subclasses."""
        pass

    def _init_dataset(self):
        """Load ``self.name`` from the Hub and set the state flags."""
        try:
            self.dataset = load_dataset(self.name)
            self.exist = True
            self.is_empty = False
        except EmptyDatasetError:
            # Repo exists on the Hub but holds no data files yet.
            self.dataset = Dataset.from_dict({})
            self.exist = True
            self.is_empty = True
        except FileNotFoundError:
            # Repo does not exist on the Hub.
            self.dataset = Dataset.from_dict({})
            self.exist = False
            self.is_empty = True

    def upload(self):
        """Push the current dataset to the Hub under ``self.name``."""
        self.dataset.push_to_hub(self.name)
|
dataset/transcript_dataset.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import os
|
3 |
+
import validators
|
4 |
+
import pandas as pd
|
5 |
+
from downloader import WhisperPP, YoutubeDownloader
|
6 |
+
from interpreter import WhisperInterpreter
|
7 |
+
from datasets import load_dataset, concatenate_datasets, Dataset
|
8 |
+
from dataset.hf_dataset import HFDataset
|
9 |
+
|
10 |
+
class TranscriptDataset(HFDataset):
    """Dataset of Whisper transcripts built from Youtube videos or local files."""

    def __init__(self, name) -> None:
        super().__init__(name)

    def generate_dataset(self, input, download_path, overwrite, whisper_config):
        """Build the dataset from a URL or from local audio/JSON files.

        input: a Youtube URL, or a path to a file or directory.
        """
        if validators.url(input):
            self.from_url(input, download_path, overwrite, **whisper_config)
        else:
            self.from_files(input, overwrite, **whisper_config)

    def from_url(self, url: str, download_path: str = "tmp/", overwrite: bool = False, **whisper_config: dict) -> None:
        """Download videos from *url*, transcribe them, and merge the results."""
        if self.is_empty:
            emptyDataset = self.dataset
        else:
            emptyDataset = self.dataset["train"]
        # NOTE(review): hard-coded batch size — the post-processor pushes to
        # the Hub every 5 processed videos; consider making this configurable.
        whisper_config["number_videos"] = 5
        whisperPP = WhisperPP(emptyDataset, **whisper_config)
        downloader = YoutubeDownloader(download_path)
        if not overwrite:
            # Record already-stored video ids so yt-dlp skips re-downloading them.
            downloader.config["download_archive"] = os.path.join(download_path, "video_record.txt")
            self._fill_archive(downloader.config["download_archive"])
        downloader.download(url, whisperPP)
        self._concatenate_datasets(whisperPP.get_data())

    def from_files(self, input: str, overwrite: bool = False, **whisper_config):
        """Transcribe local audio files, or load already-written JSON results."""
        if whisper_config.get("mode") is not None:
            interpreter = WhisperInterpreter(whisper_config.pop("model_size"))
            process = getattr(interpreter, whisper_config.pop("mode"))
            result = process(input, **whisper_config)
            # A directory yields a list of records, a single file yields one dict.
            dataset = Dataset.from_list(result if isinstance(result, list) else [result])
        else:
            # NOTE(review): directories are globbed as "tmp/*.json" regardless
            # of *input* — looks like it should be
            # os.path.join(input, "*.json"); confirm with callers.
            fileName = "tmp/*.json" if os.path.isdir(input) else input
            dataset = load_dataset("json", data_files=glob.glob(fileName), split="train")

        self._concatenate_datasets(dataset)

    def _fill_archive(self, archive_file):
        """Write the stored video ids in yt-dlp download-archive format."""
        if not self.is_empty:
            with open(archive_file, "w") as f:
                for id in self.dataset["train"]["id"]:
                    f.write(f"youtube {id}\n")

    def _concatenate_datasets(self, dataset):
        """Append only the rows of *dataset* whose ids are not already stored."""
        if not self.is_empty:
            selectedIDs = list(set(dataset["id"]) - set(self.dataset["train"]["id"]))
            filteredDataset = dataset.filter(lambda element: element["id"] in selectedIDs)
            self.dataset["train"] = concatenate_datasets([self.dataset["train"], filteredDataset])
        else:
            self.dataset = dataset
|
downloader/downloader.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
class Downloader(ABC):
    """Abstract base for downloading videos from an online platform.

    Concrete subclasses pick the platform and output format; the base only
    records where downloaded media should be stored.
    """

    @abstractmethod
    def __init__(self, download_path):
        # Destination directory for downloaded media.
        self.download_path = download_path

    @abstractmethod
    def download(self):
        """Fetch the media; must be provided by subclasses."""
|
downloader/whisper_post_processor.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from interpreter import WhisperInterpreter
|
2 |
+
from utils import VIDEO_INFO, json_dump
|
3 |
+
from yt_dlp.postprocessor import PostProcessor
|
4 |
+
from datasets import Dataset
|
5 |
+
import re
|
6 |
+
|
7 |
+
class WhisperPP(PostProcessor):
    """yt-dlp post-processor that transcribes each downloaded video with Whisper.

    Results are appended to *data* (a plain list or a ``datasets.Dataset``);
    when *data* is a Dataset and ``number_videos`` is non-zero, the rows are
    pushed to the Hub every time the row count reaches that threshold.
    """

    def __init__(self, data, **whisper_options):
        super().__init__()
        self._options = whisper_options
        interpreter = WhisperInterpreter(self._options.pop("model_size", "base"))
        self.data = data
        # Bound method: interpreter.transcribe or interpreter.translate.
        self._process = getattr(interpreter, self._options.pop("mode", "transcribe"))
        # Fix: default to False instead of raising KeyError when "write"
        # is absent (every other option here has a default).
        self._write = self._options.pop("write", False)
        self.videos_to_process = self._options.pop("number_videos", 0)
        self.repoId = self._get_name()

    def run(self, info):
        """Transcribe one downloaded file; invoked by yt-dlp per video."""
        self.to_screen(f"Processing Video {info['id']}")
        result = {key: info[key] for key in VIDEO_INFO}
        result.update(self._process(info["filepath"], **self._options))
        self.to_screen(f"Processed Video {info['id']} and appended results.")
        self._update_data(result)
        if self._write:
            json_dump(result, f"{info['filepath'].split('.')[0]}.json")
        # No files to delete; info dict returned unchanged.
        return [], info

    def _update_data(self, record):
        """Append *record* to the backing store; push to the Hub in batches."""
        if isinstance(self.data, list):
            self.data.append(record)
        else:
            self.data = self.data.add_item(record)
            if self.videos_to_process != 0 and self.data.num_rows >= self.videos_to_process:
                self.data.push_to_hub(self.repoId)

    def get_data(self):
        """Return the accumulated transcripts."""
        return self.data

    def _get_name(self):
        """Recover the Hub repo id from the dataset's download checksums.

        Returns "" when no checksums are recorded.
        """
        # NOTE(review): assumes self.data is a Dataset here — a plain list
        # has no .info attribute; confirm list-backed usage never reaches this.
        if self.data.info.download_checksums is not None:
            # Extract "<user>/<dataset>" from a Hub resolve URL.
            regex = r"(?<=datasets\/)(.*?)(?=\/resolve)"
            repoId = re.compile(regex)
            url = list(self.data.info.download_checksums.keys())[0]
            return repoId.findall(url)[0]
        return ""
|
downloader/youtube_downloader.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import yt_dlp
|
3 |
+
from downloader import Downloader
|
4 |
+
from yt_dlp.postprocessor import PostProcessor
|
5 |
+
from utils import YT_OPTIONS
|
6 |
+
|
7 |
+
class YoutubeDownloader(Downloader):
    """Downloads Youtube videos with yt-dlp and runs a custom post-processor."""

    def __init__(self, download_path: str) -> None:
        super().__init__(download_path)
        # Fix: copy the shared defaults — the original aliased the
        # module-level YT_OPTIONS dict, so per-instance edits (outtmpl,
        # download_archive, ...) leaked into every other instance.
        self._ydl_options = dict(YT_OPTIONS)
        self._ydl_options["outtmpl"] = os.path.join(download_path, "%(id)s.%(ext)s")

    def download(self, url: str, CustomPP: PostProcessor, when: str = "post_process") -> None:
        """Download *url*, attaching *CustomPP* at the *when* stage."""
        with yt_dlp.YoutubeDL(self._ydl_options) as ydl:
            ydl.add_post_processor(CustomPP, when=when)
            ydl.download(url)

    @property
    def config(self):
        """The mutable yt-dlp options dict for this downloader."""
        return self._ydl_options

    @config.setter
    def config(self, options: dict) -> None:
        # Fix: a property setter receives exactly one value; the original
        # (self, key, value) signature raised TypeError on any assignment.
        # Merge the given mapping into the current options instead.
        self._ydl_options.update(options)
|
interpreter/interpreter.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
class Interpreter(ABC):
    """Abstract interface for audio operations on video content.

    Implementations expose two tasks — transcription and translation —
    one method per task.
    """

    @abstractmethod
    def transcribe(self):
        """Produce a transcript of the audio."""

    @abstractmethod
    def translate(self):
        """Produce a translated transcript of the audio."""
|
interpreter/whisper_interpreter.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
from typing import Any, Optional
|
3 |
+
import whisper, os
|
4 |
+
from interpreter import Interpreter
|
5 |
+
from utils import SEGMENTS_INFO, AUDIO_FILES, json_dump
|
6 |
+
|
7 |
+
class WhisperInterpreter(Interpreter):
    """Interpreter backed by an OpenAI Whisper model."""

    def __init__(self, model_size: str) -> None:
        # Loads the Whisper checkpoint named by model_size ("base", "small", ...).
        self.model = whisper.load_model(model_size)

    def transcribe(self, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Transcribe *file_path* (a file, or a directory of audio files)."""
        return self._execute_task("transcribe", file_path, **kwargs)

    def translate(self, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Translate *file_path* (a file, or a directory of audio files)."""
        return self._execute_task("translate", file_path, **kwargs)

    def _execute_task(self, mode: str, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Run Whisper in *mode* on one file, or on every audio file in a directory.

        Returns one result dict for a single file; for a directory, a list of
        dicts each carrying a "filename" key.
        """
        options = dict(task=mode)
        options.update(kwargs)

        if os.path.isdir(file_path):
            result = []
            # Only files whose extension is a known audio format.
            files = [x for x in glob.glob(os.path.join(file_path, "*")) if os.path.splitext(x)[1] in AUDIO_FILES]
            for file in files:
                file_processed = dict(filename=file)
                file_processed.update(self._file_extraction(file, **options))
                result.append(file_processed)
        else:
            result = self._file_extraction(file_path, **options)

        return result

    def _formatter_result(self, raw: dict) -> dict:
        """Keep only the transcript text and the selected per-segment fields."""
        output = dict()
        output["text"] = raw["text"]
        output["segments"] = [{key: segment[key] for key in SEGMENTS_INFO} for segment in raw["segments"]]
        return output

    def _file_extraction(self, file_path: str, **kwargs: Optional[Any]) -> dict:
        """Run the model on one file; optionally dump the result as JSON beside it."""
        write = kwargs.pop("write", False)
        result = self._formatter_result(
            self.model.transcribe(file_path, **kwargs)
        )
        if write:
            # Fix: splitext is safe with dots in directory names and
            # multi-dot filenames, unlike file_path.split('.')[0].
            json_dump(result, f"{os.path.splitext(file_path)[0]}.json")

        return result
|