dwb2023 committed on
Commit
416dca9
·
verified ·
1 Parent(s): 257845f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -49
app.py CHANGED
@@ -3,12 +3,11 @@ import json
3
  import time
4
  from datetime import datetime
5
  from pathlib import Path
6
- from uuid import uuid4
7
  import tempfile
 
8
 
9
  import gradio as gr
10
  import yt_dlp as youtube_dl
11
- from huggingface_hub import CommitScheduler
12
  from transformers import (
13
  BitsAndBytesConfig,
14
  AutoModelForSpeechSeq2Seq,
@@ -17,18 +16,19 @@ from transformers import (
17
  pipeline,
18
  )
19
  from transformers.pipelines.audio_utils import ffmpeg_read
20
-
21
  import torch # If you're using PyTorch
22
- import spaces
23
-
24
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
25
 
 
26
  MODEL_NAME = "openai/whisper-large-v3"
27
  BATCH_SIZE = 8
28
  YT_LENGTH_LIMIT_S = 4800 # 1 hour 20 minutes
 
29
 
30
- # Quantization
 
31
 
 
32
  bnb_config = BitsAndBytesConfig(
33
  load_in_4bit=True,
34
  bnb_4bit_use_double_quant=True,
@@ -46,28 +46,40 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
46
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
47
  feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
48
 
49
- # bnb_config = bnb.QuantizationConfig(bits=4)
50
  pipe = pipeline(
51
  task="automatic-speech-recognition",
52
  model=model,
53
  tokenizer=tokenizer,
54
  feature_extractor=feature_extractor,
55
  chunk_length_s=30,
56
- # device=device,
57
  )
58
 
59
- # Define paths and create directory if not exists
60
- JSON_DATASET_DIR = Path("json_dataset")
61
- JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
62
- JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"
63
-
64
- # Initialize CommitScheduler for saving data to Hugging Face Dataset
65
- scheduler = CommitScheduler(
66
- repo_id="yt-transcript-dataset",
67
- repo_type="dataset",
68
- folder_path=JSON_DATASET_DIR,
69
- path_in_repo="data",
70
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def download_yt_audio(yt_url, filename):
73
  info_loader = youtube_dl.YoutubeDL()
@@ -75,6 +87,7 @@ def download_yt_audio(yt_url, filename):
75
  info = info_loader.extract_info(yt_url, download=False)
76
  except youtube_dl.utils.DownloadError as err:
77
  raise gr.Error(str(err))
 
78
  file_length = info["duration"]
79
  if file_length > YT_LENGTH_LIMIT_S:
80
  yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
@@ -82,42 +95,80 @@ def download_yt_audio(yt_url, filename):
82
  raise gr.Error(
83
  f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
84
  )
 
85
  ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
86
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
87
  ydl.download([yt_url])
 
88
 
89
-
90
@spaces.GPU(duration=120)
def yt_transcribe(yt_url, task):
    """Download the audio for *yt_url*, transcribe it, and record the result.

    Args:
        yt_url: URL of the video to transcribe.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # The download only needs to live long enough to be read into memory.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, audio_path)
        with open(audio_path, "rb") as audio_file:
            raw_audio = audio_file.read()

    rate = pipe.feature_extractor.sampling_rate
    samples = ffmpeg_read(raw_audio, rate)
    result = pipe(
        {"array": samples, "sampling_rate": rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = result["text"]

    save_transcription(yt_url, transcript)
    return transcript
107
-
108
-
109
def save_transcription(yt_url, transcription):
    """Append one JSON record for this transcription to the local dataset file.

    The record holds the URL, the transcription text, and the current local
    time in ISO format. It is written as a single line while holding
    ``scheduler.lock``.
    """
    with scheduler.lock, JSON_DATASET_PATH.open("a") as out:
        record = {
            "url": yt_url,
            "transcription": transcription,
            "datetime": datetime.now().isoformat(),
        }
        out.write(json.dumps(record) + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  demo = gr.Blocks()
123
 
 
3
  import time
4
  from datetime import datetime
5
  from pathlib import Path
 
6
  import tempfile
7
+ import pandas as pd
8
 
9
  import gradio as gr
10
  import yt_dlp as youtube_dl
 
11
  from transformers import (
12
  BitsAndBytesConfig,
13
  AutoModelForSpeechSeq2Seq,
 
16
  pipeline,
17
  )
18
  from transformers.pipelines.audio_utils import ffmpeg_read
 
19
  import torch # If you're using PyTorch
20
+ from datasets import load_dataset, Dataset, DatasetDict
 
 
21
 
22
+ # Constants
23
  MODEL_NAME = "openai/whisper-large-v3"
24
  BATCH_SIZE = 8
25
  YT_LENGTH_LIMIT_S = 4800 # 1 hour 20 minutes
26
+ DATASET_NAME = "dwb2023/yt-transcripts-v3"
27
 
28
+ # Environment setup
29
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
30
 
31
+ # Model setup
32
  bnb_config = BitsAndBytesConfig(
33
  load_in_4bit=True,
34
  bnb_4bit_use_double_quant=True,
 
46
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
47
  feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
48
 
 
49
  pipe = pipeline(
50
  task="automatic-speech-recognition",
51
  model=model,
52
  tokenizer=tokenizer,
53
  feature_extractor=feature_extractor,
54
  chunk_length_s=30,
 
55
  )
56
 
57
def reset_and_update_dataset(new_data):
    """Build a one-row dataset from *new_data* and push it to the hub.

    An empty, typed frame fixes the column set; ``new_data`` (a mapping of
    column name to value) is appended as its only row. The result is pushed
    to ``DATASET_NAME`` as the ``train`` split.
    """
    # Column name -> dtype of the empty starting frame.
    column_dtypes = {
        "url": "str",
        "transcription": "str",
        "title": "str",
        "duration": "int",
        "uploader": "str",
        "upload_date": "datetime64[ns]",
        "description": "str",
        "datetime": "datetime64[ns]",
    }
    empty_frame = pd.DataFrame(
        {name: pd.Series(dtype=dtype) for name, dtype in column_dtypes.items()}
    )

    single_row = pd.DataFrame([new_data])
    combined = pd.concat([empty_frame, single_row], ignore_index=True)

    hub_payload = DatasetDict({"train": Dataset.from_pandas(combined)})
    hub_payload.push_to_hub(DATASET_NAME)
    print("Dataset reset and updated successfully!")
83
 
84
  def download_yt_audio(yt_url, filename):
85
  info_loader = youtube_dl.YoutubeDL()
 
87
  info = info_loader.extract_info(yt_url, download=False)
88
  except youtube_dl.utils.DownloadError as err:
89
  raise gr.Error(str(err))
90
+
91
  file_length = info["duration"]
92
  if file_length > YT_LENGTH_LIMIT_S:
93
  yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
 
95
  raise gr.Error(
96
  f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
97
  )
98
+
99
  ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
100
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
101
  ydl.download([yt_url])
102
+ return info
103
 
 
 
104
def yt_transcribe(yt_url, task):
    """Return the transcription for *yt_url*, reusing a stored one if present.

    The ``train`` split of ``DATASET_NAME`` is scanned for a row whose
    ``url`` equals ``yt_url``; if one exists its ``transcription`` is returned
    without downloading anything. Otherwise the audio is downloaded,
    transcribed with the module-level ``pipe``, saved together with the video
    metadata via ``save_transcription``, and returned.

    Args:
        yt_url: URL of the video to transcribe.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        The transcription text.
    """
    # Reuse an earlier transcription of the same URL when one is stored.
    dataset = load_dataset(DATASET_NAME, split="train")
    for row in dataset:
        if row["url"] == yt_url:
            return row["transcription"]

    # The download only needs to live long enough to be read into memory.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        info = download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            video_data = f.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    inputs = {
        "array": ffmpeg_read(video_data, sampling_rate),
        "sampling_rate": sampling_rate,
    }
    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )["text"]

    # dict.get with a default never raises KeyError, so the fallbacks are
    # supplied directly instead of through an unreachable except branch.
    save_transcription(
        yt_url,
        text,
        info.get("title", "N/A"),
        info.get("duration", 0),
        info.get("uploader", "N/A"),
        info.get("upload_date", "N/A"),
        info.get("description", "N/A"),
    )
    return text
144
+
145
def save_transcription(yt_url, transcription, title, duration, uploader, upload_date, description):
    """Append one transcription record to the hub dataset and push it back.

    The record combines the arguments with the current local time in ISO
    format. The ``train`` split of ``DATASET_NAME`` is loaded, the record is
    appended as a new row, and the result is pushed to the same dataset.
    """
    record = {
        "url": yt_url,
        "transcription": transcription,
        "title": title,
        "duration": duration,
        "uploader": uploader,
        "upload_date": upload_date,
        "description": description,
        "datetime": datetime.now().isoformat(),
    }

    existing_rows = load_dataset(DATASET_NAME, split="train").to_pandas()
    combined = pd.concat([existing_rows, pd.DataFrame([record])], ignore_index=True)

    hub_payload = DatasetDict({"train": Dataset.from_pandas(combined)})
    hub_payload.push_to_hub(DATASET_NAME)
172
 
173
  demo = gr.Blocks()
174