knowledge-scribe

Sleeping

App Files Files Community

dwb2023 committed on Jun 10, 2024

Commit

416dca9

verified ·

1 Parent(s): 257845f

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -49

app.py CHANGED Viewed

@@ -3,12 +3,11 @@ import json
 import time
 from datetime import datetime
 from pathlib import Path
-from uuid import uuid4
 import tempfile
 import gradio as gr
 import yt_dlp as youtube_dl
-from huggingface_hub import CommitScheduler
 from transformers import (
     BitsAndBytesConfig,
     AutoModelForSpeechSeq2Seq,
@@ -17,18 +16,19 @@ from transformers import (
     pipeline,
 )
 from transformers.pipelines.audio_utils import ffmpeg_read
 import torch  # If you're using PyTorch
-import spaces
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
-# Quantization
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
@@ -46,28 +46,40 @@ model = AutoModelForSpeechSeq2Seq.from_pretrained(
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
-# bnb_config = bnb.QuantizationConfig(bits=4)
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=tokenizer,
     feature_extractor=feature_extractor,
     chunk_length_s=30,
-    # device=device,
 )
-# Define paths and create directory if not exists
-JSON_DATASET_DIR = Path("json_dataset")
-JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
-JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"
-# Initialize CommitScheduler for saving data to Hugging Face Dataset
-scheduler = CommitScheduler(
-    repo_id="yt-transcript-dataset",
-    repo_type="dataset",
-    folder_path=JSON_DATASET_DIR,
-    path_in_repo="data",
-)
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
@@ -75,6 +87,7 @@ def download_yt_audio(yt_url, filename):
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
     file_length = info["duration"]
     if file_length > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
@@ -82,42 +95,80 @@ def download_yt_audio(yt_url, filename):
         raise gr.Error(
             f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
         )
     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         ydl.download([yt_url])
-@spaces.GPU(duration=120)
 def yt_transcribe(yt_url, task):
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
-        download_yt_audio(yt_url, filepath)
         with open(filepath, "rb") as f:
-            inputs = f.read()
-    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
-    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    text = pipe(
-        inputs,
-        batch_size=BATCH_SIZE,
-        generate_kwargs={"task": task},
-        return_timestamps=True,
-    )["text"]
-    save_transcription(yt_url, text)
-    return text
-def save_transcription(yt_url, transcription):
-    with scheduler.lock:
-        with JSON_DATASET_PATH.open("a") as f:
-            json.dump(
-                {
-                    "url": yt_url,
-                    "transcription": transcription,
-                    "datetime": datetime.now().isoformat(),
-                },
-                f,
-            )
-            f.write("\n")
 demo = gr.Blocks()

 import time
 from datetime import datetime
 from pathlib import Path
 import tempfile
+import pandas as pd
 import gradio as gr
 import yt_dlp as youtube_dl
 from transformers import (
     BitsAndBytesConfig,
     AutoModelForSpeechSeq2Seq,
     pipeline,
 )
 from transformers.pipelines.audio_utils import ffmpeg_read
 import torch  # If you're using PyTorch
+from datasets import load_dataset, Dataset, DatasetDict
+# Constants
 MODEL_NAME = "openai/whisper-large-v3"
 BATCH_SIZE = 8
 YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes
+DATASET_NAME = "dwb2023/yt-transcripts-v3"
+# Environment setup
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+# Model setup
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=model,
     tokenizer=tokenizer,
     feature_extractor=feature_extractor,
     chunk_length_s=30,
 )
+def reset_and_update_dataset(new_data):
+    # Define the schema for an empty DataFrame
+    schema = {
+        "url": pd.Series(dtype="str"),
+        "transcription": pd.Series(dtype="str"),
+        "title": pd.Series(dtype="str"),
+        "duration": pd.Series(dtype="int"),
+        "uploader": pd.Series(dtype="str"),
+        "upload_date": pd.Series(dtype="datetime64[ns]"),
+        "description": pd.Series(dtype="str"),
+        "datetime": pd.Series(dtype="datetime64[ns]")
+    }
+    # Create an empty DataFrame with the defined schema
+    df = pd.DataFrame(schema)
+    # Append the new data
+    df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
+    # Convert back to dataset
+    updated_dataset = Dataset.from_pandas(df)
+    # Push the updated dataset to the hub
+    dataset_dict = DatasetDict({"train": updated_dataset})
+    dataset_dict.push_to_hub(DATASET_NAME)
+    print("Dataset reset and updated successfully!")
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
     file_length = info["duration"]
     if file_length > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
         raise gr.Error(
             f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
         )
     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         ydl.download([yt_url])
+    return info
 def yt_transcribe(yt_url, task):
+    # Load the dataset
+    dataset = load_dataset(DATASET_NAME, split="train")
+    # Check if the transcription already exists
+    for row in dataset:
+        if row['url'] == yt_url:
+            return row['transcription']  # Return the existing transcription
+    # If transcription does not exist, perform the transcription
     with tempfile.TemporaryDirectory() as tmpdirname:
         filepath = os.path.join(tmpdirname, "video.mp4")
+        info = download_yt_audio(yt_url, filepath)
         with open(filepath, "rb") as f:
+            video_data = f.read()
+        inputs = ffmpeg_read(video_data, pipe.feature_extractor.sampling_rate)
+        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+        text = pipe(
+            inputs,
+            batch_size=BATCH_SIZE,
+            generate_kwargs={"task": task},
+            return_timestamps=True,
+        )["text"]
+        # Extract additional fields
+        try:
+            title = info.get("title", "N/A")
+            duration = info.get("duration", 0)
+            uploader = info.get("uploader", "N/A")
+            upload_date = info.get("upload_date", "N/A")
+            description = info.get("description", "N/A")
+        except KeyError:
+            title = "N/A"
+            duration = 0
+            uploader = "N/A"
+            upload_date = "N/A"
+            description = "N/A"
+        save_transcription(yt_url, text, title, duration, uploader, upload_date, description)
+        return text
+def save_transcription(yt_url, transcription, title, duration, uploader, upload_date, description):
+    data = {
+        "url": yt_url,
+        "transcription": transcription,
+        "title": title,
+        "duration": duration,
+        "uploader": uploader,
+        "upload_date": upload_date,
+        "description": description,
+        "datetime": datetime.now().isoformat()
+    }
+    # Load the existing dataset
+    dataset = load_dataset(DATASET_NAME, split="train")
+    # Convert to pandas dataframe
+    df = dataset.to_pandas()
+    # Append the new data
+    df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
+    # Convert back to dataset
+    updated_dataset = Dataset.from_pandas(df)
+    # Push the updated dataset to the hub
+    dataset_dict = DatasetDict({"train": updated_dataset})
+    dataset_dict.push_to_hub(DATASET_NAME)
 demo = gr.Blocks()