Spaces:

Xenova
/

sponsorblock-ml

Running

App Files Files Community

Joshua Lochner committed on Feb 1, 2022

Commit

8b71088

1 Parent(s): a9123fa

Abstract inference code

Browse files

Files changed (2) hide show

src/evaluate.py +17 -92
src/predict.py +146 -42

src/evaluate.py CHANGED Viewed

@@ -1,13 +1,10 @@
-import itertools
-import base64
-import re
-import requests
 from model import get_model_tokenizer
 from utils import jaccard
 from transformers import HfArgumentParser
 from preprocess import DatasetArguments, get_words
 from shared import GeneralArguments
-from predict import ClassifierArguments, predict, TrainingOutputArguments
 from segment import extract_segment, word_start, word_end, SegmentationArguments, add_labels_to_words
 import pandas as pd
 from dataclasses import dataclass, field
@@ -21,18 +18,8 @@ from urllib.parse import quote
 @dataclass
-class EvaluationArguments(TrainingOutputArguments):
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-    max_videos: Optional[int] = field(
-        default=None,
-        metadata={
-            'help': 'The number of videos to test on'
-        }
-    )
-    start_index: int = field(default=None, metadata={
-        'help': 'Video to start the evaluation at.'})
     output_file: Optional[str] = field(
         default='metrics.csv',
         metadata={
@@ -40,13 +27,6 @@ class EvaluationArguments(TrainingOutputArguments):
         }
     )
-    channel_id: Optional[str] = field(
-        default=None,
-        metadata={
-            'help': 'Used to evaluate a channel'
-        }
-    )
 def attach_predictions_to_sponsor_segments(predictions, sponsor_segments):
     """Attach sponsor segments to closest prediction"""
@@ -144,56 +124,6 @@ def calculate_metrics(labelled_words, predictions):
     return metrics
-# Public innertube key (b64 encoded so that it is not incorrectly flagged)
-INNERTUBE_KEY = base64.b64decode(
-    b'QUl6YVN5QU9fRkoyU2xxVThRNFNURUhMR0NpbHdfWTlfMTFxY1c4').decode()
-YT_CONTEXT = {
-    'client': {
-        'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36,gzip(gfe)',
-        'clientName': 'WEB',
-        'clientVersion': '2.20211221.00.00',
-    }
-}
-_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'
-def get_all_channel_vids(channel_id):
-    continuation = None
-    while True:
-        if continuation is None:
-            params = {'list': channel_id.replace('UC', 'UU', 1)}
-            response = requests.get(
-                'https://www.youtube.com/playlist', params=params)
-            items = json.loads(re.search(_YT_INITIAL_DATA_RE, response.text).group(1))['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content'][
-                'sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents']
-        else:
-            params = {'key': INNERTUBE_KEY}
-            data = {
-                'context': YT_CONTEXT,
-                'continuation': continuation
-            }
-            response = requests.post(
-                'https://www.youtube.com/youtubei/v1/browse', params=params, json=data)
-            items = response.json()[
-                'onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
-        new_token = None
-        for vid in items:
-            info = vid.get('playlistVideoRenderer')
-            if info:
-                yield info['videoId']
-                continue
-            info = vid.get('continuationItemRenderer')
-            if info:
-                new_token = info['continuationEndpoint']['continuationCommand']['token']
-        if new_token is None:
-            break
-        continuation = new_token
 def main():
     hf_parser = HfArgumentParser((
         EvaluationArguments,
@@ -205,30 +135,25 @@ def main():
     evaluation_args, dataset_args, segmentation_args, classifier_args, _ = hf_parser.parse_args_into_dataclasses()
-    model, tokenizer = get_model_tokenizer(evaluation_args.model_path, evaluation_args.cache_dir)
-    # # TODO find better way of evaluating videos not trained on
-    # dataset = load_dataset('json', data_files=os.path.join(
-    #     dataset_args.data_dir, dataset_args.validation_file))['train']
-    # video_ids = [row['video_id'] for row in dataset]
     # Load labelled data:
     final_path = os.path.join(
-        dataset_args.data_dir, dataset_args.processed_file)
     with open(final_path) as fp:
         final_data = json.load(fp)
-    if evaluation_args.channel_id is not None:
-        start = evaluation_args.start_index or 0
-        end = None if evaluation_args.max_videos is None else start + \
-            evaluation_args.max_videos
-        video_ids = list(itertools.islice(get_all_channel_vids(
-            evaluation_args.channel_id), start, end))
-        print('Found', len(video_ids), 'for channel', evaluation_args.channel_id)
-    else:
         video_ids = list(final_data.keys())
         random.shuffle(video_ids)
@@ -255,7 +180,7 @@ def main():
                 sponsor_segments = final_data.get(video_id)
                 if not sponsor_segments:
-                    # TODO remove - parse using whole database
                     continue
                 words = get_words(video_id)

 from model import get_model_tokenizer
 from utils import jaccard
 from transformers import HfArgumentParser
 from preprocess import DatasetArguments, get_words
 from shared import GeneralArguments
+from predict import ClassifierArguments, predict, InferenceArguments
 from segment import extract_segment, word_start, word_end, SegmentationArguments, add_labels_to_words
 import pandas as pd
 from dataclasses import dataclass, field
 @dataclass
+class EvaluationArguments(InferenceArguments):
+    """Arguments pertaining to how evaluation will occur."""
     output_file: Optional[str] = field(
         default='metrics.csv',
         metadata={
         }
     )
 def attach_predictions_to_sponsor_segments(predictions, sponsor_segments):
     """Attach sponsor segments to closest prediction"""
     return metrics
 def main():
     hf_parser = HfArgumentParser((
         EvaluationArguments,
     evaluation_args, dataset_args, segmentation_args, classifier_args, _ = hf_parser.parse_args_into_dataclasses()
     # Load labelled data:
     final_path = os.path.join(
+        dataset_args.data_dir, dataset_args.processed_database)
+    if not os.path.exists(final_path):
+        print('ERROR: Processed database not found.',
+              f'Run `python src/preprocess.py --update_database --do_process_database` to generate "{final_path}".')
+        return
+    model, tokenizer = get_model_tokenizer(
+        evaluation_args.model_path, evaluation_args.cache_dir)
     with open(final_path) as fp:
         final_data = json.load(fp)
+    if evaluation_args.video_ids:  # Use specified
+        video_ids = evaluation_args.video_ids
+    else:  # Use items found in preprocessed database
         video_ids = list(final_data.keys())
         random.shuffle(video_ids)
                 sponsor_segments = final_data.get(video_id)
                 if not sponsor_segments:
+                    print('No labels found for', video_id)
                     continue
                 words = get_words(video_id)

src/predict.py CHANGED Viewed

@@ -1,3 +1,14 @@
 from utils import re_findall
 from shared import CustomTokens, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, OutputArguments, device, seconds_to_time
 from typing import Optional
@@ -11,17 +22,62 @@ from segment import (
     SegmentationArguments
 )
 import preprocess
-from errors import TranscriptError, ModelLoadError, ClassifierLoadError
 from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
-from transformers import HfArgumentParser
-from transformers.trainer_utils import get_last_checkpoint
-from dataclasses import dataclass, field
-import logging
-import os
 @dataclass
-class TrainingOutputArguments:
     model_path: str = field(
         default='Xenova/sponsorblock-small',
@@ -34,28 +90,70 @@ class TrainingOutputArguments:
     output_dir: Optional[str] = OutputArguments.__dataclass_fields__[
         'output_dir']
-    def __post_init__(self):
-        if self.model_path is not None:
-            return
-        if os.path.exists(self.output_dir):
-            last_checkpoint = get_last_checkpoint(self.output_dir)
-            if last_checkpoint is not None:
-                self.model_path = last_checkpoint
-                return
-        raise ModelLoadError(
-            'Unable to find model, explicitly set `--model_path`')
 @dataclass
-class PredictArguments(TrainingOutputArguments):
     video_id: str = field(
         default=None,
         metadata={
-            'help': 'Video to predict sponsorship segments for'}
     )
 _SEGMENT_START = START_SEGMENT_TEMPLATE.format(r'(?P<category>\w+)')
 _SEGMENT_END = END_SEGMENT_TEMPLATE.format(r'\w+')
@@ -297,31 +395,37 @@ def main():
     ))
     predict_args, segmentation_args, classifier_args = hf_parser.parse_args_into_dataclasses()
-    if predict_args.video_id is None:
-        print('No video ID supplied. Use `--video_id`.')
         return
-    model, tokenizer = get_model_tokenizer(predict_args.model_path, predict_args.cache_dir)
-    predict_args.video_id = predict_args.video_id.strip()
-    predictions = predict(predict_args.video_id, model, tokenizer,
-                          segmentation_args, classifier_args=classifier_args)
-    video_url = f'https://www.youtube.com/watch?v={predict_args.video_id}'
-    if not predictions:
-        print('No predictions found for', video_url)
-        return
-    print(len(predictions), 'predictions found for', video_url)
-    for index, prediction in enumerate(predictions, start=1):
-        print(f'Prediction #{index}:')
-        print('Text: "',
-              ' '.join([w['text'] for w in prediction['words']]), '"', sep='')
-        print('Time:', seconds_to_time(
-            prediction['start']), '\u2192', seconds_to_time(prediction['end']))
-        print('Category:', prediction.get('category'))
-        if 'probability' in prediction:
-            print('Probability:', prediction['probability'])
         print()

import base64
import itertools
import json
import logging
import os
import re
from dataclasses import dataclass, field
from typing import List, Optional

import requests
from transformers import HfArgumentParser
from transformers.trainer_utils import get_last_checkpoint

from shared import CustomTokens, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, OutputArguments, device, seconds_to_time
from utils import re_findall
     SegmentationArguments
 )
 import preprocess
+from errors import PredictionException, TranscriptError, ModelLoadError, ClassifierLoadError
 from model import ModelArguments, get_classifier_vectorizer, get_model_tokenizer
# Public innertube key. It is stored base64-encoded so that it is not
# incorrectly flagged, and decoded once at import time.
INNERTUBE_KEY = base64.b64decode(
    b'QUl6YVN5QU9fRkoyU2xxVThRNFNURUhMR0NpbHdfWTlfMTFxY1c4'
).decode('utf-8')

# Client description sent as the `context` field of continuation requests.
YT_CONTEXT = dict(
    client=dict(
        userAgent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36,gzip(gfe)',
        clientName='WEB',
        clientVersion='2.20211221.00.00',
    ),
)

# Captures the JSON object assigned to `ytInitialData` in a page's HTML.
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'
def get_all_channel_vids(channel_id, timeout=30):
    """Lazily yield the video IDs listed for a channel.

    The first page is read from the ``ytInitialData`` JSON embedded in the
    playlist web page; every later page is fetched from the
    ``youtubei/v1/browse`` endpoint using the continuation token found on the
    previous page. Iteration ends when a page carries no continuation token.

    Args:
        channel_id: Channel identifier. Its first ``UC`` is replaced with
            ``UU`` to build the playlist ID that is requested (presumably the
            channel's uploads playlist - confirm against YouTube's scheme).
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout `requests` may wait indefinitely.

    Yields:
        The ``videoId`` of each ``playlistVideoRenderer`` item, in page order.

    Raises:
        requests.HTTPError: If a request returns an error status.
        PredictionException: If the playlist page contains no recognisable
            ``ytInitialData`` blob.
    """
    continuation = None
    while True:
        if continuation is None:
            # First page: scrape the JSON blob embedded in the HTML.
            params = {'list': channel_id.replace('UC', 'UU', 1)}
            response = requests.get(
                'https://www.youtube.com/playlist', params=params, timeout=timeout)
            response.raise_for_status()

            match = re.search(_YT_INITIAL_DATA_RE, response.text)
            if match is None:
                # Previously surfaced as an opaque AttributeError on `.group`.
                raise PredictionException(
                    f'Unable to find playlist data for channel {channel_id}')

            items = json.loads(match.group(1))['contents']['twoColumnBrowseResultsRenderer']['tabs'][0]['tabRenderer']['content'][
                'sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents']
        else:
            # Later pages: ask the browse endpoint for the next batch.
            params = {'key': INNERTUBE_KEY}
            data = {
                'context': YT_CONTEXT,
                'continuation': continuation
            }
            response = requests.post(
                'https://www.youtube.com/youtubei/v1/browse', params=params, json=data, timeout=timeout)
            response.raise_for_status()
            items = response.json()[
                'onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']

        new_token = None
        for vid in items:
            info = vid.get('playlistVideoRenderer')
            if info:
                yield info['videoId']
                continue

            # A continuation item carries the token for the next page.
            info = vid.get('continuationItemRenderer')
            if info:
                new_token = info['continuationEndpoint']['continuationCommand']['token']

        if new_token is None:
            break
        continuation = new_token
 @dataclass
+class InferenceArguments:
     model_path: str = field(
         default='Xenova/sponsorblock-small',
     output_dir: Optional[str] = OutputArguments.__dataclass_fields__[
         'output_dir']
+    max_videos: Optional[int] = field(
+        default=None,
+        metadata={
+            'help': 'The number of videos to test on'
+        }
+    )
+    start_index: int = field(default=None, metadata={
+        'help': 'Video to start the evaluation at.'})
+    channel_id: Optional[str] = field(
+        default=None,
+        metadata={
+            'help': 'Used to evaluate a channel'
+        }
+    )
+    video_ids: str = field(
+        default_factory=lambda: [],
+        metadata={
+            'nargs': '+'
+        }
+    )
+    def __post_init__(self):
+        # Try to load model from latest checkpoint
+        if self.model_path is None:
+            if os.path.exists(self.output_dir):
+                last_checkpoint = get_last_checkpoint(self.output_dir)
+                if last_checkpoint is not None:
+                    self.model_path = last_checkpoint
+                else:
+                    raise ModelLoadError(
+                        'Unable to load model from checkpoint, explicitly set `--model_path`')
+            else:
+                raise ModelLoadError(
+                    f'Unable to find model in {self.output_dir}, explicitly set `--model_path`')
+        if any(len(video_id) != 11 for video_id in self.video_ids):
+            raise PredictionException('Invalid video IDs (length not 11)')
+        if self.channel_id is not None:
+            start = self.start_index or 0
+            end = None if self.max_videos is None else start + self.max_videos
+            channel_video_ids = list(itertools.islice(get_all_channel_vids(
+                self.channel_id), start, end))
+            print('Found', len(channel_video_ids),
+                  'for channel', self.channel_id)
+            self.video_ids += channel_video_ids
 @dataclass
class PredictArguments(InferenceArguments):
    """Inference arguments plus a single-video convenience option."""

    # Defaults to None (no single video given), hence Optional.
    video_id: Optional[str] = field(
        default=None,
        metadata={
            'help': 'Video to predict segments for'}
    )

    def __post_init__(self):
        """Add `video_id` to `video_ids`, then run the parent's checks."""
        # Appended before delegating so that the single ID goes through the
        # same validation as the rest of `video_ids`.
        if self.video_id is not None:
            self.video_ids.append(self.video_id)
        super().__post_init__()
 _SEGMENT_START = START_SEGMENT_TEMPLATE.format(r'(?P<category>\w+)')
 _SEGMENT_END = END_SEGMENT_TEMPLATE.format(r'\w+')
     ))
     predict_args, segmentation_args, classifier_args = hf_parser.parse_args_into_dataclasses()
+    if not predict_args.video_ids:
+        print('No video IDs supplied. Use `--video_id`, `--video_ids`, or `--channel_id`.')
         return
+    model, tokenizer = get_model_tokenizer(
+        predict_args.model_path, predict_args.cache_dir)
+    for video_id in predict_args.video_ids:
+        video_id = video_id.strip()
+        try:
+            predictions = predict(video_id, model, tokenizer,
+                                  segmentation_args, classifier_args=classifier_args)
+        except TranscriptError:
+            print('No transcript available for', video_id, end='\n\n')
+            continue
+        video_url = f'https://www.youtube.com/watch?v={video_id}'
+        if not predictions:
+            print('No predictions found for', video_url, end='\n\n')
+            continue
+        print(len(predictions), 'predictions found for', video_url)
+        for index, prediction in enumerate(predictions, start=1):
+            print(f'Prediction #{index}:')
+            print('Text: "',
+                  ' '.join([w['text'] for w in prediction['words']]), '"', sep='')
+            print('Time:', seconds_to_time(
+                prediction['start']), '\u2192', seconds_to_time(prediction['end']))
+            print('Category:', prediction.get('category'))
+            if 'probability' in prediction:
+                print('Probability:', prediction['probability'])
+            print()
         print()