Joshua Lochner committed
Commit bb58e90 • 1 Parent(s): f9281a4

Move `seconds_to_time` to shared

Browse files
- src/predict.py +18 -24
- src/shared.py +14 -1
src/predict.py CHANGED

@@ -5,7 +5,8 @@ from typing import Optional
 from segment import (
     generate_segments,
     extract_segment,
-
+    MIN_SAFETY_TOKENS,
+    SAFETY_TOKENS_PERCENTAGE,
     CustomTokens,
     word_start,
     word_end,
@@ -13,7 +14,7 @@ from segment import (
 )
 import preprocess
 from errors import TranscriptError
-from model import get_classifier_vectorizer
+from model import get_classifier_vectorizer, get_model_tokenizer
 from transformers import (
     AutoModelForSeq2SeqLM,
     AutoTokenizer,
@@ -26,25 +27,15 @@ import logging
 
 import re
 
-
-def seconds_to_time(seconds, remove_leading_zeroes=False):
-    fractional = round(seconds % 1, 3)
-    fractional = '' if fractional == 0 else str(fractional)[1:]
-    h, remainder = divmod(abs(int(seconds)), 3600)
-    m, s = divmod(remainder, 60)
-    hms = f'{h:02}:{m:02}:{s:02}'
-    if remove_leading_zeroes:
-        hms = re.sub(r'^0(?:0:0?)?', '', hms)
-    return f"{'-' if seconds < 0 else ''}{hms}{fractional}"
-
-
+from shared import seconds_to_time
 @dataclass
 class TrainingOutputArguments:
 
     model_path: str = field(
         default=None,
         metadata={
-            'help': 'Path to pretrained model used for prediction'
+            'help': 'Path to pretrained model used for prediction'
+        }
     )
 
     output_dir: Optional[str] = OutputArguments.__dataclass_fields__[
@@ -106,7 +97,8 @@ class ClassifierArguments:
         default=0.5, metadata={'help': 'Remove all predictions whose classification probability is below this threshold.'})
 
 
-def filter_and_add_probabilities(predictions, classifier_args): # classifier, vectorizer,
+# classifier, vectorizer,
+def filter_and_add_probabilities(predictions, classifier_args):
     """Use classifier to filter predictions"""
     if not predictions:
         return predictions
@@ -135,7 +127,7 @@ def filter_and_add_probabilities(predictions, classifier_args): # classifier, v
             continue  # Ignore
 
         if (prediction['category'] not in predicted_probabilities) \
-
+                or (classifier_category is not None and classifier_probability > 0.5):  # TODO make param
            # Unknown category or we are confident enough to overrule,
            # so change category to what was predicted by classifier
            prediction['category'] = classifier_category
@@ -175,7 +167,8 @@ def predict(video_id, model, tokenizer, segmentation_args, words=None, classifie
 
     # TODO add back
     if classifier_args is not None:
-        predictions = filter_and_add_probabilities(
+        predictions = filter_and_add_probabilities(
+            predictions, classifier_args)
 
     return predictions
 
@@ -205,8 +198,12 @@ def predict_sponsor_text(text, model, tokenizer):
     input_ids = tokenizer(
         f'{CustomTokens.EXTRACT_SEGMENTS_PREFIX.value} {text}', return_tensors='pt', truncation=True).input_ids.to(device())
 
-
-
+    max_out_len = round(min(
+        max(
+            len(input_ids[0])/SAFETY_TOKENS_PERCENTAGE,
+            len(input_ids[0]) + MIN_SAFETY_TOKENS
+        ),
+        model.model_dim))
     outputs = model.generate(input_ids, max_length=max_out_len)
 
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -300,10 +297,7 @@ def main():
        print('No video ID supplied. Use `--video_id`.')
        return
 
-    model =
-    model.to(device())
-
-    tokenizer = AutoTokenizer.from_pretrained(predict_args.model_path)
+    model, tokenizer = get_model_tokenizer(predict_args.model_path)
 
    predict_args.video_id = predict_args.video_id.strip()
    predictions = predict(predict_args.video_id, model, tokenizer,
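The added `max_out_len` bound in `predict_sponsor_text` lets the generated segment text run somewhat longer than the input (the input length scaled by `SAFETY_TOKENS_PERCENTAGE`, or padded by `MIN_SAFETY_TOKENS`, whichever allows more) while never exceeding the model width. A minimal standalone sketch of that bound; the constant values below are illustrative assumptions, since the real ones come from `src/segment.py` and `model.model_dim`:

# Sketch of the output-length bound added in predict_sponsor_text().
# NOTE: the constants below are assumed placeholder values, not the
# values defined in src/segment.py or the model's actual width.
MIN_SAFETY_TOKENS = 10          # assumed
SAFETY_TOKENS_PERCENTAGE = 0.8  # assumed
MODEL_DIM = 512                 # assumed stand-in for model.model_dim

def max_output_length(num_input_tokens):
    # Allow some headroom over the input length, but never exceed the model width.
    return round(min(
        max(
            num_input_tokens / SAFETY_TOKENS_PERCENTAGE,
            num_input_tokens + MIN_SAFETY_TOKENS
        ),
        MODEL_DIM))

print(max_output_length(32))   # 42  -> the "+ MIN_SAFETY_TOKENS" floor wins
print(max_output_length(100))  # 125 -> the percentage term wins
print(max_output_length(600))  # 512 -> capped at the model width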
src/shared.py CHANGED

@@ -1,3 +1,4 @@
+import re
 import gc
 from time import time_ns
 import random
@@ -11,6 +12,7 @@ from enum import Enum
 START_SEGMENT_TEMPLATE = 'START_{}_TOKEN'
 END_SEGMENT_TEMPLATE = 'END_{}_TOKEN'
 
+
 class CustomTokens(Enum):
     EXTRACT_SEGMENTS_PREFIX = 'EXTRACT_SEGMENTS: '
 
@@ -29,7 +31,7 @@ class CustomTokens(Enum):
     LAUGHTER = '[Laughter]'
 
     PROFANITY = 'PROFANITY_TOKEN'
-
+
     # Segment tokens
     NO_SEGMENT = 'NO_SEGMENT_TOKEN'
 
@@ -103,6 +105,17 @@ def device():
     return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
+def seconds_to_time(seconds, remove_leading_zeroes=False):
+    fractional = round(seconds % 1, 3)
+    fractional = '' if fractional == 0 else str(fractional)[1:]
+    h, remainder = divmod(abs(int(seconds)), 3600)
+    m, s = divmod(remainder, 60)
+    hms = f'{h:02}:{m:02}:{s:02}'
+    if remove_leading_zeroes:
+        hms = re.sub(r'^0(?:0:0?)?', '', hms)
+    return f"{'-' if seconds < 0 else ''}{hms}{fractional}"
+
+
 def reset():
     torch.clear_autocast_cache()
     torch.cuda.empty_cache()
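For reference, the relocated `seconds_to_time` helper formats a (possibly fractional or negative) number of seconds as HH:MM:SS with an optional fractional part; `remove_leading_zeroes=True` strips empty leading hour/minute digits. A quick check of the behaviour implied by the implementation above:

from shared import seconds_to_time  # after this commit the helper lives in src/shared.py

print(seconds_to_time(3661.5))                              # 01:01:01.5
print(seconds_to_time(75.25, remove_leading_zeroes=True))   # 1:15.25
print(seconds_to_time(-5))                                  # -00:00:05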