Joshua Lochner committed on
Commit
1286fe5
1 Parent(s): c4f250e

Add support for mute action type and remove videos with full action type

Browse files
Files changed (2) hide show
  1. src/preprocess.py +10 -5
  2. src/shared.py +1 -0
src/preprocess.py CHANGED
@@ -8,7 +8,7 @@ import segment
8
  from tqdm import tqdm
9
  from dataclasses import dataclass, field
10
  from transformers import HfArgumentParser
11
- from shared import CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
12
  import csv
13
  import re
14
  import random
@@ -582,7 +582,7 @@ def main():
582
 
583
  if line['category'] not in allowed_categories:
584
  continue
585
- if line['actionType'] != 'skip':
586
  continue
587
 
588
  # Ignore hidden items
@@ -616,9 +616,16 @@ def main():
616
  'submission_time': float(line['timeSubmitted'])/1e3,
617
  'reputation': reputation,
618
  'category': line['category'],
619
- # 'action': line['actionType'],
620
  })
621
 
 
 
 
 
 
 
 
622
  # Remove duplicate sponsor segments by choosing best (most votes)
623
  if not preprocess_args.keep_duplicate_segments:
624
  logger.info('Remove duplicate segments')
@@ -647,8 +654,6 @@ def main():
647
  # Always include segments locked by VIPs, regardless of view count
648
  del db[key]
649
 
650
- # TODO remove videos that contain a full-video label?
651
-
652
  logger.info(f'Saved {len(db)} videos')
653
 
654
  with open(processed_db_path, 'w') as fp:
 
8
  from tqdm import tqdm
9
  from dataclasses import dataclass, field
10
  from transformers import HfArgumentParser
11
+ from shared import ACTION_OPTIONS, CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
12
  import csv
13
  import re
14
  import random
 
582
 
583
  if line['category'] not in allowed_categories:
584
  continue
585
+ if line['actionType'] not in ACTION_OPTIONS:
586
  continue
587
 
588
  # Ignore hidden items
 
616
  'submission_time': float(line['timeSubmitted'])/1e3,
617
  'reputation': reputation,
618
  'category': line['category'],
619
+ 'action': line['actionType'],
620
  })
621
 
622
+ # First, remove videos that contain a full-video label
623
+ # (may confuse model since disclaimers and such aren't labelled)
624
+ # Must do it here before removing duplicate segments
625
+ for key in list(db):
626
+ if any(x['action'] == 'full' for x in db[key]):
627
+ del db[key]
628
+
629
  # Remove duplicate sponsor segments by choosing best (most votes)
630
  if not preprocess_args.keep_duplicate_segments:
631
  logger.info('Remove duplicate segments')
 
654
  # Always include segments locked by VIPs, regardless of view count
655
  del db[key]
656
 
 
 
657
  logger.info(f'Saved {len(db)} videos')
658
 
659
  with open(processed_db_path, 'w') as fp:
src/shared.py CHANGED
@@ -8,6 +8,7 @@ from typing import Optional
8
  from dataclasses import dataclass, field
9
  from enum import Enum
10
 
 
11
 
12
  CATGEGORY_OPTIONS = {
13
  'SPONSOR': 'Sponsor',
 
8
  from dataclasses import dataclass, field
9
  from enum import Enum
10
 
11
+ ACTION_OPTIONS = ['skip', 'mute', 'full']
12
 
13
  CATGEGORY_OPTIONS = {
14
  'SPONSOR': 'Sponsor',