Spaces:
Running
Running
Joshua Lochner
committed on
Commit
•
1286fe5
1
Parent(s):
c4f250e
Add support for mute action type and remove videos with full action type
Browse files
- src/preprocess.py +10 -5
- src/shared.py +1 -0
src/preprocess.py
CHANGED
@@ -8,7 +8,7 @@ import segment
|
|
8 |
from tqdm import tqdm
|
9 |
from dataclasses import dataclass, field
|
10 |
from transformers import HfArgumentParser
|
11 |
-
from shared import CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
|
12 |
import csv
|
13 |
import re
|
14 |
import random
|
@@ -582,7 +582,7 @@ def main():
|
|
582 |
|
583 |
if line['category'] not in allowed_categories:
|
584 |
continue
|
585 |
-
if line['actionType']
|
586 |
continue
|
587 |
|
588 |
# Ignore hidden items
|
@@ -616,9 +616,16 @@ def main():
|
|
616 |
'submission_time': float(line['timeSubmitted'])/1e3,
|
617 |
'reputation': reputation,
|
618 |
'category': line['category'],
|
619 |
-
|
620 |
})
|
621 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
622 |
# Remove duplicate sponsor segments by choosing best (most votes)
|
623 |
if not preprocess_args.keep_duplicate_segments:
|
624 |
logger.info('Remove duplicate segments')
|
@@ -647,8 +654,6 @@ def main():
|
|
647 |
# Always include segments locked by VIPs, regardless of view count
|
648 |
del db[key]
|
649 |
|
650 |
-
# TODO remove videos that contain a full-video label?
|
651 |
-
|
652 |
logger.info(f'Saved {len(db)} videos')
|
653 |
|
654 |
with open(processed_db_path, 'w') as fp:
|
|
|
8 |
from tqdm import tqdm
|
9 |
from dataclasses import dataclass, field
|
10 |
from transformers import HfArgumentParser
|
11 |
+
from shared import ACTION_OPTIONS, CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
|
12 |
import csv
|
13 |
import re
|
14 |
import random
|
|
|
582 |
|
583 |
if line['category'] not in allowed_categories:
|
584 |
continue
|
585 |
+
if line['actionType'] not in ACTION_OPTIONS:
|
586 |
continue
|
587 |
|
588 |
# Ignore hidden items
|
|
|
616 |
'submission_time': float(line['timeSubmitted'])/1e3,
|
617 |
'reputation': reputation,
|
618 |
'category': line['category'],
|
619 |
+
'action': line['actionType'],
|
620 |
})
|
621 |
|
622 |
+
# First, remove videos that contain a full-video label
|
623 |
+
# (may confuse model since disclaimers and such aren't labelled)
|
624 |
+
# Must do it here before removing duplicate segments
|
625 |
+
for key in list(db):
|
626 |
+
if any(x['action'] == 'full' for x in db[key]):
|
627 |
+
del db[key]
|
628 |
+
|
629 |
# Remove duplicate sponsor segments by choosing best (most votes)
|
630 |
if not preprocess_args.keep_duplicate_segments:
|
631 |
logger.info('Remove duplicate segments')
|
|
|
654 |
# Always include segments locked by VIPs, regardless of view count
|
655 |
del db[key]
|
656 |
|
|
|
|
|
657 |
logger.info(f'Saved {len(db)} videos')
|
658 |
|
659 |
with open(processed_db_path, 'w') as fp:
|
src/shared.py
CHANGED
@@ -8,6 +8,7 @@ from typing import Optional
|
|
8 |
from dataclasses import dataclass, field
|
9 |
from enum import Enum
|
10 |
|
|
|
11 |
|
12 |
CATGEGORY_OPTIONS = {
|
13 |
'SPONSOR': 'Sponsor',
|
|
|
8 |
from dataclasses import dataclass, field
|
9 |
from enum import Enum
|
10 |
|
11 |
+
ACTION_OPTIONS = ['skip', 'mute', 'full']
|
12 |
|
13 |
CATGEGORY_OPTIONS = {
|
14 |
'SPONSOR': 'Sponsor',
|