Joshua Lochner committed on
Commit b27b0d5
1 Parent(s): bce5ce9

Improve preprocessing and segmentation

Files changed (2)
  1. src/preprocess.py +140 -111
  2. src/segment.py +44 -56
src/preprocess.py CHANGED
@@ -1,3 +1,4 @@
+from shared import CATGEGORY_OPTIONS
 from utils import jaccard
 from shared import START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE
 from functools import lru_cache
@@ -113,19 +114,26 @@ def list_transcripts(video_id):
     return YouTubeTranscriptApi.list_transcripts(video_id)
 
 
+WORDS_TO_REMOVE = [
+    CustomTokens.MUSIC.value,
+    CustomTokens.APPLAUSE.value,
+    CustomTokens.LAUGHTER.value
+]
+
+
 @lru_cache(maxsize=16)
-def get_words(video_id, process=True, fallback=True, transcript_type='auto'):
+def get_words(video_id, process=True, transcript_type='auto', fallback='manual', filter_words_to_remove=True):
     """Get parsed video transcript with caching system
     returns None if not processed yet and process is False
     """
-    get_manual_if_fail = fallback and transcript_type == 'auto'
     transcript_path = os.path.join(  # TODO use relative path to this
         'transcripts', transcript_type, f'{video_id}.json')
-    words = []
+
+    words = None
     try:
         if os.path.exists(transcript_path):  # Load from file
             with open(transcript_path) as fp:
-                words = json.load(fp)
+                words = json.load(fp)  # May be empty
 
         elif process:
             transcript_list = list_transcripts(video_id)
@@ -138,52 +146,55 @@ def get_words(video_id, process=True, fallback=True, transcript_type='auto'):
     except (TooManyRequests, YouTubeRequestFailed, requests.exceptions.ConnectionError) as e:  # Can retry
         print(e)
         time.sleep(10)  # Timeout
-        return get_words(video_id, process, fallback, transcript_type)
+        return get_words(video_id, process, transcript_type, fallback)
 
     except CouldNotRetrieveTranscript:
         pass
+
     except json.decoder.JSONDecodeError:
         print('JSONDecodeError for', video_id)
         os.remove(transcript_path)  # Remove file and try again
-        return get_words(video_id, process, fallback, transcript_type)
+        return get_words(video_id, process, transcript_type, fallback)
+
+    # Tried to process it, but it was empty...
+    if process and not os.path.exists(transcript_path):
+        with open(transcript_path, 'w') as fp:
+            json.dump(words, fp)
 
-    # Even save empty
-    with open(transcript_path, 'w') as fp:
-        json.dump(words, fp)
+    if not words and fallback is not None:
+        return get_words(video_id, process, transcript_type=fallback, fallback=None)
 
-    if not words and get_manual_if_fail:
-        return get_words(video_id, process, fallback, 'manual')
+    if words and filter_words_to_remove:
+        words = list(filter(lambda x: x['text'] not in WORDS_TO_REMOVE, words))
 
     return words
 
 
 # TODO make min_sponsor_segment_length param
+# TODO rename to extract_segments
 def extract_sponsors(words, min_sponsor_segment_length=3):
-    if not words:
+    if not words or len(words) < min_sponsor_segment_length:
         return []
 
     paragraphs = []
     current = []
     prev_category = None
 
-    i = 0
-    while i <= len(words):
-        unimportant = i == len(words) or words[i]['category'] is None
+    for i in range(len(words) + 1):
+        unimportant = i == len(words) or words[i].get('category') is None
 
-        if unimportant or words[i]['category'] != prev_category:
+        if unimportant or words[i].get('category') != prev_category:
             if current:  # Save the current batch
                 paragraphs.append({
                     'words': current,
-                    'category': current[-1]['category'],
+                    'category': current[-1].get('category'),
                 })
 
                 current = []
 
         if not unimportant:  # Some useful information to save
             current.append(words[i])
-            prev_category = words[i]['category']
-
-        i += 1
+            prev_category = words[i].get('category')
 
     # Remove all too short:
     return list(filter(lambda x: len(x['words']) >= min_sponsor_segment_length, paragraphs))
@@ -277,24 +288,20 @@ class PreprocessArguments:
     min_views: int = field(
         default=5, metadata={'help': 'Minimum number of views a segment must have to be considered. 0 = show all'})
 
+    # min_reputation: int = field(
+    #     default=0, metadata={'help': 'Minimum reputation a user must have for the segment to be included'})
+
     min_date: str = field(
-        # release of v2.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/2.0)
-        default='08/06/2020',
-        # default='20/08/2021', # release of v3.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/3.0)
+        # default='08/06/2020', # release of v2.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/2.0)
+        # release of v3.0 (https://github.com/ajayyy/SponsorBlock/releases/tag/3.0)
+        default='20/08/2021',
         # default='01/10/2020', # No more autovote
-        metadata={'help': 'Only use submissions from after this date'})
+        metadata={'help': 'Only use submissions from after this date (inclusive)'})
 
-    # TODO move?
-    categories: str = field(
-        default_factory=lambda: ['sponsor', 'selfpromo', 'interaction'],
-        metadata={
-            'nargs': '+',
-            'choices': ['intro', 'sponsor', 'interaction']
-            # 'outro', 'selfpromo', 'preview',
-            # 'poi_highlight', 'filler', 'music_offtopic',
-            # 'moreCategories'
-        }
-    )
+    max_date: str = field(
+        # default='01/01/9999', # Include all
+        default='27/01/2022',
+        metadata={'help': 'Only use videos that have some segment from before this date (exclusive). This allows for videos to have segments be corrected, but ignores new videos (posted after this date) to enter the pool.'})
 
     do_transcribe: bool = field(
         default=False, metadata={'help': 'Get transcripts for videos'}
@@ -302,9 +309,9 @@ class PreprocessArguments:
     num_jobs: int = field(
         default=4, metadata={'help': 'Number of transcripts to download in parallel'})
 
-    # append: bool = field(
-    #     default=False, metadata={'help': 'Append to training, testing and validation data, if present.'}
-    # )
+    overwrite: bool = field(
+        default=False, metadata={'help': 'Overwrite training, testing and validation data, if present.'}
+    )
 
     do_generate: bool = field(
         default=False, metadata={'help': 'Generate labelled data.'}
@@ -381,22 +388,6 @@ def download_file(url, filename):
     return total_bytes == os.path.getsize(filename)
 
 
-@dataclass
-class ProcessedArguments:
-    processed_dir: Optional[str] = field(
-        default='processed',
-        metadata={
-            'help': 'Processed data directory'
-        },
-    )
-    processed_file: Optional[str] = field(
-        default='final.json',
-        metadata={
-            'help': 'Processed data file'
-        },
-    )
-
-
 def load_datasets(dataset_args):
     print('Reading datasets')
     data_files = {}
@@ -411,7 +402,7 @@ def load_datasets(dataset_args):
         data_files['test'] = os.path.join(
             dataset_args.data_dir, dataset_args.test_file)
 
-    return load_dataset('json', data_files=data_files)
+    return load_dataset('json', data_files=data_files, cache_dir=dataset_args.dataset_cache_dir)
 
 
 @dataclass
@@ -422,6 +413,18 @@ class DatasetArguments:
             'help': 'The directory which stores train, test and/or validation data.'
         },
     )
+    processed_file: Optional[str] = field(
+        default='segments.json',
+        metadata={
+            'help': 'Processed data file'
+        },
+    )
+    processed_database: Optional[str] = field(
+        default='processed_database.json',
+        metadata={
+            'help': 'Processed database file'
+        },
+    )
 
     train_file: Optional[str] = field(
         default='train.json', metadata={'help': 'The input training data file (a jsonlines file).'}
@@ -444,7 +447,12 @@ class DatasetArguments:
             'help': 'The excess segments left after the split'
        },
     )
-
+    dataset_cache_dir: Optional[str] = field(
+        default=None,
+        metadata={
+            'help': 'Where to store the cached datasets'
+        },
+    )
     overwrite_cache: bool = field(
         default=False, metadata={'help': 'Overwrite the cached training and evaluation sets'}
     )
@@ -472,13 +480,12 @@ def main():
     # Generate final.json from sponsorTimes.csv
     hf_parser = HfArgumentParser((
         PreprocessArguments,
-        ProcessedArguments,
         DatasetArguments,
         segment.SegmentationArguments,
        ModelArguments,
        GeneralArguments
     ))
-    preprocess_args, processed_args, dataset_args, segmentation_args, model_args, _ = hf_parser.parse_args_into_dataclasses()
+    preprocess_args, dataset_args, segmentation_args, model_args, _ = hf_parser.parse_args_into_dataclasses()
 
     raw_dataset_path = os.path.join(
         preprocess_args.raw_data_dir, preprocess_args.raw_data_file)
@@ -491,28 +498,28 @@ def main():
             break
         print('Failed, trying next')
 
-    @lru_cache
-    def read_db():  # TODO save as file
-        print('Parsing raw database')
-        db = {}
+    processed_db_path = os.path.join(
+        dataset_args.data_dir, dataset_args.processed_database)
 
-        latest_time = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
+    def read_db():
+        if not preprocess_args.overwrite and os.path.exists(processed_db_path):
+            with open(processed_db_path) as fp:
+                return json.load(fp)
+        print('Processing raw database')
+        db = {}
 
+        allowed_categories = list(map(str.lower, CATGEGORY_OPTIONS))
        with open(raw_dataset_path, newline='') as csvfile:
             reader = csv.DictReader(csvfile)
 
             for line in reader:
-                submission_time = float(line['timeSubmitted'])/1e3
-
-                if datetime.fromtimestamp(submission_time) < latest_time:
-                    continue
 
                 if line['service'] != 'YouTube':
                     continue
                 if len(line['videoID']) != 11:
                     continue  # Invalid youtube video ID
 
-                if line['category'] not in preprocess_args.categories:
+                if line['category'] not in allowed_categories:
                     continue
                 if line['actionType'] != 'skip':
                     continue
@@ -522,17 +529,18 @@ def main():
                     continue
 
                 # Skip those that aren't highly voted
-                line['votes'] = int(line['votes'])
-                if line['votes'] < preprocess_args.min_votes:
+                votes = int(line['votes'])
+                if votes < preprocess_args.min_votes:
                     continue
 
                 locked = line['locked'] == '1'
 
-                # Skip segments with low views (i.e., not really reviewed)
-                # Always include segments locked by VIPs, regardless of view count
-                line['views'] = int(line['views'])
-                if not locked and line['views'] < preprocess_args.min_views:
-                    continue
+                reputation = float(line['reputation'])
+                # if reputation < preprocess_args.min_reputation:
+                #     continue  # TODO add back?
+                # Problems like mGVn1wCkBrE
+
+                # TODO ignore if over max_duration
 
                 if line['videoID'] not in db:
                     db[line['videoID']] = []
@@ -541,15 +549,37 @@ def main():
                     'uuid': line['UUID'],
                     'start': float(line['startTime']),
                     'end': float(line['endTime']),
-                    'votes': line['votes'],
+                    'votes': votes,
                     'locked': locked,
-                    'views': line['views'],
-                    'submission_time': submission_time,
-                    'reputation': line['reputation'],
+                    'views': int(line['views']),
+                    'submission_time': float(line['timeSubmitted'])/1e3,
+                    'reputation': reputation,
                     'category': line['category'],
-                    'action': line['actionType'],
+                    # 'action': line['actionType'],
                 })
 
+        # We now remove whole videos from the list
+        # Helps with obtaining "fully-labelled" videos
+        min_date = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
+        max_date = datetime.strptime(preprocess_args.max_date, '%d/%m/%Y')
+        for key in list(db):
+
+            if any(datetime.fromtimestamp(x['submission_time']) < min_date for x in db[key]):
+                # Remove videos where any of its segments were submitted before min_date
+                # (essentially removes videos uploaded before min_date)
+                # Prevents issues where some segments of a video are excluded
+                del db[key]
+            elif all(datetime.fromtimestamp(x['submission_time']) > max_date for x in db[key]):
+                # Remove videos where all of its segments were submitted after max_date
+                # (essentially removes videos uploaded after max_date)
+                # Allows for segments to be corrected for past videos
+                del db[key]
+            elif any(not x['locked'] and x['views'] < preprocess_args.min_views for x in db[key]):
+                # Remove videos where any of its non-locked segments do not have enough views
+                # (essentially skips videos that have not been fully watched/reviewed)
+                # Always include segments locked by VIPs, regardless of view count
+                del db[key]
+
         num_segments = 0
 
         # Remove duplicate sponsor segments by choosing best (most votes)
@@ -559,20 +589,21 @@ def main():
             num_segments += len(db[key])
         print('Saved', len(db), 'videos and', num_segments, 'segments')
 
+        with open(processed_db_path, 'w') as fp:
+            json.dump(db, fp)
+
         return db
 
     # 'videoID', 'startTime', 'endTime', 'votes', 'locked', 'incorrectVotes', 'UUID',
     # 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
     # 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'
-    parsed_database = None
     if preprocess_args.do_transcribe:
         print('Collecting videos')
         parsed_database = read_db()
 
         # Remove transcripts already processed
-        finished = set(os.listdir('transcripts/auto/') +
-                       os.listdir('transcripts/manual/'))
-        finished = set([x.split('.')[0] for x in finished])
+        finished = set(x.split('.')[0] for x in os.listdir(
+            'transcripts/auto/') + os.listdir('transcripts/manual/'))
 
         video_ids = list(parsed_database.keys() - finished)
 
@@ -592,7 +623,7 @@ def main():
             tasks, preprocess_args.num_jobs, callback).start()
 
     final_path = os.path.join(
-        processed_args.processed_dir, processed_args.processed_file)
+        dataset_args.data_dir, dataset_args.processed_file)
 
     if preprocess_args.do_create:
         print('Create final data')
@@ -601,22 +632,19 @@ def main():
 
         parsed_database = read_db()
 
-        # TODO add progress bar
         # TODO parallelise?
         with tqdm(total=len(parsed_database)) as progress:
             for index, (video_id, segments) in enumerate(parsed_database.items()):
-
                 if preprocess_args.max_videos is not None and index >= preprocess_args.max_videos:
                     break
                 progress.set_description(f'Processing {video_id}')
                 progress.update()
 
-                final_data[video_id] = []
-
                 video_words = get_words(video_id, process=False)
                 if not video_words:
                     continue
 
+                final_vid_segs = []
                 for seg in segments:  # Only add segments with high enough wps
                     segment_words = segment.extract_segment(
                         video_words, seg['start'], seg['end'])
@@ -634,7 +662,10 @@ def main():
                         # e.g. music ads with some words on each side
                        # progress.set_description(f'Skipping bad segment in {video_id} (wps={wps})')
                        continue
-                    final_data[video_id].append(seg)
+                    final_vid_segs.append(seg)
+
+                if final_vid_segs:
+                    final_data[video_id] = final_vid_segs
 
         # Save data
         with open(final_path, 'w') as fp:
@@ -666,13 +697,12 @@ def main():
 
     if preprocess_args.do_generate:
         print('Generating')
-        from model import get_tokenizer
-
         # max_videos=preprocess_args.max_videos,
         # max_segments=preprocess_args.max_segments,
         # , max_videos, max_segments
 
-        tokenizer = get_tokenizer(model_args)
+        from model import get_model_tokenizer
+        model, tokenizer = get_model_tokenizer(model_args.model_name_or_path)
 
         # TODO
        # count_videos = 0
@@ -685,8 +715,9 @@ def main():
 
        data = list(itertools.islice(data, start_index, end_index))
 
-        with open(positive_file, 'a', encoding='utf-8') as positive, \
-                open(negative_file, 'a', encoding='utf-8') as negative, \
+        write_mode = 'w' if preprocess_args.overwrite else 'a'
+        with open(positive_file, write_mode, encoding='utf-8') as positive, \
+                open(negative_file, write_mode, encoding='utf-8') as negative, \
                 tqdm(data) as progress:
 
            for offset, (video_id, sponsor_segments) in enumerate(data):
@@ -711,36 +742,36 @@ def main():
                    continue
 
                for seg in segments:
-                    duration = segment.word_end(
-                        seg[-1]) - segment.word_start(seg[0])
-                    wps = len(seg)/duration if duration > 0 else 0
+                    seg_start = segment.word_start(seg[0])
+                    seg_end = segment.word_end(seg[-1])
+                    # duration = seg_end - seg_start
+                    # wps = len(seg)/duration if duration > 0 else 0
 
-                    # Ignore segments with "not enough words" in the transcript
-                    # Must do here since this includes non-sponsor segments
-                    if wps < preprocess_args.min_wps:
-                        continue
+                    # # Ignore segments with "not enough words" in the transcript
+                    # # Must do here since this includes non-sponsor segments
+                    # if wps < preprocess_args.min_wps:
+                    #     continue
 
                    d = {
                        'video_index': offset + start_index,
                        'video_id': video_id,
-                        'text': clean_text(' '.join(x['text'] for x in seg)),
-                        'words_per_second': round(wps, 3),
+                        'text': ' '.join(x['cleaned'] for x in seg),
+                        'start': seg_start,
+                        'end': seg_end,
                    }
 
                    extracted_segments = extract_sponsors(seg)
                    if extracted_segments:
                        extracted_texts = []
                        for s in extracted_segments:
-                            w = ' '.join(q['text'] for q in s['words'])
+                            w = ' '.join(q['cleaned'] for q in s['words'])
                            category = s['category'].upper()
                            extracted_texts.append(
                                f'{START_SEGMENT_TEMPLATE.format(category)} {w} {END_SEGMENT_TEMPLATE.format(category)}'
                            )
 
-                        extracted_text = f' {CustomTokens.BETWEEN_SEGMENTS.value} '.join(
+                        d['extracted'] = f' {CustomTokens.BETWEEN_SEGMENTS.value} '.join(
                            extracted_texts)
-
-                        d['extracted'] = clean_text(extracted_text)
                        print(json.dumps(d), file=positive)
 
                    else:
@@ -824,14 +855,12 @@
 
 
 def split(arr, ratios):
-    """Split array according to ratios. Sum of ratios should be less than 1"""
-
+    """Split array according to ratios. Sum of ratios should be <= 1"""
    to_return = []
 
    cumulative_sum = 0
    for r in ratios:
        current = cumulative_sum
-
        cumulative_sum += r * len(arr)
        to_return.append(arr[int(current):int(cumulative_sum)])
 
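Editor's note: the reworked get_words() above replaces the boolean fallback flag with a transcript-type fallback that retries at most once. Below is a minimal, self-contained sketch of that fallback chain, not code from this commit; the FAKE_TRANSCRIPTS table, the get_words_sketch name and the example video ID are illustrative stand-ins for the transcripts/<type>/<video_id>.json files.

# Illustrative only: an empty 'auto' transcript triggers exactly one retry with the
# 'manual' type; the retry passes fallback=None so it cannot recurse any further.
FAKE_TRANSCRIPTS = {  # hypothetical stand-in for transcripts/<type>/<video_id>.json
    ('auto', 'dQw4w9WgXcQ'): [],
    ('manual', 'dQw4w9WgXcQ'): [{'text': 'hello', 'start': 0.0, 'end': 0.4}],
}

def get_words_sketch(video_id, transcript_type='auto', fallback='manual'):
    words = FAKE_TRANSCRIPTS.get((transcript_type, video_id))
    if not words and fallback is not None:
        return get_words_sketch(video_id, transcript_type=fallback, fallback=None)
    return words or []

print(get_words_sketch('dQw4w9WgXcQ'))  # falls back to the manual transcript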
 
src/segment.py CHANGED
@@ -5,27 +5,19 @@ from dataclasses import dataclass, field
 
 @dataclass
 class SegmentationArguments:
-    pause_threshold: int = field(default=2, metadata={
+    pause_threshold: int = field(default=2.5, metadata={
         'help': 'When the time between words is greater than pause threshold, force into a new segment'})
 
 
-# WORDS TO ALWAYS HAVE ON THEIR OWN
-# always_split_re = re.compile(r'\[\w+\]')
-# e.g., [Laughter], [Applause], [Music]
-always_split = [
-    CustomTokens.MUSIC.value,
-    CustomTokens.APPLAUSE.value,
-    CustomTokens.LAUGHTER.value
-]
-
-
 def get_overlapping_chunks_of_tokens(tokens, size, overlap):
     for i in range(0, len(tokens), size-overlap+1):
         yield tokens[i:i+size]
 
 
-# Generate up to max_tokens - SAFETY_TOKENS
-SAFETY_TOKENS = 12
+# Generate up to SAFETY_TOKENS_PERCENTAGE*max_tokens tokens
+MIN_SAFETY_TOKENS = 8
+SAFETY_TOKENS_PERCENTAGE = 0.9765625
+# e.g. 512 -> 500, 768 -> 750
 
 
 # TODO play around with this?
@@ -34,15 +26,9 @@ OVERLAP_TOKEN_PERCENTAGE = 0.5 # 0.25
 
 def add_labels_to_words(words, sponsor_segments):
 
-    # TODO binary search
-    for word in words:
-        word['category'] = None
-        for sponsor_segment in sponsor_segments:
-            if sponsor_segment['start'] <= word['start'] <= sponsor_segment['end']:
-                word['category'] = sponsor_segment['category']
-
-    # TODO use extract_segment with mapping function?
-    # TODO remove sponsor segments that contain mostly empty space?
+    for sponsor_segment in sponsor_segments:
+        for w in extract_segment(words, sponsor_segment['start'], sponsor_segment['end']):
+            w['category'] = sponsor_segment['category']
 
     return words
 
@@ -69,84 +55,86 @@ def generate_segments(words, tokenizer, segmentation_args):
 
     for index, word in enumerate(words):
         # Get length of tokenized word
-        cleaned = preprocess.clean_text(word['text'])
+        word['cleaned'] = preprocess.clean_text(word['text'])
         word['num_tokens'] = len(
-            tokenizer(cleaned, add_special_tokens=False, truncation=True).input_ids)
+            tokenizer(word['cleaned'], add_special_tokens=False, truncation=True).input_ids)
 
-        add_new_segment = index == 0
-        if not add_new_segment:
-
-            if word['text'] in always_split or words[index-1]['text'] in always_split:
-                add_new_segment = True
-
-            # Pause too small, do not split
-            elif word_start(words[index]) - word_end(words[index-1]) >= segmentation_args.pause_threshold:
-                add_new_segment = True
-
-        if add_new_segment:  # New segment
+        # Add new segment
+        if index == 0 or word_start(words[index]) - word_end(words[index-1]) >= segmentation_args.pause_threshold:
             first_pass_segments.append([word])
 
         else:  # Add to current segment
             first_pass_segments[-1].append(word)
 
-    max_q_size = tokenizer.model_max_length - SAFETY_TOKENS
+    max_q_size = round(SAFETY_TOKENS_PERCENTAGE * tokenizer.model_max_length)
 
     buffer_size = OVERLAP_TOKEN_PERCENTAGE*max_q_size  # tokenizer.model_max_length
 
     # In second pass, we split those segments if too big
     second_pass_segments = []
+
     for segment in first_pass_segments:
         current_segment_num_tokens = 0
         current_segment = []
+
         for word in segment:
-            new_seg = current_segment_num_tokens + word['num_tokens'] >= max_q_size
+            new_seg = current_segment_num_tokens + \
+                word['num_tokens'] >= max_q_size
             if new_seg:
                 # Adding this token would make it have too many tokens
                 # We save this batch and create new
-                second_pass_segments.append(current_segment.copy())
+                second_pass_segments.append(current_segment)
 
             # Add tokens to current segment
             current_segment.append(word)
             current_segment_num_tokens += word['num_tokens']
 
-            if new_seg:
-                # Just created a new segment, so we remove until we only have buffer_size tokens
-                while current_segment_num_tokens > buffer_size and current_segment:
-                    first_word = current_segment.pop(0)
-                    current_segment_num_tokens -= first_word['num_tokens']
+            if not new_seg:
+                continue
+
+            # Just created a new segment, so we remove until we only have buffer_size tokens
+            last_index = 0
+            while current_segment_num_tokens > buffer_size and current_segment:
+                current_segment_num_tokens -= current_segment[last_index]['num_tokens']
+                last_index += 1
+
+            current_segment = current_segment[last_index:]
 
        if current_segment:  # Add remaining segment
-            second_pass_segments.append(current_segment.copy())
+            second_pass_segments.append(current_segment)
 
     # Cleaning up, delete 'num_tokens' from each word
-    for segment in second_pass_segments:
-        for word in segment:
-            word.pop('num_tokens', None)
-
+    # for segment in second_pass_segments:
+    for word in words:
+        word.pop('num_tokens', None)
+
     return second_pass_segments
 
 
 def extract_segment(words, start, end, map_function=None):
     """Extracts all words with time in [start, end]"""
 
     a = binary_search(words, 0, len(words), start, True)
-    b = min(binary_search(words, 0, len(words), end , False) + 1, len(words))
+    b = min(binary_search(words, 0, len(words), end, False) + 1, len(words))
 
     to_transform = map_function is not None and callable(map_function)
 
     return [
         map_function(words[i]) if to_transform else words[i] for i in range(a, b)
     ]
 
-# Binary search to get first index of word whose start/end time is greater/less than some value
+
 def binary_search(words, start_index, end_index, time, below):
+    """Binary search to get first index of word whose start/end time is greater/less than some value"""
     if start_index >= end_index:
         return end_index
 
-    middle_index = (start_index + end_index ) // 2
-
-    middle_time = word_start(words[middle_index]) if below else word_end(words[middle_index])
+    middle_index = (start_index + end_index) // 2
+
+    middle_time = word_start(
+        words[middle_index]) if below else word_end(words[middle_index])
 
+    # TODO if above: if time < middle_time binary_search(start, middle-1)
     if time <= middle_time:
         return binary_search(words, start_index, middle_index, time, below)
     else:
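Editor's note: the second pass of generate_segments() above now emits a segment once the running token count reaches max_q_size and then trims the front of the window down to roughly buffer_size tokens, so consecutive segments overlap. The standalone sketch below reproduces that windowing idea with toy one-token words; it is not code from this commit, and the window_words name plus the copy taken before emitting are illustrative simplifications.

# Illustrative only: same trimming idea as the second pass, with a copy emitted so
# later trimming cannot alter an already-saved window.
SAFETY_TOKENS_PERCENTAGE = 0.9765625   # e.g. 512 -> 500
OVERLAP_TOKEN_PERCENTAGE = 0.5

def window_words(words, model_max_length=512):
    max_q_size = round(SAFETY_TOKENS_PERCENTAGE * model_max_length)
    buffer_size = OVERLAP_TOKEN_PERCENTAGE * max_q_size

    segments, current, num_tokens = [], [], 0
    for word in words:
        new_seg = num_tokens + word['num_tokens'] >= max_q_size
        if new_seg:
            segments.append(list(current))  # emit a copy of the window so far
        current.append(word)
        num_tokens += word['num_tokens']
        if not new_seg:
            continue
        # Drop words from the front until at most ~buffer_size tokens remain as overlap
        last_index = 0
        while num_tokens > buffer_size and current:
            num_tokens -= current[last_index]['num_tokens']
            last_index += 1
        current = current[last_index:]
    if current:
        segments.append(current)
    return segments

print(len(window_words([{'num_tokens': 1} for _ in range(1200)])))  # a few overlapping windows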