import re
from typing import Iterable, List

# A scene marker is "s#" followed by digits. The capturing group makes
# ``re.split`` keep each marker in its output instead of discarding it.
_SCENE_MARKER = re.compile(r'(s#\d+)')


def delete_special(pre_text: str, character_list: Iterable[str]) -> str:
    """Return *pre_text* with every string in *character_list* removed.

    Removals are applied one after another in iteration order, each on the
    result of the previous one.

    Args:
        pre_text: The text to clean.
        character_list: Substrings (single characters or longer) to delete.

    Returns:
        The text with all occurrences of the listed substrings removed.
    """
    for unwanted in character_list:
        pre_text = pre_text.replace(unwanted, "")
    return pre_text


def break_down2scenes(text: str, *, check_sequence: bool = False) -> List[str]:
    """Split *text* into scenes delimited by ``s#<number>`` markers.

    Each returned scene is the marker, a newline, and the stripped text that
    follows it up to the next marker (just the marker when that text is
    blank).

    Args:
        text: The raw text containing zero or more ``s#<number>`` markers.
        check_sequence: When true, require each scene number to be exactly
            one greater than the previous one. Off by default, so scene
            numbers are not validated unless requested.

    Returns:
        A list of scene strings in order of appearance. If *text* contains
        no marker, the result is ``[text]`` (unmodified), or ``[]`` when
        *text* is empty or whitespace only. Non-blank text before the first
        marker is kept, stripped, as a leading entry ahead of the scenes.

    Raises:
        ValueError: If *check_sequence* is true and a scene number does not
            follow its predecessor.
    """
    # With one capturing group, split always yields an odd-length list:
    # [preamble, marker1, body1, marker2, body2, ...].
    parts = _SCENE_MARKER.split(text)

    if len(parts) == 1:
        # No marker at all: hand back the whole text as a single entry.
        return [text] if text.strip() else []

    scenes: List[str] = []

    preamble = parts[0].strip()
    if preamble:
        scenes.append(preamble)

    previous_number = None
    # Pair markers with bodies by position; blank bodies must NOT be dropped
    # beforehand or the marker/body alignment shifts.
    for marker, body in zip(parts[1::2], parts[2::2]):
        scene_number = int(marker.split('#')[1])
        if check_sequence and previous_number is not None:
            expected_scene_number = previous_number + 1
            if scene_number != expected_scene_number:
                raise ValueError(
                    f"Unexpected scene number: {scene_number}, "
                    f"expected {expected_scene_number}"
                )
        previous_number = scene_number
        scenes.append(f"{marker}\n{body.strip()}".strip())

    return scenes