oceansweep committed on
Commit
6d8f9bd
·
verified ·
1 Parent(s): 22a38f7

Update App_Function_Libraries/Utils.py

Browse files
Files changed (1) hide show
  1. App_Function_Libraries/Utils.py +468 -440
App_Function_Libraries/Utils.py CHANGED
@@ -1,440 +1,468 @@
1
- # Utils.py
2
- #########################################
3
- # General Utilities Library
4
- # This library is used to hold random utilities used by various other libraries.
5
- #
6
- ####
7
- ####################
8
- # Function List
9
- #
10
- # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
- # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
- # 3. verify_checksum(file_path, expected_checksum)
13
- # 4. create_download_directory(title)
14
- # 5. sanitize_filename(filename)
15
- # 6. normalize_title(title)
16
- # 7.
17
- #
18
- #
19
- #
20
- ####################
21
- # Import necessary libraries
22
- import configparser
23
- import hashlib
24
- import json
25
- import logging
26
- from datetime import timedelta
27
- from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
28
-
29
- import requests
30
- import time
31
- from tqdm import tqdm
32
- import os
33
- import re
34
- import unicodedata
35
-
36
- from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
37
-
38
-
39
- #######################################################################################################################
40
- # Function Definitions
41
- #
42
-
43
- def extract_text_from_segments(segments):
44
- logging.debug(f"Segments received: {segments}")
45
- logging.debug(f"Type of segments: {type(segments)}")
46
-
47
- def extract_text_recursive(data):
48
- if isinstance(data, dict):
49
- for key, value in data.items():
50
- if key == 'Text':
51
- return value
52
- elif isinstance(value, (dict, list)):
53
- result = extract_text_recursive(value)
54
- if result:
55
- return result
56
- elif isinstance(data, list):
57
- return ' '.join(filter(None, [extract_text_recursive(item) for item in data]))
58
- return None
59
-
60
- text = extract_text_recursive(segments)
61
-
62
- if text:
63
- return text.strip()
64
- else:
65
- logging.error(f"Unable to extract text from segments: {segments}")
66
- return "Error: Unable to extract transcription"
67
-
68
-
69
- def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
70
- temp_path = dest_path + '.tmp'
71
-
72
- for attempt in range(max_retries):
73
- try:
74
- # Check if a partial download exists and get its size
75
- resume_header = {}
76
- if os.path.exists(temp_path):
77
- resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}
78
-
79
- response = requests.get(url, stream=True, headers=resume_header)
80
- response.raise_for_status()
81
-
82
- # Get the total file size from headers
83
- total_size = int(response.headers.get('content-length', 0))
84
- initial_pos = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
85
-
86
- mode = 'ab' if 'Range' in response.headers else 'wb'
87
- with open(temp_path, mode) as temp_file, tqdm(
88
- total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
89
- ) as pbar:
90
- for chunk in response.iter_content(chunk_size=8192):
91
- if chunk: # filter out keep-alive new chunks
92
- temp_file.write(chunk)
93
- pbar.update(len(chunk))
94
-
95
- # Verify the checksum if provided
96
- if expected_checksum:
97
- if not verify_checksum(temp_path, expected_checksum):
98
- os.remove(temp_path)
99
- raise ValueError("Downloaded file's checksum does not match the expected checksum")
100
-
101
- # Move the file to the final destination
102
- os.rename(temp_path, dest_path)
103
- print("Download complete and verified!")
104
- return dest_path
105
-
106
- except Exception as e:
107
- print(f"Attempt {attempt + 1} failed: {e}")
108
- if attempt < max_retries - 1:
109
- print(f"Retrying in {delay} seconds...")
110
- time.sleep(delay)
111
- else:
112
- print("Max retries reached. Download failed.")
113
- raise
114
-
115
-
116
- def verify_checksum(file_path, expected_checksum):
117
- sha256_hash = hashlib.sha256()
118
- with open(file_path, 'rb') as f:
119
- for byte_block in iter(lambda: f.read(4096), b''):
120
- sha256_hash.update(byte_block)
121
- return sha256_hash.hexdigest() == expected_checksum
122
-
123
-
124
- def create_download_directory(title):
125
- base_dir = "Results"
126
- # Remove characters that are illegal in Windows filenames and normalize
127
- safe_title = normalize_title(title)
128
- logging.debug(f"{title} successfully normalized")
129
- session_path = os.path.join(base_dir, safe_title)
130
- if not os.path.exists(session_path):
131
- os.makedirs(session_path, exist_ok=True)
132
- logging.debug(f"Created directory for downloaded video: {session_path}")
133
- else:
134
- logging.debug(f"Directory already exists for downloaded video: {session_path}")
135
- return session_path
136
-
137
-
138
- def sanitize_filename(filename):
139
- # Remove invalid characters and replace spaces with underscores
140
- sanitized = re.sub(r'[<>:"/\\|?*]', '', filename)
141
- sanitized = re.sub(r'\s+', ' ', sanitized).strip()
142
- return sanitized
143
-
144
-
145
- def normalize_title(title):
146
- # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
147
- title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
148
- title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
149
- '').replace(
150
- '<', '').replace('>', '').replace('|', '')
151
- return title
152
-
153
-
154
-
155
-
156
- def clean_youtube_url(url):
157
- parsed_url = urlparse(url)
158
- query_params = parse_qs(parsed_url.query)
159
- if 'list' in query_params:
160
- query_params.pop('list')
161
- cleaned_query = urlencode(query_params, doseq=True)
162
- cleaned_url = urlunparse(parsed_url._replace(query=cleaned_query))
163
- return cleaned_url
164
-
165
-
166
- def extract_video_info(url):
167
- info_dict = get_youtube(url)
168
- title = info_dict.get('title', 'Untitled')
169
- return info_dict, title
170
-
171
-
172
- def clean_youtube_url(url):
173
- parsed_url = urlparse(url)
174
- query_params = parse_qs(parsed_url.query)
175
- if 'list' in query_params:
176
- query_params.pop('list')
177
- cleaned_query = urlencode(query_params, doseq=True)
178
- cleaned_url = urlunparse(parsed_url._replace(query=cleaned_query))
179
- return cleaned_url
180
-
181
- def extract_video_info(url):
182
- info_dict = get_youtube(url)
183
- title = info_dict.get('title', 'Untitled')
184
- return info_dict, title
185
-
186
- def import_data(file):
187
- # Implement this function to import data from a file
188
- pass
189
-
190
-
191
-
192
-
193
- #######################
194
- # Config loading
195
- #
196
-
197
- def load_comprehensive_config():
198
- # Get the directory of the current script
199
- current_dir = os.path.dirname(os.path.abspath(__file__))
200
- # Go up one level to the project root directory
201
- project_root = os.path.dirname(current_dir)
202
- # Construct the path to the config file in the project root directory
203
- config_path = os.path.join(project_root, 'config.txt')
204
- # Create a ConfigParser object
205
- config = configparser.ConfigParser()
206
- # Read the configuration file
207
- files_read = config.read(config_path)
208
- if not files_read:
209
- raise FileNotFoundError(f"Config file not found at {config_path}")
210
- return config
211
-
212
-
213
- def load_and_log_configs():
214
- try:
215
- config = load_comprehensive_config()
216
- if config is None:
217
- logging.error("Config is None, cannot proceed")
218
- return None
219
- # API Keys
220
- anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
221
- logging.debug(
222
- f"Loaded Anthropic API Key: {anthropic_api_key[:5]}...{anthropic_api_key[-5:] if anthropic_api_key else None}")
223
-
224
- cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
225
- logging.debug(
226
- f"Loaded Cohere API Key: {cohere_api_key[:5]}...{cohere_api_key[-5:] if cohere_api_key else None}")
227
-
228
- groq_api_key = config.get('API', 'groq_api_key', fallback=None)
229
- logging.debug(f"Loaded Groq API Key: {groq_api_key[:5]}...{groq_api_key[-5:] if groq_api_key else None}")
230
-
231
- openai_api_key = config.get('API', 'openai_api_key', fallback=None)
232
- logging.debug(
233
- f"Loaded OpenAI API Key: {openai_api_key[:5]}...{openai_api_key[-5:] if openai_api_key else None}")
234
-
235
- huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
236
- logging.debug(
237
- f"Loaded HuggingFace API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:] if huggingface_api_key else None}")
238
-
239
- openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
240
- logging.debug(
241
- f"Loaded OpenRouter API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:] if openrouter_api_key else None}")
242
-
243
- deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
244
- logging.debug(
245
- f"Loaded DeepSeek API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:] if deepseek_api_key else None}")
246
-
247
- # Models
248
- anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
249
- cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
250
- groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
251
- openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
252
- huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
253
- openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
254
- deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
255
-
256
- logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
257
- logging.debug(f"Loaded Cohere Model: {cohere_model}")
258
- logging.debug(f"Loaded Groq Model: {groq_model}")
259
- logging.debug(f"Loaded OpenAI Model: {openai_model}")
260
- logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
261
- logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
262
-
263
- # Local-Models
264
- kobold_api_IP = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
265
- kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')
266
-
267
- llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
268
- llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')
269
-
270
- ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
271
- ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')
272
-
273
- tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
274
- tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
275
-
276
- vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
277
- vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
278
-
279
- logging.debug(f"Loaded Kobold API IP: {kobold_api_IP}")
280
- logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
281
- logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
282
- logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
283
- logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")
284
-
285
- # Retrieve output paths from the configuration file
286
- output_path = config.get('Paths', 'output_path', fallback='results')
287
- logging.debug(f"Output path set to: {output_path}")
288
-
289
- # Retrieve processing choice from the configuration file
290
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
291
- logging.debug(f"Processing choice set to: {processing_choice}")
292
-
293
- # Prompts - FIXME
294
- prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')
295
-
296
- return {
297
- 'api_keys': {
298
- 'anthropic': anthropic_api_key,
299
- 'cohere': cohere_api_key,
300
- 'groq': groq_api_key,
301
- 'openai': openai_api_key,
302
- 'huggingface': huggingface_api_key,
303
- 'openrouter': openrouter_api_key,
304
- 'deepseek': deepseek_api_key
305
- },
306
- 'models': {
307
- 'anthropic': anthropic_model,
308
- 'cohere': cohere_model,
309
- 'groq': groq_model,
310
- 'openai': openai_model,
311
- 'huggingface': huggingface_model,
312
- 'openrouter': openrouter_model,
313
- 'deepseek': deepseek_model
314
- },
315
- 'local_apis': {
316
- 'kobold': {'ip': kobold_api_IP, 'key': kobold_api_key},
317
- 'llama': {'ip': llama_api_IP, 'key': llama_api_key},
318
- 'ooba': {'ip': ooba_api_IP, 'key': ooba_api_key},
319
- 'tabby': {'ip': tabby_api_IP, 'key': tabby_api_key},
320
- 'vllm': {'ip': vllm_api_url, 'key': vllm_api_key}
321
- },
322
- 'output_path': output_path,
323
- 'processing_choice': processing_choice
324
- }
325
-
326
- except Exception as e:
327
- logging.error(f"Error loading config: {str(e)}")
328
- return None
329
-
330
-
331
-
332
- # Log file
333
- # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
334
-
335
-
336
-
337
-
338
-
339
-
340
-
341
- def format_metadata_as_text(metadata):
342
- if not metadata:
343
- return "No metadata available"
344
-
345
- formatted_text = "Video Metadata:\n"
346
- for key, value in metadata.items():
347
- if value is not None:
348
- if isinstance(value, list):
349
- # Join list items with commas
350
- formatted_value = ", ".join(str(item) for item in value)
351
- elif key == 'upload_date' and len(str(value)) == 8:
352
- # Format date as YYYY-MM-DD
353
- formatted_value = f"{value[:4]}-{value[4:6]}-{value[6:]}"
354
- elif key in ['view_count', 'like_count']:
355
- # Format large numbers with commas
356
- formatted_value = f"{value:,}"
357
- elif key == 'duration':
358
- # Convert seconds to HH:MM:SS format
359
- hours, remainder = divmod(value, 3600)
360
- minutes, seconds = divmod(remainder, 60)
361
- formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
362
- else:
363
- formatted_value = str(value)
364
-
365
- formatted_text += f"{key.capitalize()}: {formatted_value}\n"
366
- return formatted_text.strip()
367
-
368
- # # Example usage:
369
- # example_metadata = {
370
- # 'title': 'Sample Video Title',
371
- # 'uploader': 'Channel Name',
372
- # 'upload_date': '20230615',
373
- # 'view_count': 1000000,
374
- # 'like_count': 50000,
375
- # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
376
- # 'tags': ['tag1', 'tag2', 'tag3'],
377
- # 'description': 'This is a sample video description.'
378
- # }
379
- #
380
- # print(format_metadata_as_text(example_metadata))
381
-
382
-
383
-
384
- def convert_to_seconds(time_str):
385
- if not time_str:
386
- return 0
387
-
388
- # If it's already a number, assume it's in seconds
389
- if time_str.isdigit():
390
- return int(time_str)
391
-
392
- # Parse time string in format HH:MM:SS, MM:SS, or SS
393
- time_parts = time_str.split(':')
394
- if len(time_parts) == 3:
395
- return int(timedelta(hours=int(time_parts[0]),
396
- minutes=int(time_parts[1]),
397
- seconds=int(time_parts[2])).total_seconds())
398
- elif len(time_parts) == 2:
399
- return int(timedelta(minutes=int(time_parts[0]),
400
- seconds=int(time_parts[1])).total_seconds())
401
- elif len(time_parts) == 1:
402
- return int(time_parts[0])
403
- else:
404
- raise ValueError(f"Invalid time format: {time_str}")
405
-
406
-
407
- def save_to_file(video_urls, filename):
408
- with open(filename, 'w') as file:
409
- file.write('\n'.join(video_urls))
410
- print(f"Video URLs saved to {filename}")
411
-
412
-
413
- def save_segments_to_json(segments, file_name="transcription_segments.json"):
414
- """
415
- Save transcription segments to a JSON file.
416
-
417
- Parameters:
418
- segments (list): List of transcription segments
419
- file_name (str): Name of the JSON file to save (default: "transcription_segments.json")
420
-
421
- Returns:
422
- str: Path to the saved JSON file
423
- """
424
- # Ensure the Results directory exists
425
- os.makedirs("Results", exist_ok=True)
426
-
427
- # Full path for the JSON file
428
- json_file_path = os.path.join("Results", file_name)
429
-
430
- # Save segments to JSON file
431
- with open(json_file_path, 'w', encoding='utf-8') as json_file:
432
- json.dump(segments, json_file, ensure_ascii=False, indent=4)
433
-
434
- return json_file_path
435
-
436
-
437
-
438
-
439
-
440
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utils.py
2
+ #########################################
3
+ # General Utilities Library
4
+ # This library is used to hold random utilities used by various other libraries.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
+ # 3. verify_checksum(file_path, expected_checksum)
13
+ # 4. create_download_directory(title)
14
+ # 5. sanitize_filename(filename)
15
+ # 6. normalize_title(title)
16
+ # 7.
17
+ #
18
+ #
19
+ #
20
+ ####################
21
+ # Import necessary libraries
22
+ import configparser
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ import os
27
+ import re
28
+ import time
29
+ from datetime import timedelta
30
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
31
+
32
+ import requests
33
+ import unicodedata
34
+ from tqdm import tqdm
35
+
36
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
37
+
38
+
39
+ #######################################################################################################################
40
+ # Function Definitions
41
+ #
42
+
43
def extract_text_from_segments(segments):
    """
    Pull the transcription text out of a (possibly nested) segment structure.

    Parameters:
        segments (dict | list): Nested dicts/lists that are searched for 'Text' keys.

    Returns:
        str: The stripped text that was found, or the literal message
             "Error: Unable to extract transcription" when nothing usable was found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def _find_text(node):
        # Lists: collect the text of every element and join the non-empty ones.
        if isinstance(node, list):
            pieces = [_find_text(child) for child in node]
            return ' '.join(piece for piece in pieces if piece)

        # Anything that is neither a list nor a dict carries no text.
        if not isinstance(node, dict):
            return None

        # Dicts: the first 'Text' key wins; otherwise descend into containers
        # and return the first non-empty result.
        for field, content in node.items():
            if field == 'Text':
                return content
            if isinstance(content, (dict, list)):
                found = _find_text(content)
                if found:
                    return found
        return None

    extracted = _find_text(segments)
    if not extracted:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return extracted.strip()
67
+
68
+
69
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5, timeout=30):
    """
    Download a URL to dest_path, resuming partial downloads and retrying on failure.

    Data is streamed into ``dest_path + '.tmp'`` and only moved to dest_path once the
    transfer (and the optional checksum verification) has succeeded.

    Parameters:
        url (str): Address fetched with an HTTP GET request.
        dest_path (str): Final location of the downloaded file.
        expected_checksum (str | None): SHA-256 hex digest passed to verify_checksum;
            verification is skipped when this is falsy.
        max_retries (int): Maximum number of download attempts.
        delay (int | float): Seconds to sleep between attempts.
        timeout (int | float | tuple | None): Forwarded to requests.get so a stalled
            server cannot block an attempt forever. Pass None to wait indefinitely.

    Returns:
        str: dest_path on success. No attempt is made (and None is returned) when
             max_retries is less than 1.

    Raises:
        Exception: The error raised by the final failed attempt is re-raised. A checksum
            mismatch surfaces as ValueError.
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Ask the server to continue from the end of any partial file left behind
            # by an earlier attempt.
            resume_from = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
            request_headers = {'Range': f'bytes={resume_from}-'} if resume_from else {}

            with requests.get(url, stream=True, headers=request_headers, timeout=timeout) as response:
                if resume_from and response.status_code == 416:
                    # The requested range cannot be served (the partial file is stale or
                    # already complete). Discard it so the next attempt starts fresh;
                    # raise_for_status() below triggers that retry.
                    os.remove(temp_path)
                response.raise_for_status()

                # 'Range' is a *request* header. The server signals that it honoured it
                # with status 206; a plain 200 means the whole file is being re-sent and
                # the partial data must be overwritten, not appended to.
                resumed = bool(resume_from) and response.status_code == 206
                initial_pos = resume_from if resumed else 0

                # Content-Length describes only the bytes in this response, so add what
                # is already on disk. None lets tqdm show progress without a total.
                remaining = int(response.headers.get('content-length', 0))
                total_size = initial_pos + remaining if remaining else None

                mode = 'ab' if resumed else 'wb'
                with open(temp_path, mode) as temp_file, tqdm(
                        total=total_size, unit='B', unit_scale=True, desc=dest_path,
                        initial=initial_pos, ascii=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive new chunks
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum:
                if not verify_checksum(temp_path, expected_checksum):
                    # A corrupt file must not be resumed from on the next attempt.
                    os.remove(temp_path)
                    raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # os.replace overwrites an existing destination on every platform,
            # whereas os.rename raises on Windows when dest_path already exists.
            os.replace(temp_path, dest_path)
            print("Download complete and verified!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
114
+
115
+
116
def verify_checksum(file_path, expected_checksum):
    """
    Check whether a file's SHA-256 digest matches an expected hex digest.

    Parameters:
        file_path (str): Path of the file to hash.
        expected_checksum (str): Expected SHA-256 hex digest. Letter case and
            surrounding whitespace are ignored, so digests copied from checksum
            files (often upper-case or newline-terminated) compare correctly.

    Returns:
        bool: True when the digests match, False otherwise.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Hash in fixed-size blocks so large files are never loaded into memory at once.
        for byte_block in iter(lambda: f.read(65536), b''):
            sha256_hash.update(byte_block)

    # hexdigest() is always lower-case; normalise the expected value to match.
    if isinstance(expected_checksum, str):
        expected_checksum = expected_checksum.strip().lower()
    return sha256_hash.hexdigest() == expected_checksum
122
+
123
+
124
def create_download_directory(title):
    """
    Return the per-video directory under "Results", creating it when needed.

    Parameters:
        title (str): Video title; it is passed through normalize_title to build
            the directory name.

    Returns:
        str: Path of the directory ("Results" joined with the normalized title).
    """
    results_root = "Results"
    # normalize_title removes/replaces characters that are illegal in Windows filenames.
    normalized = normalize_title(title)
    logging.debug(f"{title} successfully normalized")
    session_path = os.path.join(results_root, normalized)

    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
        return session_path

    os.makedirs(session_path, exist_ok=True)
    logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
136
+
137
+
138
def sanitize_filename(filename):
    """
    Strip characters that are invalid in filenames and tidy up whitespace.

    Every occurrence of < > : " / \\ | ? * is removed, then each run of whitespace
    is collapsed into a single space and the ends are trimmed. Spaces are kept;
    they are not converted to underscores.

    Parameters:
        filename (str): Name to clean.

    Returns:
        str: The cleaned name.
    """
    forbidden_chars = str.maketrans('', '', '<>:"/\\|?*')
    without_forbidden = filename.translate(forbidden_chars)
    # split() with no arguments breaks on whitespace runs and drops leading/trailing ones.
    return ' '.join(without_forbidden.split())
143
+
144
+
145
def normalize_title(title):
    """
    Reduce a title to ASCII and neutralise path-hostile characters.

    The title is NFKD-normalized and every character that cannot be encoded as
    ASCII is dropped. Then '/', '\\' and ':' become '_' while '"', '*', '?', '<',
    '>' and '|' are removed.

    Parameters:
        title (str): Title to normalize.

    Returns:
        str: The normalized title.
    """
    decomposed = unicodedata.normalize('NFKD', title)
    ascii_title = decomposed.encode('ascii', 'ignore').decode('ascii')

    # Single-pass table: first two arguments map char -> '_', third lists deletions.
    char_table = str.maketrans('/\\:', '___', '"*?<>|')
    return ascii_title.translate(char_table)
152
+
153
+
154
def clean_youtube_url(url):
    """
    Return the URL with its 'list' query parameter removed.

    The query string is parsed with parse_qs and re-encoded with urlencode, so the
    remaining parameters are rebuilt rather than copied verbatim.

    Parameters:
        url (str): URL to clean.

    Returns:
        str: The URL without any 'list' parameter.
    """
    parts = urlparse(url)
    kept_params = {
        name: values
        for name, values in parse_qs(parts.query).items()
        if name != 'list'
    }
    rebuilt_query = urlencode(kept_params, doseq=True)
    return urlunparse(parts._replace(query=rebuilt_query))
162
+
163
+
164
def extract_video_info(url):
    """
    Fetch video information for a URL via get_youtube and pick out its title.

    Parameters:
        url (str): Video URL handed to get_youtube.

    Returns:
        tuple: (info, title) where info is whatever get_youtube returned and title
               is its 'title' entry, or 'Untitled' when that entry is absent.
    """
    video_info = get_youtube(url)
    return video_info, video_info.get('title', 'Untitled')
168
+
169
+
170
def import_data(file):
    """
    Placeholder for importing data from a file; not implemented yet.

    Parameters:
        file: Currently ignored.

    Returns:
        None
    """
    return None
173
+
174
+
175
def safe_read_file(file_path):
    """
    Read a text file, trying several encodings, and never raise.

    Encodings are tried in this order: UTF-8, then UTF-16 (only when the file
    starts with a UTF-16 byte-order mark), then Latin-1. UTF-16 is gated on the
    BOM because, tried blindly, it "successfully" decodes most even-length 8-bit
    files into unrelated characters. Latin-1 accepts every byte sequence, so it is
    the final fallback.

    Parameters:
        file_path (str): Path of the file to read.

    Returns:
        str: The file contents on success. On failure a human-readable message is
             returned instead of raising ("File not found: ...",
             "An error occurred: ...", or "Unable to decode the file ...").
    """
    # Peek at the first bytes to detect a UTF-16 byte-order mark.
    try:
        with open(file_path, 'rb') as raw_file:
            leading_bytes = raw_file.read(2)
    except FileNotFoundError:
        return f"File not found: {file_path}"
    except Exception as e:
        return f"An error occurred: {e}"

    encodings = ['utf-8']
    if leading_bytes in (b'\xff\xfe', b'\xfe\xff'):
        encodings.append('utf-16')
    encodings.append('latin-1')

    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            continue
        except FileNotFoundError:
            return f"File not found: {file_path}"
        except Exception as e:
            return f"An error occurred: {e}"
    # Defensive only: Latin-1 cannot fail to decode.
    return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
188
+
189
+ #
190
+ #
191
+ #######################
192
+ # Temp file cleanup
193
+ #
194
+ # Global list to keep track of downloaded files
195
# Module-level registry of file paths that cleanup_downloads() should delete.
downloaded_files = []


def cleanup_downloads():
    """
    Delete every file listed in downloaded_files (intended for server exit).

    Paths that no longer exist are skipped silently. Any error while removing a
    file is printed and does not stop the remaining paths from being processed.
    The downloaded_files list itself is left unchanged.
    """
    for tracked_path in downloaded_files:
        try:
            if not os.path.exists(tracked_path):
                continue
            os.remove(tracked_path)
            print(f"Cleaned up file: {tracked_path}")
        except Exception as e:
            print(f"Error cleaning up file {tracked_path}: {e}")
206
+
207
+ #
208
+ #
209
+ #######################
210
+ # Config loading
211
+ #
212
+
213
def load_comprehensive_config():
    """
    Load 'config.txt' from the directory one level above this module.

    Returns:
        configparser.ConfigParser: Parser populated from the config file.

    Raises:
        FileNotFoundError: When the parser reports that no file was read from the
            expected path.
    """
    # This module's directory, then its parent (the project root).
    module_dir = os.path.dirname(os.path.abspath(__file__))
    config_path = os.path.join(os.path.dirname(module_dir), 'config.txt')

    parser = configparser.ConfigParser()
    # ConfigParser.read returns the list of files it managed to read.
    if parser.read(config_path):
        return parser
    raise FileNotFoundError(f"Config file not found at {config_path}")
227
+
228
+
229
def _mask_api_key(api_key):
    """Return a log-safe preview of an API key, or None when the key is unset/empty."""
    if not api_key:
        return None
    if len(api_key) <= 10:
        # Showing the first and last five characters would reveal the whole key.
        return '*' * len(api_key)
    return f"{api_key[:5]}...{api_key[-5:]}"


def load_and_log_configs():
    """
    Load the project configuration and return the settings as a nested dict.

    Values are read via load_comprehensive_config(); every option has a fallback,
    so missing options (including missing API keys) do not abort loading.

    Returns:
        dict | None: On success a dict with the keys
            'api_keys'          - provider name -> API key (None or '' when unset)
            'models'            - provider name -> model name
            'local_api_ip'      - local backend name -> endpoint URL
            'output_path'       - str
            'processing_choice' - str
            'prompt_path'       - str
        None is returned (and the error logged) when anything goes wrong.
    """
    try:
        config = load_comprehensive_config()
        if config is None:
            logging.error("Config is None, cannot proceed")
            return None

        # Hosted providers: (label used in log messages, option/result name, fallback model).
        hosted_providers = (
            ('Anthropic', 'anthropic', 'claude-3-sonnet-20240229'),
            ('Cohere', 'cohere', 'command-r-plus'),
            ('Groq', 'groq', 'llama3-70b-8192'),
            ('OpenAI', 'openai', 'gpt-4-turbo'),
            ('HuggingFace', 'huggingface', 'CohereForAI/c4ai-command-r-plus'),
            ('OpenRouter', 'openrouter', 'microsoft/wizardlm-2-8x22b'),
            ('DeepSeek', 'deepseek', 'deepseek-chat'),
        )

        # API Keys
        api_keys = {}
        for label, name, _default_model in hosted_providers:
            api_keys[name] = config.get('API', f'{name}_api_key', fallback=None)
            # Only a masked preview is logged; a missing key is logged as None
            # instead of raising on None[:5].
            logging.debug(f"Loaded {label} API Key: {_mask_api_key(api_keys[name])}")

        # Models
        models = {}
        for label, name, default_model in hosted_providers:
            models[name] = config.get('API', f'{name}_model', fallback=default_model)
            logging.debug(f"Loaded {label} Model: {models[name]}")

        # Local-Models
        kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')

        llama_api_ip = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')

        ooba_api_ip = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
        ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

        tabby_api_ip = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
        # NOTE(review): tabby_model is read from section 'models' while every other
        # local setting lives in 'Local-API' - confirm this is intentional.
        tabby_model = config.get('models', 'tabby_model', fallback=None)

        # NOTE(review): the fallback port is 500 whereas the other local fallbacks use
        # 5000/8080 - possibly a typo; confirm before changing.
        vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
        vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
        vllm_model = config.get('Local-API', 'vllm_model', fallback=None)

        logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
        logging.debug(f"Loaded Llama API IP: {llama_api_ip}")
        logging.debug(f"Loaded Ooba API IP: {ooba_api_ip}")
        logging.debug(f"Loaded Tabby API IP: {tabby_api_ip}")
        logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")

        # Local backends share the 'api_keys' and 'models' maps with the hosted ones.
        api_keys['kobold'] = kobold_api_key
        api_keys['llama'] = llama_api_key
        api_keys['ooba'] = ooba_api_key
        api_keys['tabby'] = tabby_api_key
        api_keys['vllm'] = vllm_api_key
        models['vllm'] = vllm_model
        models['tabby'] = tabby_model

        # Retrieve output paths from the configuration file
        output_path = config.get('Paths', 'output_path', fallback='results')
        logging.debug(f"Output path set to: {output_path}")

        # Retrieve processing choice from the configuration file
        processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
        logging.debug(f"Processing choice set to: {processing_choice}")

        # Prompts database location
        prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')
        logging.debug(f"Prompt path set to: {prompt_path}")

        return {
            'api_keys': api_keys,
            'models': models,
            'local_api_ip': {
                'kobold': kobold_api_ip,
                'llama': llama_api_ip,
                'ooba': ooba_api_ip,
                'tabby': tabby_api_ip,
                'vllm': vllm_api_url,
            },
            'output_path': output_path,
            'processing_choice': processing_choice,
            'prompt_path': prompt_path,
        }

    except Exception as e:
        logging.error(f"Error loading config: {str(e)}")
        return None
356
+
357
+
358
+ # Log file
359
+ # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
360
+
361
+
362
def format_metadata_as_text(metadata):
    """
    Render a video metadata mapping as human-readable text.

    Entries whose value is None are skipped. Special formatting:
        - list values are joined with ", "
        - 'upload_date' with 8 characters (str or int) becomes YYYY-MM-DD
        - 'view_count' / 'like_count' get thousands separators
        - 'duration' (seconds; int, float or numeric string) becomes HH:MM:SS
    Values that cannot be formatted that way fall back to str(value) instead of
    raising.

    Parameters:
        metadata (dict | None): Metadata to format.

    Returns:
        str: "No metadata available" for empty/None input, otherwise a
             "Video Metadata:" header followed by one "Key: value" line per entry.
    """
    if not metadata:
        return "No metadata available"

    lines = ["Video Metadata:"]
    for key, value in metadata.items():
        if value is None:
            continue

        if isinstance(value, list):
            # Join list items with commas
            formatted_value = ", ".join(str(item) for item in value)
        elif key == 'upload_date' and len(str(value)) == 8:
            # Format date as YYYY-MM-DD; slice the string form so ints work too.
            date_text = str(value)
            formatted_value = f"{date_text[:4]}-{date_text[4:6]}-{date_text[6:]}"
        elif key in ('view_count', 'like_count'):
            # Format large numbers with commas; non-numeric values are shown as-is.
            try:
                formatted_value = f"{value:,}"
            except (TypeError, ValueError):
                formatted_value = str(value)
        elif key == 'duration':
            # Convert seconds to HH:MM:SS format; fractional seconds are truncated.
            try:
                hours, remainder = divmod(int(value), 3600)
            except (TypeError, ValueError, OverflowError):
                formatted_value = str(value)
            else:
                minutes, seconds = divmod(remainder, 60)
                formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            formatted_value = str(value)

        lines.append(f"{str(key).capitalize()}: {formatted_value}")
    return "\n".join(lines).strip()
388
+
389
+ # # Example usage:
390
+ # example_metadata = {
391
+ # 'title': 'Sample Video Title',
392
+ # 'uploader': 'Channel Name',
393
+ # 'upload_date': '20230615',
394
+ # 'view_count': 1000000,
395
+ # 'like_count': 50000,
396
+ # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
397
+ # 'tags': ['tag1', 'tag2', 'tag3'],
398
+ # 'description': 'This is a sample video description.'
399
+ # }
400
+ #
401
+ # print(format_metadata_as_text(example_metadata))
402
+
403
+
404
def convert_to_seconds(time_str):
    """
    Convert a time value to a whole number of seconds.

    Parameters:
        time_str (str | int | float | None): Either a number of seconds, or a string
            in the form "SS", "MM:SS" or "HH:MM:SS". The last field may carry a
            fractional part (e.g. "00:01:30.5"), which is truncated.

    Returns:
        int: Number of seconds; 0 for empty/None input.

    Raises:
        ValueError: When the value has more than three ':'-separated fields or a
            field is not numeric.
    """
    if not time_str:
        return 0

    # Numbers are already seconds (the original only handled digit strings).
    if isinstance(time_str, (int, float)):
        return int(time_str)

    # Parse time string in format HH:MM:SS, MM:SS, or SS
    fields = str(time_str).strip().split(':')
    if len(fields) > 3:
        raise ValueError(f"Invalid time format: {time_str}")

    try:
        *leading, last = fields
        total = 0
        for field in leading:
            # Each step to the right is a 60x smaller unit (hours -> minutes -> seconds).
            total = total * 60 + int(field)
        return int(total * 60 + float(last))
    except (ValueError, OverflowError):
        raise ValueError(f"Invalid time format: {time_str}") from None
425
+
426
+
427
def save_to_file(video_urls, filename):
    """
    Write video URLs to a text file, one per line, and print a confirmation.

    Parameters:
        video_urls (iterable of str): URLs to write; they are joined with newlines
            (no trailing newline is added).
        filename (str): Path of the file to create or overwrite.
    """
    with open(filename, 'w') as url_file:
        joined_urls = '\n'.join(video_urls)
        url_file.write(joined_urls)
    print(f"Video URLs saved to {filename}")
431
+
432
+
433
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """
    Write transcription segments to a JSON file inside the "Results" directory.

    The "Results" directory (relative to the current working directory) is created
    when it does not exist. Output is UTF-8, indented by 4 spaces, with non-ASCII
    characters written as-is.

    Parameters:
        segments (list): List of transcription segments
        file_name (str): Name of the JSON file to save (default: "transcription_segments.json")

    Returns:
        str: Path to the saved JSON file
    """
    results_dir = "Results"
    os.makedirs(results_dir, exist_ok=True)

    target_path = os.path.join(results_dir, file_name)
    with open(target_path, 'w', encoding='utf-8') as out_file:
        json.dump(segments, out_file, ensure_ascii=False, indent=4)
    return target_path
455
+
456
+ #
457
+ #
458
+ #######################################################################################################################
459
+ #
460
+ # Backup code
461
+
462
+ #
463
+ # End of backup code
464
+ #######################################################################################################################
465
+
466
+
467
+
468
+